[beignet] 01/05: Imported Upstream version 1.2.1

Rebecca Palmer rnpalmer-guest at moszumanska.debian.org
Tue Nov 15 07:27:47 UTC 2016


This is an automated email from the git hooks/post-receive script.

rnpalmer-guest pushed a commit to branch master
in repository beignet.

commit 1ec864a62660f37a3fe30e024d52ed8dd776970c
Author: Rebecca N. Palmer <rebecca_palmer at zoho.com>
Date:   Sun Nov 13 21:30:16 2016 +0000

    Imported Upstream version 1.2.1
---
 CMakeLists.txt                                     |   2 +-
 GetGenID.sh                                        |   5 +-
 backend/src/backend/gen8_context.cpp               |  16 +-
 backend/src/backend/gen_context.cpp                |  55 ++--
 backend/src/backend/gen_encoder.cpp                |   2 +-
 backend/src/backend/gen_insn_compact.cpp           |   2 +-
 backend/src/backend/gen_insn_selection.cpp         |  73 ++---
 backend/src/backend/gen_insn_selection.hpp         |   4 +-
 backend/src/backend/gen_program.cpp                |  13 +-
 backend/src/backend/gen_register.hpp               |   5 +
 backend/src/backend/program.cpp                    |  10 +
 backend/src/ir/function.hpp                        |  12 +-
 backend/src/libocl/include/ocl.h                   |  75 +++++
 backend/src/libocl/include/ocl_image.h             | 331 ++++++++++++---------
 backend/src/libocl/src/ocl_image.cl                | 229 ++++++++++----
 backend/src/libocl/tmpl/ocl_defines.tmpl.h         |  13 +-
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl           |   4 +-
 backend/src/llvm/llvm_bitcode_link.cpp             |  26 +-
 backend/src/llvm/llvm_gen_backend.cpp              | 128 +++++++-
 backend/src/llvm/llvm_includes.hpp                 |   5 +
 backend/src/llvm/llvm_passes.cpp                   |   6 +-
 backend/src/llvm/llvm_to_gen.cpp                   |  19 +-
 backend/src/llvm/llvm_to_gen.hpp                   |   6 +
 backend/src/llvm/llvm_unroll.cpp                   |  34 ++-
 docs/Beignet.mdwn                                  |  22 +-
 docs/NEWS.mdwn                                     |   3 +
 docs/howto/cross-compiler-howto.mdwn               |   4 +-
 docs/howto/stand-alone-utest-howto.mdwn            |   8 +-
 kernels/compiler_subgroup_broadcast.cl             |   2 +-
 kernels/test_get_arg_info.cl                       |   2 +-
 src/CMakeLists.txt                                 |   2 +-
 src/cl_api.c                                       |  11 +-
 src/cl_command_queue_gen7.c                        |  13 +-
 src/cl_device_data.h                               |  10 +-
 src/cl_device_id.c                                 |  52 +++-
 src/cl_driver.h                                    |   4 +
 src/cl_driver_defs.c                               |   1 +
 src/cl_enqueue.c                                   |   8 +-
 src/cl_extensions.c                                |  14 +-
 src/cl_khr_icd.c                                   |  21 +-
 src/cl_mem.c                                       |   1 -
 src/intel/intel_driver.c                           |  19 +-
 src/intel/intel_gpgpu.c                            |  12 +-
 .../cl_internal_block_motion_estimate_intel.cl     |   2 +-
 src/kernels/cl_internal_copy_buffer_to_image_2d.cl |   2 +-
 src/kernels/cl_internal_copy_buffer_to_image_3d.cl |   2 +-
 utests/CMakeLists.txt                              |   9 +-
 utests/builtin_global_linear_id.cpp                |   4 +-
 utests/builtin_global_size.cpp                     |   2 +-
 .../builtin_kernel_block_motion_estimate_intel.cpp |  10 +-
 utests/builtin_kernel_max_global_size.cpp          |   4 +-
 utests/builtin_local_size.cpp                      |   2 +-
 utests/builtin_num_groups.cpp                      |   2 +-
 utests/compiler_sub_group_shuffle_up.cpp           |   2 +-
 utests/compiler_subgroup_broadcast.cpp             |   2 +-
 utests/image_1D_buffer.cpp                         |   6 +-
 utests/runtime_climage_from_boname.cpp             |   2 +-
 utests/runtime_cmrt.cpp                            |  16 +-
 utests/runtime_flat_address_space.cpp              |   2 +-
 utests/utest_generator.py                          |   2 +-
 utests/utest_helper.cpp                            |  24 +-
 utests/utest_helper.hpp                            |   3 +
 utests/utest_math_gen.py                           |  12 +-
 63 files changed, 962 insertions(+), 432 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2bc2100..bac054c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,7 +18,7 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
 PROJECT(OCL)
 set (LIBCL_DRIVER_VERSION_MAJOR 1)
 set (LIBCL_DRIVER_VERSION_MINOR 2)
-set (LIBCL_DRIVER_VERSION_PATCH 0)
+set (LIBCL_DRIVER_VERSION_PATCH 1)
 set (LIBCL_C_VERSION_MAJOR 1)
 set (LIBCL_C_VERSION_MINOR 2)
 if( ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
diff --git a/GetGenID.sh b/GetGenID.sh
index 6181105..a0e5f85 100755
--- a/GetGenID.sh
+++ b/GetGenID.sh
@@ -15,7 +15,10 @@ genpciid+=(22b0 22b1 22b2 22b3)
 #SKL
 genpciid+=(1906 1916 1926 190e 191e 1902 1912 1932 190b 191b 192b 193b 190a 191a 192a 193a)
 #BXT
-genpciid+=(5a84)
+genpciid+=(5a84 5a85)
+#KBL
+genpciid+=(5906 5916 5926 5913 5921 5923 5927 5902 5912 5917)
+genpciid+=(590b 591b 593b 5908 590e 591e 5915 590a 591a 591d)
 pciid=($(lspci -nn | grep "\[8086:.*\]" -o | awk -F : '{print $2}' | awk -F ] '{print $1}'))
 n=${#pciid[*]}
 i=0
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 5809835..09b38b2 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -328,7 +328,7 @@ namespace gbe
     assert(insn.opcode == SEL_OP_SIMD_SHUFFLE);
     assert (src1.file != GEN_IMMEDIATE_VALUE);
 
-    uint32_t base = src0.nr * 32 + src0.subnr * 4;
+    uint32_t base = src0.nr * 32 + src0.subnr;
     GenRegister baseReg = GenRegister::immuw(base);
     const GenRegister a0 = GenRegister::addr8(0);
     p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
@@ -1590,7 +1590,7 @@ namespace gbe
        wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
        wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
    {
-     p->curr.execWidth = 16;
+     p->curr.execWidth = simd;
      /* value exchanged with other threads */
      p->MOV(threadExchangeData, result[0]);
      /* partial result thread */
@@ -1600,7 +1600,7 @@ namespace gbe
        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
    {
-     p->curr.execWidth = 16;
+     p->curr.execWidth = simd;
      /* value exchanged with other threads */
      p->MOV(threadExchangeData, result[simd - 1]);
      /* partial result thread */
@@ -1614,7 +1614,7 @@ namespace gbe
      /* set result[0] to min/max/null */
      wgOpInitValue(p, result[0], wg_op);
 
-     p->curr.execWidth = 16;
+     p->curr.execWidth = simd;
      /* value exchanged with other threads */
      wgOpPerform(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
      /* partial result thread */
@@ -1675,7 +1675,7 @@ namespace gbe
     /* do some calculation within each thread */
     wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
 
-    p->curr.execWidth = 16;
+    p->curr.execWidth = simd;
     p->MOV(theVal, dst);
     threadData = GenRegister::toUniform(threadData, dst.type);
 
@@ -1790,13 +1790,13 @@ namespace gbe
       wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
     {
       /* save result to final register location dst */
-      p->curr.execWidth = 16;
+      p->curr.execWidth = simd;
       p->MOV(dst, partialData);
     }
     else
     {
       /* save result to final register location dst */
-      p->curr.execWidth = 16;
+      p->curr.execWidth = simd;
 
       if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
           || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
@@ -1845,7 +1845,7 @@ namespace gbe
         p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
 
-        p->curr.execWidth = 16;
+        p->curr.execWidth = simd;
         p->MOV(dst, theVal);
       } p->pop();
     }
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index b429ec3..4f73237 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -702,7 +702,7 @@ namespace gbe
     assert(insn.opcode == SEL_OP_SIMD_SHUFFLE);
     assert (src1.file != GEN_IMMEDIATE_VALUE);
 
-    uint32_t base = src0.nr * 32 + src0.subnr * 4;
+    uint32_t base = src0.nr * 32 + src0.subnr;
     GenRegister baseReg = GenRegister::immuw(base);
     const GenRegister a0 = GenRegister::addr8(0);
     uint32_t simd = p->curr.execWidth;
@@ -3113,7 +3113,7 @@ namespace gbe
        wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
        wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
    {
-     p->curr.execWidth = 16;
+     p->curr.execWidth = simd;
      /* value exchanged with other threads */
      p->MOV(threadExchangeData, result[0]);
      /* partial result thread */
@@ -3123,7 +3123,7 @@ namespace gbe
        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
    {
-     p->curr.execWidth = 16;
+     p->curr.execWidth = simd;
      /* value exchanged with other threads */
      p->MOV(threadExchangeData, result[simd - 1]);
      /* partial result thread */
@@ -3137,7 +3137,7 @@ namespace gbe
      /* set result[0] to min/max/null */
      wgOpInitValue(p, result[0], wg_op);
 
-     p->curr.execWidth = 16;
+     p->curr.execWidth = simd;
      /* value exchanged with other threads */
      wgOpPerform(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
      /* partial result thread */
@@ -3198,7 +3198,7 @@ namespace gbe
     /* do some calculation within each thread */
     wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
 
-    p->curr.execWidth = 16;
+    p->curr.execWidth = simd;
     p->MOV(theVal, dst);
     threadData = GenRegister::toUniform(threadData, dst.type);
 
@@ -3313,13 +3313,13 @@ namespace gbe
       wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
     {
       /* save result to final register location dst */
-      p->curr.execWidth = 16;
+      p->curr.execWidth = simd;
       p->MOV(dst, partialData);
     }
     else
     {
       /* save result to final register location dst */
-      p->curr.execWidth = 16;
+      p->curr.execWidth = simd;
 
       if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
           || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
@@ -3368,7 +3368,7 @@ namespace gbe
         p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
 
-        p->curr.execWidth = 16;
+        p->curr.execWidth = simd;
         p->MOV(dst, theVal);
       } p->pop();
     }
@@ -3411,13 +3411,14 @@ namespace gbe
   void GenContext::emitPrintfInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister tmp0 = ra->genReg(insn.dst(1));
+    const GenRegister tmp1 = ra->genReg(insn.dst(2));
     GenRegister src;
     uint32_t srcNum = insn.srcNum;
     if (insn.extra.continueFlag)
       srcNum--;
 
     GenRegister addr = GenRegister::retype(tmp0, GEN_TYPE_UD);
-    GenRegister data = GenRegister::offset(addr, 2);
+    GenRegister data = GenRegister::retype(tmp1, GEN_TYPE_UD);
 
     if (!insn.extra.continueFlag) {
       p->push(); {
@@ -3507,7 +3508,7 @@ namespace gbe
 
       // Update the header with the current address
       p->curr.execWidth = 1;
-      p->SHR(headeraddr, addr, GenRegister::immud(4));
+      p->MOV(headeraddr, addr);
 
       // Put zero in the general state base address
       p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
@@ -3540,7 +3541,7 @@ namespace gbe
             {
               // Update the address in header
               p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
             }
             p->pop();
           }
@@ -3561,7 +3562,7 @@ namespace gbe
             {
               // Update the address in header
               p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
             }
             p->pop();
           }
@@ -3661,18 +3662,13 @@ namespace gbe
   }
 
   void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) {
-    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister dst = ra->genReg(insn.dst(1));
     const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)),GEN_TYPE_D);
     const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)),GEN_TYPE_D);
-    GenRegister header, offsetx, offsety, blocksizereg;
-    if (simdWidth == 8)
-      header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    else
-      header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(2)),1), GEN_TYPE_UD);
-
-    offsetx = GenRegister::offset(header, 0, 0*4);
-    offsety = GenRegister::offset(header, 0, 1*4);
-    blocksizereg = GenRegister::offset(header, 0, 2*4);
+    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
+    const GenRegister offsetx = GenRegister::offset(header, 0, 0*4);
+    const GenRegister offsety = GenRegister::offset(header, 0, 1*4);
+    const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4);
     size_t vec_size = insn.extra.elem;
     uint32_t blocksize = 0x1F | (vec_size-1) << 16;
 
@@ -3699,7 +3695,7 @@ namespace gbe
     }
     else if (simdWidth == 16)
     {
-      const GenRegister tmp = ra->genReg(insn.dst(vec_size));
+      const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(vec_size + 1)), GEN_TYPE_UD);
       p->push();
         // Copy r0 into the header first
         p->curr.execWidth = 8;
@@ -3717,23 +3713,22 @@ namespace gbe
         // Now read the data
         p->curr.execWidth = 8;
         p->MBREAD(tmp, header, insn.getbti(), vec_size);
+        for (uint32_t i = 0; i < vec_size; i++)
+          p->MOV(ra->genReg(insn.dst(i + 1)), GenRegister::offset(tmp, i));
 
         // Second half
         // Update the header with the coord
         p->curr.execWidth = 1;
         p->ADD(offsetx, offsetx, GenRegister::immud(32));
 
-        const GenRegister tmp2 = GenRegister::offset(tmp, vec_size);
         // Now read the data
         p->curr.execWidth = 8;
-        p->MBREAD(tmp2, header, insn.getbti(), vec_size);
+        p->MBREAD(tmp, header, insn.getbti(), vec_size);
 
         // Move the reg to fit vector rule.
-        for (uint32_t i = 0; i < vec_size; i++) {
-          p->MOV(GenRegister::offset(dst, i * 2), GenRegister::offset(tmp, i));
-          p->MOV(GenRegister::offset(dst, i * 2 + 1),
-                 GenRegister::offset(tmp2, i));
-        }
+        for (uint32_t i = 0; i < vec_size; i++)
+          p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
+                 GenRegister::offset(tmp, i));
       p->pop();
     } else NOT_IMPLEMENTED;
   }
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index f8c99b2..975e1c7 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1270,7 +1270,7 @@ namespace gbe
                 insn,
                 bti,
                 size,
-                GEN7_OBLOCK_READ,
+                GEN7_UNALIGNED_OBLOCK_READ,
                 msg_length,
                 response_length);
   }
diff --git a/backend/src/backend/gen_insn_compact.cpp b/backend/src/backend/gen_insn_compact.cpp
index 5de451c..62fcb61 100644
--- a/backend/src/backend/gen_insn_compact.cpp
+++ b/backend/src/backend/gen_insn_compact.cpp
@@ -587,7 +587,7 @@ namespace gbe {
     if(src0->file == GEN_IMMEDIATE_VALUE)
       return -1;
 
-    compact_table_entry *r;
+    compact_table_entry *r = NULL;
     if(p->getCompactVersion() == 7) {
       DataTypeBits b;
       b.data = 0;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 6cfa87f..2b89c7f 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -1770,13 +1770,15 @@ namespace gbe
   GenRegister Selection::Opaque::getLaneIDReg()
   {
     const GenRegister laneID = GenRegister::immv(0x76543210);
-    ir::Register r = reg(ir::RegisterFamily::FAMILY_WORD);
-    const GenRegister dst = selReg(r, ir::TYPE_U16);
+    GenRegister dst;
 
     uint32_t execWidth = curr.execWidth;
-    if (execWidth == 8)
+    if (execWidth == 8) {
+      // Work around to force the register 32 alignmet
+      dst = selReg(reg(ir::RegisterFamily::FAMILY_DWORD), ir::TYPE_U16);
       MOV(dst, laneID);
-    else {
+    } else {
+      dst = selReg(reg(ir::RegisterFamily::FAMILY_WORD), ir::TYPE_U16);
       push();
       curr.execWidth = 8;
       curr.noMask = 1;
@@ -2086,30 +2088,33 @@ namespace gbe
                                  uint32_t vec_size) {
 
     uint32_t simdWidth = curr.execWidth;
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8, 3);
-    SelectionVector *vector = this->appendVector();
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8 + 1, 2);
+
+    insn->dst(0) = header;
     for (uint32_t i = 0; i < vec_size; ++i) {
-      insn->dst(i) = dsts[i];
+      insn->dst(i + 1) = dsts[i];
       if(simdWidth == 16)
-        insn->dst(i + vec_size) = tmp[i];
+        insn->dst(i + vec_size + 1) = tmp[i];
     }
     insn->src(0) = coordx;
     insn->src(1) = coordy;
-    insn->src(2) = header;
     insn->setbti(bti);
     insn->extra.elem = vec_size; // vector size
 
-    vector->regNum = vec_size;
-    vector->reg = &insn->dst(0);
-    vector->offsetID = 0;
-    vector->isSrc = 0;
-
+    // Only in simd 8 the data is in vector form
+    if(simdWidth == 8) {
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = vec_size;
+      vector->reg = &insn->dst(1);
+      vector->offsetID = 1;
+      vector->isSrc = 0;
+    }
     if(simdWidth == 16)
     {
       SelectionVector *vectortmp = this->appendVector();
       vectortmp->regNum = vec_size;
-      vectortmp->reg = &insn->dst(vec_size);
-      vectortmp->offsetID = vec_size;
+      vectortmp->reg = &insn->dst(vec_size + 1);
+      vectortmp->offsetID = vec_size + 1;
       vectortmp->isSrc = 0;
     }
   }
@@ -2218,6 +2223,16 @@ namespace gbe
         if (!ld.isAligned())
           return false;
       }
+      //If dst is a bool reg, the insn may modify flag, can't use this flag
+      //as predication, so can't remove if/endif. For example ir:
+      //%or.cond1244 = or i1 %cmp.i338, %cmp2.i403
+      //%or.cond1245 = or i1 %or.cond1244, %cmp3.i405
+      //asm:
+      //(+f1.0) or.ne(16)       g20<1>:W        g9<8,8,1>:W     g1<8,8,1>:W
+      //(+f1.1) or.ne.f1.1(16)  g21<1>:W        g20<8,8,1>:W    g30<8,8,1>:W
+      //The second insn is error.
+      if(insn.getDstNum() && getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL)
+          return false;
     }
 
     // there would generate a extra CMP instruction for predicated BRA with extern flag,
@@ -2783,17 +2798,12 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     static ir::Type getType(const ir::Opcode opcode, const ir::Type insnType, bool isSrc = false) {
       if (opcode == ir::OP_CBIT)
         return isSrc ? insnType : ir::TYPE_U32;
-      if (insnType == ir::TYPE_S64 || insnType == ir::TYPE_U64 || insnType == ir::TYPE_S8 || insnType == ir::TYPE_U8)
-        return insnType;
-      if (opcode == ir::OP_FBH || opcode == ir::OP_FBL || opcode == ir::OP_LZD)
-        return ir::TYPE_U32;
-      if (opcode == ir::OP_SIMD_ANY || opcode == ir::OP_SIMD_ALL)
-        return ir::TYPE_S32;
-      if (insnType == ir::TYPE_S16 || insnType == ir::TYPE_U16)
-        return insnType;
       if (insnType == ir::TYPE_BOOL)
         return ir::TYPE_U16;
-      return ir::TYPE_FLOAT;
+      else if (opcode == ir::OP_MOV && (insnType == ir::TYPE_U32 || insnType == ir::TYPE_S32))
+        return ir::TYPE_FLOAT;
+      else
+        return insnType;
     }
 
     INLINE bool emitOne(Selection::Opaque &sel, const ir::UnaryInstruction &insn, bool &markChildren) const {
@@ -6166,13 +6176,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       uint32_t totalSize = 0;
       bool isContinue = false;
       GBE_ASSERT(sel.ctx.getSimdWidth() == 16 || sel.ctx.getSimdWidth() == 8);
-      if (sel.ctx.getSimdWidth() == 16) {
-        tmp0 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
-        tmp1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
-      } else {
-        tmp0 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), GEN_TYPE_UD);
-        tmp1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), GEN_TYPE_UD);
-      }
+      tmp0 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+      tmp1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
 
       /* Get the total size for one printf statement. */
       for (i = 0; i < srcNum; i++) {
@@ -6713,11 +6718,11 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       for (uint32_t i = 0; i < vec_size; ++i) {
         valuesVec.push_back(sel.selReg(insn.getDst(i), TYPE_U32));
         if(simdWidth == 16)
-          tmpVec.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32));
+          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
       }
       const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
       const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
-      const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
       GenRegister *tmp = NULL;
       if(simdWidth == 16)
         tmp = &tmpVec[0];
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 5e28ec9..14ac05f 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -94,8 +94,8 @@ namespace gbe
     void setDBGInfo(DebugInfo in) { DBGInfo = in; }
     /*! No more than 40 sources (40 sources are used by vme for payload passing and setting) */
     enum { MAX_SRC_NUM = 40 };
-    /*! No more than 16 destinations (15 used by I64DIV/I64REM) */
-    enum { MAX_DST_NUM = 16 };
+    /*! No more than 17 destinations (17 used by image block read8) */
+    enum { MAX_DST_NUM = 17 };
     /*! State of the instruction (extra fields neeed for the encoding) */
     GenInstructionState state;
     union {
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 32f7794..4ef82d1 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -334,7 +334,11 @@ namespace gbe {
     //the first byte stands for binary_type.
     binary_content.assign(binary+1, size-1);
     llvm::StringRef llvm_bin_str(binary_content);
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+    llvm::LLVMContext& c = GBEGetLLVMContext();
+#else
     llvm::LLVMContext& c = llvm::getGlobalContext();
+#endif
     llvm::SMDiagnostic Err;
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
     std::unique_ptr<llvm::MemoryBuffer> memory_buffer = llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
@@ -488,10 +492,17 @@ namespace gbe {
 #endif
       errSize = 0;
     }else{
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+      // Src now will be removed automatically. So clone it.
+      llvm::Module* src = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module).release();
+#else
       llvm::Module* src = (llvm::Module*)((GenProgram*)src_program)->module;
+#endif
       llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)->module;
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+      if (LLVMLinkModules2(wrap(dst), wrap(src))) {
+#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
       if (LLVMLinkModules(wrap(dst), wrap(src), LLVMLinkerPreserveSource_Removed, &errMsg)) {
 #else
       if (LLVMLinkModules(wrap(dst), wrap(src), LLVMLinkerPreserveSource, &errMsg)) {
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index bbea761..a8eb2e4 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -1267,6 +1267,11 @@ namespace gbe
         } else if (reg.type == GEN_TYPE_W) {
           const uint16_t uw = reg.value.ud & 0xffff;
           reg = GenRegister::immw(-(int16_t)uw);
+        } else if (reg.type == GEN_TYPE_HF_IMM) {
+          const uint16_t uw = reg.value.ud & 0xffff;
+          reg = GenRegister::immh(uw ^ 0x8000);
+        } else if (reg.type == GEN_TYPE_DF_IMM) {
+          reg.value.df = -reg.value.df;
         } else
           NOT_SUPPORTED;
       }
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index b7dc00e..2224880 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -133,7 +133,13 @@ namespace gbe {
     bool strictMath = true;
     if (fast_relaxed_math || !OCL_STRICT_CONFORMANCE)
       strictMath = false;
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+    llvm::Module * linked_module = module ? llvm::CloneModule((llvm::Module*)module).release() : NULL;
+    // Src now will be removed automatically. So clone it.
+    if (llvmToGen(*unit, fileName, linked_module, optLevel, strictMath, OCL_PROFILING_LOG, error) == false) {
+#else
     if (llvmToGen(*unit, fileName, module, optLevel, strictMath, OCL_PROFILING_LOG, error) == false) {
+#endif
       if (fileName)
         error = std::string(fileName) + " not found";
       delete unit;
@@ -1057,7 +1063,11 @@ EXTEND_QUOTE:
     //FIXME: if use new allocated context to link two modules there would be context mismatch
     //for some functions, so we use global context now, need switch to new context later.
     llvm::Module * out_module;
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+    llvm::LLVMContext* llvm_ctx = &GBEGetLLVMContext();
+#else
     llvm::LLVMContext* llvm_ctx = &llvm::getGlobalContext();
+#endif
 
     if (buildModuleFromSource(source, &out_module, llvm_ctx, dumpLLVMFileName, dumpSPIRBinaryName, clOpt,
                               stringSize, err, errSize)) {
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index ae0a702..71a6d07 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -208,22 +208,22 @@ namespace ir {
       }
 #else
       bool isImage1dT() const {
-        return typeBaseName.compare("image1d_t") == 0;
+        return typeBaseName.find("image1d_t") !=std::string::npos;
       }
       bool isImage1dArrayT() const {
-        return typeBaseName.compare("image1d_array_t") == 0;
+        return typeBaseName.find("image1d_array_t") !=std::string::npos;
       }
       bool isImage1dBufferT() const {
-        return typeBaseName.compare("image1d_buffer_t") == 0;
+        return typeBaseName.find("image1d_buffer_t") !=std::string::npos;
       }
       bool isImage2dT() const {
-        return typeBaseName.compare("image2d_t") == 0;
+        return typeBaseName.find("image2d_t") !=std::string::npos;
       }
       bool isImage2dArrayT() const {
-        return typeBaseName.compare("image2d_array_t") == 0;
+        return typeBaseName.find("image2d_array_t") !=std::string::npos;
       }
       bool isImage3dT() const {
-        return typeBaseName.compare("image3d_t") == 0;
+        return typeBaseName.find("image3d_t") !=std::string::npos;
       }
       bool isSamplerType() const {
         return typeBaseName.compare("sampler_t") == 0;
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index abb2bd4..5e3a788 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -18,6 +18,67 @@
 #ifndef __OCL_H__
 #define __OCL_H__
 
+/* LLVM 3.9 has these pre defined undef them first */
+#ifdef cl_khr_3d_image_writes
+#undef cl_khr_3d_image_writes
+#endif
+#ifdef cl_khr_byte_addressable_store
+#undef cl_khr_byte_addressable_store
+#endif
+#ifdef cl_khr_fp16
+#undef cl_khr_fp16
+#endif
+#ifdef cl_khr_fp64
+#undef cl_khr_fp64
+#endif
+#ifdef cl_khr_global_int32_base_atomics
+#undef cl_khr_global_int32_base_atomics
+#endif
+#ifdef cl_khr_global_int32_extended_atomics
+#undef cl_khr_global_int32_extended_atomics
+#endif
+#ifdef cl_khr_gl_sharing
+#undef cl_khr_gl_sharing
+#endif
+#ifdef cl_khr_icd
+#undef cl_khr_icd
+#endif
+#ifdef cl_khr_local_int32_base_atomics
+#undef cl_khr_local_int32_base_atomics
+#endif
+#ifdef cl_khr_local_int32_extended_atomics
+#undef cl_khr_local_int32_extended_atomics
+#endif
+
+#ifdef cl_khr_d3d10_sharing
+#undef cl_khr_d3d10_sharing
+#endif
+#ifdef cl_khr_gl_event
+#undef cl_khr_gl_event
+#endif
+#ifdef cl_khr_int64_base_atomics
+#undef cl_khr_int64_base_atomics
+#endif
+#ifdef cl_khr_int64_extended_atomics
+#undef cl_khr_int64_extended_atomics
+#endif
+
+#ifdef cl_khr_d3d11_sharing
+#undef cl_khr_d3d11_sharing
+#endif
+#ifdef cl_khr_depth_images
+#undef cl_khr_depth_images
+#endif
+#ifdef cl_khr_dx9_media_sharing
+#undef cl_khr_dx9_media_sharing
+#endif
+#ifdef cl_khr_gl_depth_images
+#undef cl_khr_gl_depth_images
+#endif
+#ifdef cl_khr_spir
+#undef cl_khr_spir
+#endif
+
 #include "ocl_defines.h"
 #include "ocl_types.h"
 #include "ocl_as.h"
@@ -40,6 +101,20 @@
 #include "ocl_workitem.h"
 #include "ocl_simd.h"
 #include "ocl_work_group.h"
+
+/* Move these out from ocl_defines.h for only one define */
+#define cl_khr_global_int32_base_atomics
+#define cl_khr_global_int32_extended_atomics
+#define cl_khr_local_int32_base_atomics
+#define cl_khr_local_int32_extended_atomics
+#define cl_khr_byte_addressable_store
+#define cl_khr_icd
+#define cl_khr_gl_sharing
+#define cl_khr_spir
+#define cl_khr_fp16
+#define cl_khr_3d_image_writes
+#define cl_intel_subgroups
+
 #pragma OPENCL EXTENSION cl_khr_fp64 : disable
 #pragma OPENCL EXTENSION cl_khr_fp16 : disable
 #endif
diff --git a/backend/src/libocl/include/ocl_image.h b/backend/src/libocl/include/ocl_image.h
index 39106cf..cdb3411 100644
--- a/backend/src/libocl/include/ocl_image.h
+++ b/backend/src/libocl/include/ocl_image.h
@@ -20,152 +20,189 @@
 
 #include "ocl_types.h"
 
-OVERLOADABLE int4 read_imagei(image1d_t cl_image, const sampler_t sampler, int coord);
-OVERLOADABLE int4 read_imagei(image1d_t cl_image, const sampler_t sampler, float coord);
-OVERLOADABLE int4 read_imagei(image1d_t cl_image, int coord);
-OVERLOADABLE void write_imagei(image1d_t cl_image, int coord, int4 color);
-OVERLOADABLE void write_imagei(image1d_t cl_image, float coord, int4 color);
-OVERLOADABLE uint4 read_imageui(image1d_t cl_image, const sampler_t sampler, int coord);
-OVERLOADABLE uint4 read_imageui(image1d_t cl_image, const sampler_t sampler, float coord);
-OVERLOADABLE uint4 read_imageui(image1d_t cl_image, int coord);
-OVERLOADABLE void write_imageui(image1d_t cl_image, int coord, uint4 color);
-OVERLOADABLE void write_imageui(image1d_t cl_image, float coord, uint4 color);
-OVERLOADABLE float4 read_imagef(image1d_t cl_image, const sampler_t sampler, int coord);
-OVERLOADABLE float4 read_imagef(image1d_t cl_image, const sampler_t sampler, float coord);
-OVERLOADABLE float4 read_imagef(image1d_t cl_image, int coord);
-OVERLOADABLE void write_imagef(image1d_t cl_image, int coord, float4 color);
-OVERLOADABLE void write_imagef(image1d_t cl_image, float coord, float4 color);
-OVERLOADABLE int4 read_imagei(image1d_buffer_t cl_image, int coord);
-OVERLOADABLE void write_imagei(image1d_buffer_t cl_image, int coord, int4 color);
-OVERLOADABLE uint4 read_imageui(image1d_buffer_t cl_image, int coord);
-OVERLOADABLE void write_imageui(image1d_buffer_t cl_image, int coord, uint4 color);
-OVERLOADABLE void write_imageui(image1d_buffer_t cl_image, float coord, uint4 color);
-OVERLOADABLE float4 read_imagef(image1d_buffer_t cl_image, int coord);
-OVERLOADABLE void write_imagef(image1d_buffer_t cl_image, int coord, float4 color);
-
-OVERLOADABLE int get_image_channel_data_type(image1d_t image);
-OVERLOADABLE int get_image_channel_order(image1d_t image);
-OVERLOADABLE int get_image_width(image1d_t image);
-OVERLOADABLE int get_image_channel_data_type(image1d_buffer_t image);
-OVERLOADABLE int get_image_channel_order(image1d_buffer_t image);
-OVERLOADABLE int get_image_width(image1d_buffer_t image);
-OVERLOADABLE int4 read_imagei(image2d_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE int4 read_imagei(image2d_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE int4 read_imagei(image2d_t cl_image, int2 coord);
-OVERLOADABLE void write_imagei(image2d_t cl_image, int2 coord, int4 color);
-OVERLOADABLE void write_imagei(image2d_t cl_image, float2 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(image2d_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE uint4 read_imageui(image2d_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE uint4 read_imageui(image2d_t cl_image, int2 coord);
-OVERLOADABLE void write_imageui(image2d_t cl_image, int2 coord, uint4 color);
-OVERLOADABLE void write_imageui(image2d_t cl_image, float2 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(image2d_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE float4 read_imagef(image2d_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE float4 read_imagef(image2d_t cl_image, int2 coord);
-OVERLOADABLE void write_imagef(image2d_t cl_image, int2 coord, float4 color);
-OVERLOADABLE void write_imagef(image2d_t cl_image, float2 coord, float4 color);
-OVERLOADABLE int4 read_imagei(image1d_array_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE int4 read_imagei(image1d_array_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE int4 read_imagei(image1d_array_t cl_image, int2 coord);
-OVERLOADABLE void write_imagei(image1d_array_t cl_image, int2 coord, int4 color);
-OVERLOADABLE void write_imagei(image1d_array_t cl_image, float2 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(image1d_array_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE uint4 read_imageui(image1d_array_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE uint4 read_imageui(image1d_array_t cl_image, int2 coord);
-OVERLOADABLE void write_imageui(image1d_array_t cl_image, int2 coord, uint4 color);
-OVERLOADABLE void write_imageui(image1d_array_t cl_image, float2 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(image1d_array_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE float4 read_imagef(image1d_array_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE float4 read_imagef(image1d_array_t cl_image, int2 coord);
-OVERLOADABLE void write_imagef(image1d_array_t cl_image, int2 coord, float4 color);
-OVERLOADABLE void write_imagef(image1d_array_t cl_image, float2 coord, float4 color);
-
-OVERLOADABLE int get_image_channel_data_type(image2d_t image);
-OVERLOADABLE int get_image_channel_order(image2d_t image);
-OVERLOADABLE int get_image_width(image2d_t image);
-OVERLOADABLE int get_image_height(image2d_t image);
-OVERLOADABLE int2 get_image_dim(image2d_t image);
-
-OVERLOADABLE int get_image_channel_data_type(image1d_array_t image);
-OVERLOADABLE int get_image_channel_order(image1d_array_t image);
-OVERLOADABLE int get_image_width(image1d_array_t image);
-OVERLOADABLE size_t get_image_array_size(image1d_array_t image);
-OVERLOADABLE int4 read_imagei(image3d_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE int4 read_imagei(image3d_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE int4 read_imagei(image3d_t cl_image, int4 coord);
-OVERLOADABLE void write_imagei(image3d_t cl_image, int4 coord, int4 color);
-OVERLOADABLE void write_imagei(image3d_t cl_image, float4 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(image3d_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE uint4 read_imageui(image3d_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE uint4 read_imageui(image3d_t cl_image, int4 coord);
-OVERLOADABLE void write_imageui(image3d_t cl_image, int4 coord, uint4 color);
-OVERLOADABLE void write_imageui(image3d_t cl_image, float4 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(image3d_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE float4 read_imagef(image3d_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE float4 read_imagef(image3d_t cl_image, int4 coord);
-OVERLOADABLE void write_imagef(image3d_t cl_image, int4 coord, float4 color);
-OVERLOADABLE void write_imagef(image3d_t cl_image, float4 coord, float4 color);
-
-OVERLOADABLE int4 read_imagei(image3d_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE int4 read_imagei(image3d_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE int4 read_imagei(image3d_t cl_image, int3 coord);
-OVERLOADABLE void write_imagei(image3d_t cl_image, int3 coord, int4 color);
-OVERLOADABLE void write_imagei(image3d_t cl_image, float3 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(image3d_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE uint4 read_imageui(image3d_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE uint4 read_imageui(image3d_t cl_image, int3 coord);
-OVERLOADABLE void write_imageui(image3d_t cl_image, int3 coord, uint4 color);
-OVERLOADABLE void write_imageui(image3d_t cl_image, float3 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(image3d_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE float4 read_imagef(image3d_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE float4 read_imagef(image3d_t cl_image, int3 coord);
-OVERLOADABLE void write_imagef(image3d_t cl_image, int3 coord, float4 color);
-OVERLOADABLE void write_imagef(image3d_t cl_image, float3 coord, float4 color);
-OVERLOADABLE int4 read_imagei(image2d_array_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE int4 read_imagei(image2d_array_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE int4 read_imagei(image2d_array_t cl_image, int4 coord);
-OVERLOADABLE void write_imagei(image2d_array_t cl_image, int4 coord, int4 color);
-OVERLOADABLE void write_imagei(image2d_array_t cl_image, float4 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(image2d_array_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE uint4 read_imageui(image2d_array_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE uint4 read_imageui(image2d_array_t cl_image, int4 coord);
-OVERLOADABLE void write_imageui(image2d_array_t cl_image, int4 coord, uint4 color);
-OVERLOADABLE void write_imageui(image2d_array_t cl_image, float4 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(image2d_array_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE float4 read_imagef(image2d_array_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE float4 read_imagef(image2d_array_t cl_image, int4 coord);
-OVERLOADABLE void write_imagef(image2d_array_t cl_image, int4 coord, float4 color);
-OVERLOADABLE void write_imagef(image2d_array_t cl_image, float4 coord, float4 color);
-
-OVERLOADABLE int4 read_imagei(image2d_array_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE int4 read_imagei(image2d_array_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE int4 read_imagei(image2d_array_t cl_image, int3 coord);
-OVERLOADABLE void write_imagei(image2d_array_t cl_image, int3 coord, int4 color);
-OVERLOADABLE void write_imagei(image2d_array_t cl_image, float3 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(image2d_array_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE uint4 read_imageui(image2d_array_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE uint4 read_imageui(image2d_array_t cl_image, int3 coord);
-OVERLOADABLE void write_imageui(image2d_array_t cl_image, int3 coord, uint4 color);
-OVERLOADABLE void write_imageui(image2d_array_t cl_image, float3 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(image2d_array_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE float4 read_imagef(image2d_array_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE float4 read_imagef(image2d_array_t cl_image, int3 coord);
-OVERLOADABLE void write_imagef(image2d_array_t cl_image, int3 coord, float4 color);
-OVERLOADABLE void write_imagef(image2d_array_t cl_image, float3 coord, float4 color);
-
-OVERLOADABLE int get_image_channel_data_type(image3d_t image);
-OVERLOADABLE int get_image_channel_order(image3d_t image);
-OVERLOADABLE int get_image_width(image3d_t image);
-OVERLOADABLE int get_image_height(image3d_t image);
-OVERLOADABLE int get_image_depth(image3d_t image);
-OVERLOADABLE int4 get_image_dim(image3d_t image);
-
-
-OVERLOADABLE int get_image_channel_data_type(image2d_array_t image);
-OVERLOADABLE int get_image_channel_order(image2d_array_t image);
-OVERLOADABLE int get_image_width(image2d_array_t image);
-OVERLOADABLE int get_image_height(image2d_array_t image);
-OVERLOADABLE int2 get_image_dim(image2d_array_t image);
-OVERLOADABLE size_t get_image_array_size(image2d_array_t image);
+OVERLOADABLE int4 read_imagei(read_only image1d_t cl_image, const sampler_t sampler, int coord);
+OVERLOADABLE int4 read_imagei(read_only image1d_t cl_image, const sampler_t sampler, float coord);
+OVERLOADABLE int4 read_imagei(read_only image1d_t cl_image, int coord);
+OVERLOADABLE void write_imagei(write_only image1d_t cl_image, int coord, int4 color);
+OVERLOADABLE void write_imagei(write_only image1d_t cl_image, float coord, int4 color);
+OVERLOADABLE uint4 read_imageui(read_only image1d_t cl_image, const sampler_t sampler, int coord);
+OVERLOADABLE uint4 read_imageui(read_only image1d_t cl_image, const sampler_t sampler, float coord);
+OVERLOADABLE uint4 read_imageui(read_only image1d_t cl_image, int coord);
+OVERLOADABLE void write_imageui(write_only image1d_t cl_image, int coord, uint4 color);
+OVERLOADABLE void write_imageui(write_only image1d_t cl_image, float coord, uint4 color);
+OVERLOADABLE float4 read_imagef(read_only image1d_t cl_image, const sampler_t sampler, int coord);
+OVERLOADABLE float4 read_imagef(read_only image1d_t cl_image, const sampler_t sampler, float coord);
+OVERLOADABLE float4 read_imagef(read_only image1d_t cl_image, int coord);
+OVERLOADABLE void write_imagef(write_only image1d_t cl_image, int coord, float4 color);
+OVERLOADABLE void write_imagef(write_only image1d_t cl_image, float coord, float4 color);
+OVERLOADABLE int4 read_imagei(read_only image1d_buffer_t cl_image, int coord);
+OVERLOADABLE void write_imagei(write_only image1d_buffer_t cl_image, int coord, int4 color);
+OVERLOADABLE uint4 read_imageui(read_only image1d_buffer_t cl_image, int coord);
+OVERLOADABLE void write_imageui(write_only image1d_buffer_t cl_image, int coord, uint4 color);
+OVERLOADABLE void write_imageui(write_only image1d_buffer_t cl_image, float coord, uint4 color);
+OVERLOADABLE float4 read_imagef(read_only image1d_buffer_t cl_image, int coord);
+OVERLOADABLE void write_imagef(write_only image1d_buffer_t cl_image, int coord, float4 color);
+
+OVERLOADABLE int get_image_channel_data_type(read_only image1d_t image);
+OVERLOADABLE int get_image_channel_order(read_only image1d_t image);
+OVERLOADABLE int get_image_width(read_only image1d_t image);
+
+OVERLOADABLE int get_image_channel_data_type(read_only image1d_buffer_t image);
+OVERLOADABLE int get_image_channel_order(read_only image1d_buffer_t image);
+OVERLOADABLE int get_image_width(read_only image1d_buffer_t image);
+
+OVERLOADABLE int4 read_imagei(read_only image2d_t cl_image, const sampler_t sampler, int2 coord);
+OVERLOADABLE int4 read_imagei(read_only image2d_t cl_image, const sampler_t sampler, float2 coord);
+OVERLOADABLE int4 read_imagei(read_only image2d_t cl_image, int2 coord);
+OVERLOADABLE void write_imagei(write_only image2d_t cl_image, int2 coord, int4 color);
+OVERLOADABLE void write_imagei(write_only image2d_t cl_image, float2 coord, int4 color);
+OVERLOADABLE uint4 read_imageui(read_only image2d_t cl_image, const sampler_t sampler, int2 coord);
+OVERLOADABLE uint4 read_imageui(read_only image2d_t cl_image, const sampler_t sampler, float2 coord);
+OVERLOADABLE uint4 read_imageui(read_only image2d_t cl_image, int2 coord);
+OVERLOADABLE void write_imageui(write_only image2d_t cl_image, int2 coord, uint4 color);
+OVERLOADABLE void write_imageui(write_only image2d_t cl_image, float2 coord, uint4 color);
+OVERLOADABLE float4 read_imagef(read_only image2d_t cl_image, const sampler_t sampler, int2 coord);
+OVERLOADABLE float4 read_imagef(read_only image2d_t cl_image, const sampler_t sampler, float2 coord);
+OVERLOADABLE float4 read_imagef(read_only image2d_t cl_image, int2 coord);
+OVERLOADABLE void write_imagef(write_only image2d_t cl_image, int2 coord, float4 color);
+OVERLOADABLE void write_imagef(write_only image2d_t cl_image, float2 coord, float4 color);
+OVERLOADABLE int4 read_imagei(read_only image1d_array_t cl_image, const sampler_t sampler, int2 coord);
+OVERLOADABLE int4 read_imagei(read_only image1d_array_t cl_image, const sampler_t sampler, float2 coord);
+OVERLOADABLE int4 read_imagei(read_only image1d_array_t cl_image, int2 coord);
+OVERLOADABLE void write_imagei(write_only image1d_array_t cl_image, int2 coord, int4 color);
+OVERLOADABLE void write_imagei(write_only image1d_array_t cl_image, float2 coord, int4 color);
+OVERLOADABLE uint4 read_imageui(read_only image1d_array_t cl_image, const sampler_t sampler, int2 coord);
+OVERLOADABLE uint4 read_imageui(read_only image1d_array_t cl_image, const sampler_t sampler, float2 coord);
+OVERLOADABLE uint4 read_imageui(read_only image1d_array_t cl_image, int2 coord);
+OVERLOADABLE void write_imageui(write_only image1d_array_t cl_image, int2 coord, uint4 color);
+OVERLOADABLE void write_imageui(write_only image1d_array_t cl_image, float2 coord, uint4 color);
+OVERLOADABLE float4 read_imagef(read_only image1d_array_t cl_image, const sampler_t sampler, int2 coord);
+OVERLOADABLE float4 read_imagef(read_only image1d_array_t cl_image, const sampler_t sampler, float2 coord);
+OVERLOADABLE float4 read_imagef(read_only image1d_array_t cl_image, int2 coord);
+OVERLOADABLE void write_imagef(write_only image1d_array_t cl_image, int2 coord, float4 color);
+OVERLOADABLE void write_imagef(write_only image1d_array_t cl_image, float2 coord, float4 color);
+
+OVERLOADABLE int get_image_channel_data_type(read_only image2d_t image);
+OVERLOADABLE int get_image_channel_order(read_only image2d_t image);
+OVERLOADABLE int get_image_width(read_only image2d_t image);
+OVERLOADABLE int get_image_height(read_only image2d_t image);
+OVERLOADABLE int2 get_image_dim(read_only image2d_t image);
+
+OVERLOADABLE int get_image_channel_data_type(read_only image1d_array_t image);
+OVERLOADABLE int get_image_channel_order(read_only image1d_array_t image);
+OVERLOADABLE int get_image_width(read_only image1d_array_t image);
+OVERLOADABLE size_t get_image_array_size(read_only image1d_array_t image);
+
+OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, const sampler_t sampler, int4 coord);
+OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, const sampler_t sampler, float4 coord);
+OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, int4 coord);
+OVERLOADABLE void write_imagei(write_only image3d_t cl_image, int4 coord, int4 color);
+OVERLOADABLE void write_imagei(write_only image3d_t cl_image, float4 coord, int4 color);
+OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, const sampler_t sampler, int4 coord);
+OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, const sampler_t sampler, float4 coord);
+OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, int4 coord);
+OVERLOADABLE void write_imageui(write_only image3d_t cl_image, int4 coord, uint4 color);
+OVERLOADABLE void write_imageui(write_only image3d_t cl_image, float4 coord, uint4 color);
+OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, const sampler_t sampler, int4 coord);
+OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, const sampler_t sampler, float4 coord);
+OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, int4 coord);
+OVERLOADABLE void write_imagef(write_only image3d_t cl_image, int4 coord, float4 color);
+OVERLOADABLE void write_imagef(write_only image3d_t cl_image, float4 coord, float4 color);
+
+OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, const sampler_t sampler, int3 coord);
+OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, const sampler_t sampler, float3 coord);
+OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, int3 coord);
+OVERLOADABLE void write_imagei(write_only image3d_t cl_image, int3 coord, int4 color);
+OVERLOADABLE void write_imagei(write_only image3d_t cl_image, float3 coord, int4 color);
+OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, const sampler_t sampler, int3 coord);
+OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, const sampler_t sampler, float3 coord);
+OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, int3 coord);
+OVERLOADABLE void write_imageui(write_only image3d_t cl_image, int3 coord, uint4 color);
+OVERLOADABLE void write_imageui(write_only image3d_t cl_image, float3 coord, uint4 color);
+OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, const sampler_t sampler, int3 coord);
+OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, const sampler_t sampler, float3 coord);
+OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, int3 coord);
+OVERLOADABLE void write_imagef(write_only image3d_t cl_image, int3 coord, float4 color);
+OVERLOADABLE void write_imagef(write_only image3d_t cl_image, float3 coord, float4 color);
+OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, const sampler_t sampler, int4 coord);
+OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, const sampler_t sampler, float4 coord);
+OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, int4 coord);
+OVERLOADABLE void write_imagei(write_only image2d_array_t cl_image, int4 coord, int4 color);
+OVERLOADABLE void write_imagei(write_only image2d_array_t cl_image, float4 coord, int4 color);
+OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, const sampler_t sampler, int4 coord);
+OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, const sampler_t sampler, float4 coord);
+OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, int4 coord);
+OVERLOADABLE void write_imageui(write_only image2d_array_t cl_image, int4 coord, uint4 color);
+OVERLOADABLE void write_imageui(write_only image2d_array_t cl_image, float4 coord, uint4 color);
+OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, const sampler_t sampler, int4 coord);
+OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, const sampler_t sampler, float4 coord);
+OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, int4 coord);
+OVERLOADABLE void write_imagef(write_only image2d_array_t cl_image, int4 coord, float4 color);
+OVERLOADABLE void write_imagef(write_only image2d_array_t cl_image, float4 coord, float4 color);
+
+OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, const sampler_t sampler, int3 coord);
+OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, const sampler_t sampler, float3 coord);
+OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, int3 coord);
+OVERLOADABLE void write_imagei(write_only image2d_array_t cl_image, int3 coord, int4 color);
+OVERLOADABLE void write_imagei(write_only image2d_array_t cl_image, float3 coord, int4 color);
+OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, const sampler_t sampler, int3 coord);
+OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, const sampler_t sampler, float3 coord);
+OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, int3 coord);
+OVERLOADABLE void write_imageui(write_only image2d_array_t cl_image, int3 coord, uint4 color);
+OVERLOADABLE void write_imageui(write_only image2d_array_t cl_image, float3 coord, uint4 color);
+OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, const sampler_t sampler, int3 coord);
+OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, const sampler_t sampler, float3 coord);
+OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, int3 coord);
+OVERLOADABLE void write_imagef(write_only image2d_array_t cl_image, int3 coord, float4 color);
+OVERLOADABLE void write_imagef(write_only image2d_array_t cl_image, float3 coord, float4 color);
+
+OVERLOADABLE int get_image_channel_data_type(read_only image3d_t image);
+OVERLOADABLE int get_image_channel_order(read_only image3d_t image);
+OVERLOADABLE int get_image_width(read_only image3d_t image);
+OVERLOADABLE int get_image_height(read_only image3d_t image);
+OVERLOADABLE int get_image_depth(read_only image3d_t image);
+OVERLOADABLE int4 get_image_dim(read_only image3d_t image);
+
+OVERLOADABLE int get_image_channel_data_type(read_only image2d_array_t image);
+OVERLOADABLE int get_image_channel_order(read_only image2d_array_t image);
+OVERLOADABLE int get_image_width(read_only image2d_array_t image);
+OVERLOADABLE int get_image_height(read_only image2d_array_t image);
+OVERLOADABLE int2 get_image_dim(read_only image2d_array_t image);
+OVERLOADABLE size_t get_image_array_size(read_only image2d_array_t image);
+
+#if __clang_major__*10 + __clang_minor__ >= 39
+OVERLOADABLE int get_image_channel_data_type(write_only image1d_t image);
+OVERLOADABLE int get_image_channel_order(write_only image1d_t image);
+OVERLOADABLE int get_image_width(write_only image1d_t image);
+
+OVERLOADABLE int get_image_channel_data_type(write_only image1d_buffer_t image);
+OVERLOADABLE int get_image_channel_order(write_only image1d_buffer_t image);
+OVERLOADABLE int get_image_width(write_only image1d_buffer_t image);
+
+OVERLOADABLE int get_image_channel_data_type(write_only image2d_t image);
+OVERLOADABLE int get_image_channel_order(write_only image2d_t image);
+OVERLOADABLE int get_image_width(write_only image2d_t image);
+OVERLOADABLE int get_image_height(write_only image2d_t image);
+OVERLOADABLE int2 get_image_dim(write_only image2d_t image);
+
+OVERLOADABLE int get_image_channel_data_type(write_only image1d_array_t image);
+OVERLOADABLE int get_image_channel_order(write_only image1d_array_t image);
+OVERLOADABLE int get_image_width(write_only image1d_array_t image);
+OVERLOADABLE size_t get_image_array_size(write_only image1d_array_t image);
+
+OVERLOADABLE int get_image_channel_data_type(write_only image3d_t image);
+OVERLOADABLE int get_image_channel_order(write_only image3d_t image);
+OVERLOADABLE int get_image_width(write_only image3d_t image);
+OVERLOADABLE int get_image_height(write_only image3d_t image);
+OVERLOADABLE int get_image_depth(write_only image3d_t image);
+OVERLOADABLE int4 get_image_dim(write_only image3d_t image);
+
+OVERLOADABLE int get_image_channel_data_type(write_only image2d_array_t image);
+OVERLOADABLE int get_image_channel_order(write_only image2d_array_t image);
+OVERLOADABLE int get_image_width(write_only image2d_array_t image);
+OVERLOADABLE int get_image_height(write_only image2d_array_t image);
+OVERLOADABLE int2 get_image_dim(write_only image2d_array_t image);
+OVERLOADABLE size_t get_image_array_size(write_only image2d_array_t image);
+#endif
 
 #endif
diff --git a/backend/src/libocl/src/ocl_image.cl b/backend/src/libocl/src/ocl_image.cl
index eb1a2ff..a1125a8 100644
--- a/backend/src/libocl/src/ocl_image.cl
+++ b/backend/src/libocl/src/ocl_image.cl
@@ -29,21 +29,21 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 #define DECL_GEN_OCL_RW_IMAGE(image_type, n) \
-  OVERLOADABLE int4 __gen_ocl_read_imagei(image_type image, sampler_t sampler,            \
+  OVERLOADABLE int4 __gen_ocl_read_imagei(read_only image_type image, sampler_t sampler,            \
                                           float ##n coord, uint sampler_offset);          \
-  OVERLOADABLE int4 __gen_ocl_read_imagei(image_type image, sampler_t sampler,            \
+  OVERLOADABLE int4 __gen_ocl_read_imagei(read_only image_type image, sampler_t sampler,            \
                                           int ##n coord, uint sampler_offset);            \
-  OVERLOADABLE uint4 __gen_ocl_read_imageui(image_type image, sampler_t sampler,          \
+  OVERLOADABLE uint4 __gen_ocl_read_imageui(read_only image_type image, sampler_t sampler,          \
                                             float ##n coord, uint sampler_offset);        \
-  OVERLOADABLE uint4 __gen_ocl_read_imageui(image_type image, sampler_t sampler,          \
+  OVERLOADABLE uint4 __gen_ocl_read_imageui(read_only image_type image, sampler_t sampler,          \
                                             int ##n coord, uint sampler_offset);          \
-  OVERLOADABLE float4 __gen_ocl_read_imagef(image_type image, sampler_t sampler,          \
+  OVERLOADABLE float4 __gen_ocl_read_imagef(read_only image_type image, sampler_t sampler,          \
                                             float ##n coord, uint sampler_offset);        \
-  OVERLOADABLE float4 __gen_ocl_read_imagef(image_type image, sampler_t sampler,          \
+  OVERLOADABLE float4 __gen_ocl_read_imagef(read_only image_type image, sampler_t sampler,          \
                                             int ##n coord, uint sampler_offset);          \
-  OVERLOADABLE void __gen_ocl_write_imagei(image_type image, int ##n coord , int4 color); \
-  OVERLOADABLE void __gen_ocl_write_imageui(image_type image, int ##n coord, uint4 color);\
-  OVERLOADABLE void __gen_ocl_write_imagef(image_type image, int ##n coord, float4 color);
+  OVERLOADABLE void __gen_ocl_write_imagei(write_only image_type image, int ##n coord , int4 color); \
+  OVERLOADABLE void __gen_ocl_write_imageui(write_only image_type image, int ##n coord, uint4 color);\
+  OVERLOADABLE void __gen_ocl_write_imagef(write_only image_type image, int ##n coord, float4 color);
 
 #define DECL_GEN_OCL_QUERY_IMAGE(image_type) \
   OVERLOADABLE int __gen_ocl_get_image_width(image_type image);                           \
@@ -62,57 +62,104 @@ DECL_GEN_OCL_RW_IMAGE(image3d_t, 3)
 DECL_GEN_OCL_RW_IMAGE(image2d_array_t, 4)
 DECL_GEN_OCL_RW_IMAGE(image3d_t, 4)
 
-DECL_GEN_OCL_QUERY_IMAGE(image1d_t)
-DECL_GEN_OCL_QUERY_IMAGE(image1d_buffer_t)
-DECL_GEN_OCL_QUERY_IMAGE(image1d_array_t)
-DECL_GEN_OCL_QUERY_IMAGE(image2d_t)
-DECL_GEN_OCL_QUERY_IMAGE(image2d_array_t)
-DECL_GEN_OCL_QUERY_IMAGE(image3d_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_only image1d_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_only image1d_buffer_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_only image1d_array_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_only image2d_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_only image2d_array_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_only image3d_t)
+
+#if __clang_major__*10 + __clang_minor__ >= 39
+DECL_GEN_OCL_QUERY_IMAGE(write_only image1d_t)
+DECL_GEN_OCL_QUERY_IMAGE(write_only image1d_buffer_t)
+DECL_GEN_OCL_QUERY_IMAGE(write_only image1d_array_t)
+DECL_GEN_OCL_QUERY_IMAGE(write_only image2d_t)
+DECL_GEN_OCL_QUERY_IMAGE(write_only image2d_array_t)
+DECL_GEN_OCL_QUERY_IMAGE(write_only image3d_t)
+#endif
 ///////////////////////////////////////////////////////////////////////////////
 // helper functions to validate array index.
 ///////////////////////////////////////////////////////////////////////////////
-INLINE_OVERLOADABLE float2 __gen_validate_array_index(float2 coord, image1d_array_t image)
+INLINE_OVERLOADABLE float2 __gen_validate_array_index(float2 coord, read_only image1d_array_t image)
 {
   float array_size = __gen_ocl_get_image_depth(image);
   coord.s1 = clamp(rint(coord.s1), 0.f, array_size - 1.f);
   return coord;
 }
 
-INLINE_OVERLOADABLE float4 __gen_validate_array_index(float4 coord, image2d_array_t image)
+INLINE_OVERLOADABLE float4 __gen_validate_array_index(float4 coord, read_only image2d_array_t image)
 {
   float array_size = __gen_ocl_get_image_depth(image);
   coord.s2 = clamp(rint(coord.s2), 0.f, array_size - 1.f);
   return coord;
 }
 
-INLINE_OVERLOADABLE float3 __gen_validate_array_index(float3 coord, image2d_array_t image)
+INLINE_OVERLOADABLE float3 __gen_validate_array_index(float3 coord, read_only image2d_array_t image)
 {
   float array_size = __gen_ocl_get_image_depth(image);
   coord.s2 = clamp(rint(coord.s2), 0.f, array_size - 1.f);
   return coord;
 }
 
-INLINE_OVERLOADABLE int2 __gen_validate_array_index(int2 coord, image1d_array_t image)
+INLINE_OVERLOADABLE int2 __gen_validate_array_index(int2 coord, read_only image1d_array_t image)
 {
   int array_size = __gen_ocl_get_image_depth(image);
   coord.s1 = clamp(coord.s1, 0, array_size - 1);
   return coord;
 }
 
-INLINE_OVERLOADABLE int4 __gen_validate_array_index(int4 coord, image2d_array_t image)
+INLINE_OVERLOADABLE int4 __gen_validate_array_index(int4 coord, read_only image2d_array_t image)
 {
   int array_size = __gen_ocl_get_image_depth(image);
   coord.s2 = clamp(coord.s2, 0, array_size - 1);
   return coord;
 }
 
-INLINE_OVERLOADABLE int3 __gen_validate_array_index(int3 coord, image2d_array_t image)
+INLINE_OVERLOADABLE int3 __gen_validate_array_index(int3 coord, read_only image2d_array_t image)
 {
   int array_size = __gen_ocl_get_image_depth(image);
   coord.s2 = clamp(coord.s2, 0, array_size - 1);
   return coord;
 }
 
+#if __clang_major__*10 + __clang_minor__ >= 39
+INLINE_OVERLOADABLE float2 __gen_validate_array_index(float2 coord, write_only image1d_array_t image)
+{
+  float array_size = __gen_ocl_get_image_depth(image);
+  coord.s1 = clamp(rint(coord.s1), 0.f, array_size - 1.f);
+  return coord;
+}
+INLINE_OVERLOADABLE float4 __gen_validate_array_index(float4 coord, write_only image2d_array_t image)
+{
+  float array_size = __gen_ocl_get_image_depth(image);
+  coord.s2 = clamp(rint(coord.s2), 0.f, array_size - 1.f);
+  return coord;
+}
+INLINE_OVERLOADABLE float3 __gen_validate_array_index(float3 coord, write_only image2d_array_t image)
+{
+  float array_size = __gen_ocl_get_image_depth(image);
+  coord.s2 = clamp(rint(coord.s2), 0.f, array_size - 1.f);
+  return coord;
+}
+INLINE_OVERLOADABLE int2 __gen_validate_array_index(int2 coord, write_only image1d_array_t image)
+{
+  int array_size = __gen_ocl_get_image_depth(image);
+  coord.s1 = clamp(coord.s1, 0, array_size - 1);
+  return coord;
+}
+INLINE_OVERLOADABLE int4 __gen_validate_array_index(int4 coord, write_only image2d_array_t image)
+{
+  int array_size = __gen_ocl_get_image_depth(image);
+  coord.s2 = clamp(coord.s2, 0, array_size - 1);
+  return coord;
+}
+INLINE_OVERLOADABLE int3 __gen_validate_array_index(int3 coord, write_only image2d_array_t image)
+{
+  int array_size = __gen_ocl_get_image_depth(image);
+  coord.s2 = clamp(coord.s2, 0, array_size - 1);
+  return coord;
+}
+#endif
 // For non array image type, we need to do nothing.
 #define GEN_VALIDATE_ARRAY_INDEX(coord_type, image_type) \
 INLINE_OVERLOADABLE coord_type __gen_validate_array_index(coord_type coord, image_type image) \
@@ -120,17 +167,29 @@ INLINE_OVERLOADABLE coord_type __gen_validate_array_index(coord_type coord, imag
   return coord; \
 }
 
-GEN_VALIDATE_ARRAY_INDEX(float, image1d_t)
-GEN_VALIDATE_ARRAY_INDEX(int, image1d_t)
-GEN_VALIDATE_ARRAY_INDEX(float2, image2d_t)
-GEN_VALIDATE_ARRAY_INDEX(int2, image2d_t)
-GEN_VALIDATE_ARRAY_INDEX(float4, image3d_t)
-GEN_VALIDATE_ARRAY_INDEX(int4, image3d_t)
-GEN_VALIDATE_ARRAY_INDEX(float3, image3d_t)
-GEN_VALIDATE_ARRAY_INDEX(int3, image3d_t)
-GEN_VALIDATE_ARRAY_INDEX(float, image1d_buffer_t)
-GEN_VALIDATE_ARRAY_INDEX(int, image1d_buffer_t)
-
+GEN_VALIDATE_ARRAY_INDEX(float, read_only image1d_t)
+GEN_VALIDATE_ARRAY_INDEX(int, read_only image1d_t)
+GEN_VALIDATE_ARRAY_INDEX(float2, read_only image2d_t)
+GEN_VALIDATE_ARRAY_INDEX(int2, read_only image2d_t)
+GEN_VALIDATE_ARRAY_INDEX(float4, read_only image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(int4, read_only image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(float3, read_only image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(int3, read_only image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(float, read_only image1d_buffer_t)
+GEN_VALIDATE_ARRAY_INDEX(int, read_only image1d_buffer_t)
+
+#if __clang_major__*10 + __clang_minor__ >= 39
+GEN_VALIDATE_ARRAY_INDEX(float, write_only image1d_t)
+GEN_VALIDATE_ARRAY_INDEX(int, write_only image1d_t)
+GEN_VALIDATE_ARRAY_INDEX(float2, write_only image2d_t)
+GEN_VALIDATE_ARRAY_INDEX(int2, write_only image2d_t)
+GEN_VALIDATE_ARRAY_INDEX(float4, write_only image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(int4, write_only image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(float3, write_only image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(int3, write_only image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(float, write_only image1d_buffer_t)
+GEN_VALIDATE_ARRAY_INDEX(int, write_only image1d_buffer_t)
+#endif
 ///////////////////////////////////////////////////////////////////////////////
 // Helper functions to work around some coordiate boundary issues.
 // The major issue on Gen7/Gen7.5 are the sample message could not sampling
@@ -293,7 +352,7 @@ INLINE_OVERLOADABLE float3 __gen_fixup_neg_boundary(float3 coord)
 // For integer coordinates
 #define DECL_READ_IMAGE0(int_clamping_fix, image_type,                        \
                          image_data_type, suffix, coord_type, n)              \
-  OVERLOADABLE image_data_type read_image ##suffix(image_type cl_image,       \
+  OVERLOADABLE image_data_type read_image ##suffix(read_only image_type cl_image,       \
                                         const sampler_t sampler,              \
                                         coord_type coord)                     \
   {                                                                           \
@@ -308,7 +367,7 @@ INLINE_OVERLOADABLE float3 __gen_fixup_neg_boundary(float3 coord)
 // For float coordinates
 #define DECL_READ_IMAGE1(int_clamping_fix, image_type,                        \
                          image_data_type, suffix, coord_type, n)              \
-  OVERLOADABLE image_data_type read_image ##suffix(image_type cl_image,       \
+  OVERLOADABLE image_data_type read_image ##suffix(read_only image_type cl_image,       \
                                         const sampler_t sampler,              \
                                         coord_type coord)                     \
   {                                                                           \
@@ -333,7 +392,7 @@ INLINE_OVERLOADABLE float3 __gen_fixup_neg_boundary(float3 coord)
 
 #define DECL_READ_IMAGE_NOSAMPLER(image_type, image_data_type,                \
                                   suffix, coord_type, n)                      \
-  OVERLOADABLE image_data_type read_image ##suffix(image_type cl_image,       \
+  OVERLOADABLE image_data_type read_image ##suffix(read_only image_type cl_image,       \
                                                coord_type coord)              \
   {                                                                           \
     coord = __gen_validate_array_index(coord, cl_image);                      \
@@ -344,7 +403,7 @@ INLINE_OVERLOADABLE float3 __gen_fixup_neg_boundary(float3 coord)
   }
 
 #define DECL_WRITE_IMAGE(image_type, image_data_type, suffix, coord_type)     \
-  OVERLOADABLE void write_image ##suffix(image_type cl_image,                 \
+  OVERLOADABLE void write_image ##suffix(write_only image_type cl_image,                 \
                                          coord_type coord,                    \
                                          image_data_type color)               \
   {                                                                           \
@@ -375,7 +434,7 @@ DECL_IMAGE_TYPE(image2d_array_t, 3)
 
 #define DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(image_type, image_data_type,       \
                                   suffix, coord_type)                         \
-  OVERLOADABLE image_data_type read_image ##suffix(image_type cl_image,       \
+  OVERLOADABLE image_data_type read_image ##suffix(read_only image_type cl_image,       \
                                                coord_type coord)              \
   {                                                                           \
     sampler_t defaultSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE \
@@ -388,7 +447,7 @@ DECL_IMAGE_TYPE(image2d_array_t, 3)
   }
 
 #define DECL_WRITE_IMAGE1D_BUFFER(image_type, image_data_type, suffix, coord_type)     \
-  OVERLOADABLE void write_image ##suffix(image_type cl_image,                 \
+  OVERLOADABLE void write_image ##suffix(write_only image_type cl_image,                 \
                                          coord_type coord,                    \
                                          image_data_type color)               \
   {                                                                           \
@@ -493,69 +552,123 @@ DECL_IMAGE_1DArray(0, float4, f)
 #define DECL_IMAGE_INFO_COMMON(image_type)                                    \
   OVERLOADABLE  int get_image_channel_data_type(image_type image)             \
   {                                                                           \
-    return __gen_ocl_get_image_channel_data_type(image);                 \
+    return __gen_ocl_get_image_channel_data_type(image);                      \
   }                                                                           \
   OVERLOADABLE  int get_image_channel_order(image_type image)                 \
   {                                                                           \
-    return __gen_ocl_get_image_channel_order(image);                     \
+    return __gen_ocl_get_image_channel_order(image);                          \
   }                                                                           \
   OVERLOADABLE int get_image_width(image_type image)                          \
   {                                                                           \
-    return __gen_ocl_get_image_width(image);                             \
+    return __gen_ocl_get_image_width(image);                                  \
   }
 
-DECL_IMAGE_INFO_COMMON(image1d_t)
-DECL_IMAGE_INFO_COMMON(image1d_buffer_t)
-DECL_IMAGE_INFO_COMMON(image1d_array_t)
-DECL_IMAGE_INFO_COMMON(image2d_t)
-DECL_IMAGE_INFO_COMMON(image3d_t)
-DECL_IMAGE_INFO_COMMON(image2d_array_t)
+DECL_IMAGE_INFO_COMMON(read_only image1d_t)
+DECL_IMAGE_INFO_COMMON(read_only image1d_buffer_t)
+DECL_IMAGE_INFO_COMMON(read_only image1d_array_t)
+DECL_IMAGE_INFO_COMMON(read_only image2d_t)
+DECL_IMAGE_INFO_COMMON(read_only image3d_t)
+DECL_IMAGE_INFO_COMMON(read_only image2d_array_t)
+
+#if __clang_major__*10 + __clang_minor__ >= 39
+DECL_IMAGE_INFO_COMMON(write_only image1d_t)
+DECL_IMAGE_INFO_COMMON(write_only image1d_buffer_t)
+DECL_IMAGE_INFO_COMMON(write_only image1d_array_t)
+DECL_IMAGE_INFO_COMMON(write_only image2d_t)
+DECL_IMAGE_INFO_COMMON(write_only image3d_t)
+DECL_IMAGE_INFO_COMMON(write_only image2d_array_t)
+#endif
 
 // 2D extra Info
-OVERLOADABLE int get_image_height(image2d_t image)
+OVERLOADABLE int get_image_height(read_only image2d_t image)
 {
   return __gen_ocl_get_image_height(image);
 }
-OVERLOADABLE int2 get_image_dim(image2d_t image)
+OVERLOADABLE int2 get_image_dim(read_only image2d_t image)
 {
   return (int2){get_image_width(image), get_image_height(image)};
 }
+#if __clang_major__*10 + __clang_minor__ >= 39
+OVERLOADABLE int get_image_height(write_only image2d_t image)
+{
+  return __gen_ocl_get_image_height(image);
+}
+OVERLOADABLE int2 get_image_dim(write_only image2d_t image)
+{
+  return (int2){get_image_width(image), get_image_height(image)};
+}
+#endif
 // End of 2D
 
 // 3D extra Info
-OVERLOADABLE int get_image_height(image3d_t image)
+OVERLOADABLE int get_image_height(read_only image3d_t image)
 {
   return __gen_ocl_get_image_height(image);
 }
-OVERLOADABLE int get_image_depth(image3d_t image)
+OVERLOADABLE int get_image_depth(read_only image3d_t image)
 {
   return __gen_ocl_get_image_depth(image);
 }
-OVERLOADABLE int4 get_image_dim(image3d_t image)
+OVERLOADABLE int4 get_image_dim(read_only image3d_t image)
 {
   return (int4) (get_image_width(image),
                  get_image_height(image),
                  get_image_depth(image),
                  0);
 }
-
+#if __clang_major__*10 + __clang_minor__ >= 39
+OVERLOADABLE int get_image_height(write_only image3d_t image)
+{
+  return __gen_ocl_get_image_height(image);
+}
+OVERLOADABLE int get_image_depth(write_only image3d_t image)
+{
+  return __gen_ocl_get_image_depth(image);
+}
+OVERLOADABLE int4 get_image_dim(write_only image3d_t image)
+{
+  return (int4) (get_image_width(image),
+                 get_image_height(image),
+                 get_image_depth(image),
+                 0);
+}
+#endif
 // 2D Array extra Info
-OVERLOADABLE int get_image_height(image2d_array_t image)
+OVERLOADABLE int get_image_height(read_only image2d_array_t image)
 {
   return __gen_ocl_get_image_height(image);
 }
-OVERLOADABLE int2 get_image_dim(image2d_array_t image)
+OVERLOADABLE int2 get_image_dim(read_only image2d_array_t image)
 {
   return (int2){get_image_width(image), get_image_height(image)};
 }
-OVERLOADABLE size_t get_image_array_size(image2d_array_t image)
+OVERLOADABLE size_t get_image_array_size(read_only image2d_array_t image)
 {
   return __gen_ocl_get_image_depth(image);
 }
-
+#if __clang_major__*10 + __clang_minor__ >= 39
+OVERLOADABLE int get_image_height(write_only image2d_array_t image)
+{
+  return __gen_ocl_get_image_height(image);
+}
+OVERLOADABLE int2 get_image_dim(write_only image2d_array_t image)
+{
+  return (int2){get_image_width(image), get_image_height(image)};
+}
+OVERLOADABLE size_t get_image_array_size(write_only image2d_array_t image)
+{
+  return __gen_ocl_get_image_depth(image);
+}
+#endif
 // 1D Array info
-OVERLOADABLE size_t get_image_array_size(image1d_array_t image)
+OVERLOADABLE size_t get_image_array_size(read_only image1d_array_t image)
 {
   return __gen_ocl_get_image_depth(image);
 }
+#if __clang_major__*10 + __clang_minor__ >= 39
+OVERLOADABLE size_t get_image_array_size(write_only image1d_array_t image)
+{
+  return __gen_ocl_get_image_depth(image);
+}
+#endif
 // End of 1DArray
diff --git a/backend/src/libocl/tmpl/ocl_defines.tmpl.h b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
index 8fb5d2b..f5c65df 100644
--- a/backend/src/libocl/tmpl/ocl_defines.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
@@ -22,20 +22,13 @@
 #define __CL_VERSION_1_0__ 100
 #define __CL_VERSION_1_1__ 110
 #define __CL_VERSION_1_2__ 120
+#define CL_VERSION_1_0 100
+#define CL_VERSION_1_1 110
+#define CL_VERSION_1_2 120
 #define __ENDIAN_LITTLE__ 1
 #define __IMAGE_SUPPORT__ 1
 #define __kernel_exec(X, TYPE) __kernel __attribute__((work_group_size_hint(X,1,1))) \
                                         __attribute__((vec_type_hint(TYPE)))
 #define kernel_exec(X, TYPE) __kernel_exec(X, TYPE)
-#define cl_khr_global_int32_base_atomics
-#define cl_khr_global_int32_extended_atomics
-#define cl_khr_local_int32_base_atomics
-#define cl_khr_local_int32_extended_atomics
-#define cl_khr_byte_addressable_store
-#define cl_khr_icd
-#define cl_khr_gl_sharing
-#define cl_khr_spir
-#define cl_khr_fp16
-#define cl_khr_3d_image_writes
 
 #endif /* end of __OCL_COMMON_DEF_H__ */
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index 9c09b21..8e22015 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -30,8 +30,8 @@ uint get_sub_group_size(void)
 {
   uint threadn = get_num_sub_groups();
   uint threadid = get_sub_group_id();
-  if((threadid == (threadn - 1)) && (threadn > 1))
-    return (get_local_size(0)*get_local_size(1)*get_local_size(2)) % get_max_sub_group_size();
+  if (threadid == (threadn - 1))
+    return (get_local_size(0)*get_local_size(1)*get_local_size(2) -1) % get_max_sub_group_size() + 1;
   else
     return get_max_sub_group_size();
 }
diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 748a7fe..a3f9886 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -145,6 +145,7 @@ namespace gbe
       return NULL;
 
     std::vector<const char *> kernels;
+    std::vector<const char *> kerneltmp;
     std::vector<const char *> builtinFuncs;
     /* Add the memset and memcpy functions here. */
     builtinFuncs.push_back("__gen_memcpy_gg");
@@ -184,7 +185,12 @@ namespace gbe
     for (Module::iterator SF = mod->begin(), E = mod->end(); SF != E; ++SF) {
       if (SF->isDeclaration()) continue;
       if (!isKernelFunction(*SF)) continue;
-      kernels.push_back(SF->getName().data());
+      // mod will be deleted after link, copy the names.
+      const char *funcName = SF->getName().data();
+      char * tmp = new char[strlen(funcName)+1];
+      strcpy(tmp,funcName);
+      kernels.push_back(tmp);
+      kerneltmp.push_back(tmp);
 
       if (!materializedFuncCall(*mod, *clonedLib, *SF, materializedFuncs, Gvs)) {
         delete clonedLib;
@@ -273,7 +279,11 @@ namespace gbe
     /* We use beignet's bitcode as dst because it will have a lot of
        lazy functions which will not be loaded. */
     char* errorMsg;
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+    if(LLVMLinkModules2(wrap(clonedLib), wrap(mod))) {
+#else
     if(LLVMLinkModules(wrap(clonedLib), wrap(mod), LLVMLinkerDestroySource, &errorMsg)) {
+#endif
       delete clonedLib;
       printf("Fatal Error: link the bitcode error:\n%s\n", errorMsg);
       return NULL;
@@ -284,11 +294,25 @@ namespace gbe
     llvm::PassManager passes;
 #endif
 
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=9
+    auto PreserveKernel = [=](const GlobalValue &GV) {
+      for(size_t i = 0;i < kernels.size(); ++i)
+        if(strcmp(GV.getName().data(), kernels[i]))
+          return true;
+      return false;
+    };
+
+    passes.add(createInternalizePass(PreserveKernel));
+#else
     passes.add(createInternalizePass(kernels));
+#endif
     passes.add(createGlobalDCEPass());
 
     passes.run(*clonedLib);
 
+    for(size_t i = 0;i < kerneltmp.size(); i++)
+      delete[] kerneltmp[i];
+
     return clonedLib;
   }
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 5135950..0570766 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1230,6 +1230,10 @@ namespace gbe
     }
     MDNode *typeNameNode = NULL;
     MDNode *typeBaseNameNode = NULL;
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+    typeNameNode = F.getMetadata("kernel_arg_type");
+    typeBaseNameNode = F.getMetadata("kernel_arg_base_type");
+#else
     MDNode *node = getKernelFunctionMetadata(&F);
     for(uint j = 0;node && j < node->getNumOperands() - 1; j++) {
       MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
@@ -1243,15 +1247,21 @@ namespace gbe
         typeBaseNameNode = attrNode;
       }
     }
+#endif
 
     unsigned argID = 0;
     ir::FunctionArgument::InfoFromLLVM llvmInfo;
     for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I, argID++) {
+      unsigned opID = argID;
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9
+      opID += 1;
+#endif
+
       if(typeNameNode) {
-        llvmInfo.typeName= (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
+        llvmInfo.typeName= (cast<MDString>(typeNameNode->getOperand(opID)))->getString();
       }
       if(typeBaseNameNode) {
-        llvmInfo.typeBaseName= (cast<MDString>(typeBaseNameNode->getOperand(1 + argID)))->getString();
+        llvmInfo.typeBaseName= (cast<MDString>(typeBaseNameNode->getOperand(opID)))->getString();
       }
       bool isImage = llvmInfo.isImageType();
       if (I->getType()->isPointerTy() || isImage) {
@@ -1974,6 +1984,92 @@ namespace gbe
 
     std::string functionAttributes;
 
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+    /* LLVM 3.9 change kernel arg info as function metadata */
+    addrSpaceNode = F.getMetadata("kernel_arg_addr_space");
+    accessQualNode = F.getMetadata("kernel_arg_access_qual");
+    typeNameNode = F.getMetadata("kernel_arg_type");
+    typeBaseNameNode = F.getMetadata("kernel_arg_base_type");
+    typeQualNode = F.getMetadata("kernel_arg_type_qual");
+    argNameNode = F.getMetadata("kernel_arg_name");
+    MDNode *attrNode;
+    if ((attrNode = F.getMetadata("vec_type_hint"))) {
+      GBE_ASSERT(attrNode->getNumOperands() == 2);
+      functionAttributes += "vec_type_hint";
+      auto *Op1 = cast<ValueAsMetadata>(attrNode->getOperand(0));
+      Value *V = Op1 ? Op1->getValue() : NULL;
+      ConstantInt *sign =
+          mdconst::extract<ConstantInt>(attrNode->getOperand(1));
+      size_t signValue = sign->getZExtValue();
+      Type *vtype = V->getType();
+      Type *stype = vtype;
+      uint32_t elemNum = 0;
+      if (vtype->isVectorTy()) {
+        VectorType *vectorType = cast<VectorType>(vtype);
+        stype = vectorType->getElementType();
+        elemNum = vectorType->getNumElements();
+      }
+
+      std::string typeName = getTypeName(ctx, stype, signValue);
+
+      std::stringstream param;
+      char buffer[100] = {0};
+      param << "(";
+      param << typeName;
+      if (vtype->isVectorTy())
+        param << elemNum;
+      param << ")";
+      param >> buffer;
+      functionAttributes += buffer;
+      functionAttributes += " ";
+    }
+    if ((attrNode = F.getMetadata("reqd_work_group_size"))) {
+      GBE_ASSERT(attrNode->getNumOperands() == 3);
+      ConstantInt *x = mdconst::extract<ConstantInt>(attrNode->getOperand(0));
+      ConstantInt *y = mdconst::extract<ConstantInt>(attrNode->getOperand(1));
+      ConstantInt *z = mdconst::extract<ConstantInt>(attrNode->getOperand(2));
+      GBE_ASSERT(x && y && z);
+      reqd_wg_sz[0] = x->getZExtValue();
+      reqd_wg_sz[1] = y->getZExtValue();
+      reqd_wg_sz[2] = z->getZExtValue();
+      functionAttributes += "reqd_work_group_size";
+      std::stringstream param;
+      char buffer[100] = {0};
+      param << "(";
+      param << reqd_wg_sz[0];
+      param << ",";
+      param << reqd_wg_sz[1];
+      param << ",";
+      param << reqd_wg_sz[2];
+      param << ")";
+      param >> buffer;
+      functionAttributes += buffer;
+      functionAttributes += " ";
+    }
+    if ((attrNode = F.getMetadata("work_group_size_hint"))) {
+      GBE_ASSERT(attrNode->getNumOperands() == 3);
+      ConstantInt *x = mdconst::extract<ConstantInt>(attrNode->getOperand(0));
+      ConstantInt *y = mdconst::extract<ConstantInt>(attrNode->getOperand(1));
+      ConstantInt *z = mdconst::extract<ConstantInt>(attrNode->getOperand(2));
+      GBE_ASSERT(x && y && z);
+      hint_wg_sz[0] = x->getZExtValue();
+      hint_wg_sz[1] = y->getZExtValue();
+      hint_wg_sz[2] = z->getZExtValue();
+      functionAttributes += "work_group_size_hint";
+      std::stringstream param;
+      char buffer[100] = {0};
+      param << "(";
+      param << hint_wg_sz[0];
+      param << ",";
+      param << hint_wg_sz[1];
+      param << ",";
+      param << hint_wg_sz[2];
+      param << ")";
+      param >> buffer;
+      functionAttributes += buffer;
+      functionAttributes += " ";
+    }
+#else
     /* First find the meta data belong to this function. */
     MDNode *node = getKernelFunctionMetadata(&F);
 
@@ -2095,6 +2191,7 @@ namespace gbe
         functionAttributes += " ";
       }
     }
+#endif /* LLVM 3.9 Function metadata */
 
     ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
 
@@ -2110,29 +2207,33 @@ namespace gbe
       const AttrListPtr &PAL = F.getAttributes();
 #endif /* LLVM_VERSION_MINOR <= 1 */
       for (; I != E; ++I, ++argID) {
+        uint32_t opID = argID;
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9
+        opID += 1;
+#endif
         const std::string &argName = I->getName().str();
         Type *type = I->getType();
         if(addrSpaceNode) {
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
-          llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
+          llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode->getOperand(opID)))->getZExtValue();
 #else
-          llvmInfo.addrSpace = (mdconst::extract<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
+          llvmInfo.addrSpace = (mdconst::extract<ConstantInt>(addrSpaceNode->getOperand(opID)))->getZExtValue();
 #endif
         }
         if(typeNameNode) {
-          llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
+          llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(opID)))->getString();
         }
         if(typeBaseNameNode){
-          llvmInfo.typeBaseName = (cast<MDString>(typeBaseNameNode->getOperand(1 + argID)))->getString();
+          llvmInfo.typeBaseName = (cast<MDString>(typeBaseNameNode->getOperand(opID)))->getString();
         }
         if(accessQualNode) {
-          llvmInfo.accessQual = (cast<MDString>(accessQualNode->getOperand(1 + argID)))->getString();
+          llvmInfo.accessQual = (cast<MDString>(accessQualNode->getOperand(opID)))->getString();
         }
         if(typeQualNode) {
-          llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(1 + argID)))->getString();
+          llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(opID)))->getString();
         }
         if(argNameNode){
-          llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(1 + argID)))->getString();
+          llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(opID)))->getString();
         }
 
         // function arguments are uniform values.
@@ -4195,13 +4296,14 @@ namespace gbe
               ir::Register tmp1 = ctx.reg(getFamily(tmpType));
               ir::Register tmp2 = ctx.reg(getFamily(tmpType));
               ctx.CVT(tmpType, srcType, tmp0, src);
-              ctx.ALU1(ir::OP_LZD, tmpType, tmp1, tmp0);
+              ctx.ALU1(ir::OP_LZD, ir::TYPE_U32, tmp1, tmp0);
               ctx.SUB(tmpType, tmp2, tmp1, immReg);
               ctx.CVT(dstType, tmpType, dst, tmp2);
             }
             else
             {
-              ctx.ALU1(ir::OP_LZD, dstType, dst, src);
+              GBE_ASSERT(srcType == ir::TYPE_U32);
+              ctx.ALU1(ir::OP_LZD, srcType, dst, src);
             }
           }
           break;
@@ -4258,8 +4360,8 @@ namespace gbe
 #endif /* GBE_DEBUG */
 
         switch (genIntrinsicID) {
-          case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
-          case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
+          case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH, ir::TYPE_U32); break;
+          case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL, ir::TYPE_U32); break;
           case GEN_OCL_CBIT: this->emitUnaryCallInst(I,CS,ir::OP_CBIT, getUnsignedType(ctx, (*AI)->getType())); break;
           case GEN_OCL_ABS:
           {
diff --git a/backend/src/llvm/llvm_includes.hpp b/backend/src/llvm/llvm_includes.hpp
index d2deb90..0b80979 100644
--- a/backend/src/llvm/llvm_includes.hpp
+++ b/backend/src/llvm/llvm_includes.hpp
@@ -127,4 +127,9 @@
 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
 #endif
 
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#endif
+
 #endif /* __GBE_IR_LLVM_INCLUDES_HPP__ */
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index b925e5f..02dd4bf 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -41,9 +41,12 @@ using namespace llvm;
 namespace gbe
 {
   bool isKernelFunction(const llvm::Function &F) {
+    bool bKernel = false;
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+    bKernel = F.getMetadata("kernel_arg_name") != NULL;
+#else
     const Module *module = F.getParent();
     const Module::NamedMDListType& globalMD = module->getNamedMDList();
-    bool bKernel = false;
     for(auto i = globalMD.begin(); i != globalMD.end(); i++) {
       const NamedMDNode &md = *i;
       if(strcmp(md.getName().data(), "opencl.kernels") != 0) continue;
@@ -58,6 +61,7 @@ namespace gbe
         if(op == &F) bKernel = true;
       }
     }
+#endif
     return bKernel;
   }
 
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 02a69ec..e108810 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -46,6 +46,13 @@ namespace gbe
   BVAR(OCL_OUTPUT_CFG_GEN_IR, false);
   using namespace llvm;
 
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+  llvm::LLVMContext& GBEGetLLVMContext() {
+    static llvm::LLVMContext GBEContext;
+    return GBEContext;
+  }
+#endif
+
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
   #define TARGETLIBRARY  TargetLibraryInfoImpl
 #else
@@ -142,7 +149,9 @@ namespace gbe
     MPM.add(createBarrierNodupPass(false));   // remove noduplicate fnAttr before inlining.
     MPM.add(createFunctionInliningPass(20000));
     MPM.add(createBarrierNodupPass(true));    // restore noduplicate fnAttr after inlining.
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+    MPM.add(createPostOrderFunctionAttrsLegacyPass());
+#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
     MPM.add(createPostOrderFunctionAttrsPass());       // Set readonly/readnone attrs
 #else
     MPM.add(createFunctionAttrsPass());       // Set readonly/readnone attrs
@@ -294,7 +303,11 @@ namespace gbe
     if (module) {
       cl_mod = reinterpret_cast<Module*>(const_cast<void*>(module));
     } else if (fileName){
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+      llvm::LLVMContext& c = GBEGetLLVMContext();
+#else
       llvm::LLVMContext& c = llvm::getGlobalContext();
+#endif
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
       cl_mod = parseIRFile(fileName, Err, c).release();
 #else
@@ -349,7 +362,11 @@ namespace gbe
     passes.add(createIntrinsicLoweringPass());
     passes.add(createStripAttributesPass());     // Strip unsupported attributes and calling conventions.
     passes.add(createFunctionInliningPass(20000));
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+    passes.add(createSROAPass());
+#else
     passes.add(createScalarReplAggregatesPass(64, true, -1, -1, 64));
+#endif
     passes.add(createLoadStoreOptimizationPass());
     passes.add(createConstantPropagationPass());
     passes.add(createPromoteMemoryToRegisterPass());
diff --git a/backend/src/llvm/llvm_to_gen.hpp b/backend/src/llvm/llvm_to_gen.hpp
index e0a6145..d3928c6 100644
--- a/backend/src/llvm/llvm_to_gen.hpp
+++ b/backend/src/llvm/llvm_to_gen.hpp
@@ -23,6 +23,9 @@
  */
 #ifndef __GBE_IR_LLVM_TO_GEN_HPP__
 #define __GBE_IR_LLVM_TO_GEN_HPP__
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#include "llvm/IR/LLVMContext.h"
+#endif
 
 namespace gbe {
   namespace ir {
@@ -34,6 +37,9 @@ namespace gbe {
 		  optLevel 0 equal to clang -O1 and 1 equal to clang -O2*/
   bool llvmToGen(ir::Unit &unit, const char *fileName, const void* module,
                  int optLevel, bool strictMath, int profiling, std::string &errors);
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+  extern llvm::LLVMContext& GBEGetLLVMContext();
+#endif
 
 } /* namespace gbe */
 
diff --git a/backend/src/llvm/llvm_unroll.cpp b/backend/src/llvm/llvm_unroll.cpp
index a289c11..8a492d6 100644
--- a/backend/src/llvm/llvm_unroll.cpp
+++ b/backend/src/llvm/llvm_unroll.cpp
@@ -65,8 +65,8 @@ namespace gbe {
       // Returns the value associated with the given metadata node name (for
       // example, "llvm.loop.unroll.count").  If no such named metadata node
       // exists, then nullptr is returned.
-      static const ConstantInt *GetUnrollMetadataValue(const Loop *L,
-                                                     StringRef Name) {
+      static const MDNode *GetUnrollMetadataValue(const Loop *L,
+                                                          StringRef Name) {
         MDNode *LoopID = L->getLoopID();
         if (!LoopID) return nullptr;
         // First operand should refer to the loop id itself.
@@ -78,16 +78,28 @@ namespace gbe {
           const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
           if (!S) continue;
           if (Name.equals(S->getString())) {
-            assert(MD->getNumOperands() == 2 &&
-                   "Unroll hint metadata should have two operands.");
+            return MD;
+          }
+        }
+        return nullptr;
+      }
+
+      static unsigned GetUnrollCount(const Loop *L,
+                                            StringRef Name) {
+        const MDNode *MD = GetUnrollMetadataValue(L, "llvm.loop.unroll.count");
+        if (MD) {
+          assert(MD->getNumOperands() == 2 &&
+                 "Unroll count hint metadata should have two operands.");
+          unsigned Count;
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
-            return mdconst::extract<ConstantInt>(MD->getOperand(1));
+          Count = mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
 #else
-            return cast<ConstantInt>(MD->getOperand(1));
+          Count = cast<ConstantInt>(MD->getOperand(1))->getZExtValue();
 #endif
-          }
+          assert(Count >= 1 && "Unroll count must be positive.");
+          return Count;
         }
-        return nullptr;
+        return 0;
       }
 
       void setUnrollID(Loop *L, bool enable) {
@@ -212,11 +224,11 @@ namespace gbe {
       // some private load or store, we change it's loop meta data
       // to indicate more aggresive unrolling on it.
       virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
-        const ConstantInt *Enable = GetUnrollMetadataValue(L, "llvm.loop.unroll.enable");
+        const MDNode *Enable = GetUnrollMetadataValue(L, "llvm.loop.unroll.enable");
         if (Enable)
           return false;
-        const ConstantInt *Count = GetUnrollMetadataValue(L, "llvm.loop.unroll.count");
-        if (Count)
+        const unsigned Count = GetUnrollCount(L, "llvm.loop.unroll.count");
+        if (Count > 0)
           return false;
 
         if (!handleParentLoops(L, LPM))
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index 407886a..64d33dc 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -16,7 +16,7 @@ News
 Prerequisite
 ------------
 
-The project depends on the following external libaries:
+The project depends on the following external libraries:
 
 - libdrm libraries (libdrm and libdrm\_intel)
 - Various LLVM components
@@ -33,7 +33,7 @@ you can still link to the beignet OpenCL library. You can find the beignet/libcl
 in your system's library installation directories.
 
 Note that the compiler depends on LLVM (Low-Level Virtual Machine project), and the
-project normally support 3 latest LLVM released version.
+project normally supports the 3 latest LLVM released versions.
 Right now, the code has been compiled with LLVM 3.6, 3.7 and 3.8. With older
 version LLVM from 3.3, build still support, but no full tests cover.
 
@@ -48,11 +48,11 @@ A simple command to install all the above dependencies for ubuntu or debian is:
 
 **The recommended LLVM/CLANG version is 3.6 and/or 3.7**
 
-Based on our test result, LLVM 3.6 and 3.7 has best pass rate on all the test suites. Compare
+Based on our test result, LLVM 3.6 and 3.7 has the best pass rate on all the test suites. Compared
 to LLVM 3.6 and 3.7, if you used LLVM 3.8, you should pay attention to float immediate. For example,
-if you use 1.0 in the kernel, LLVM 3.6 will treate it as 1.0f, a single float, because the project
-don't support double float. but LLVM 3.8 will treate it as 1.0, a double foat, at the last it may cause
-error. So we recommend use 1.0f instead of 1.0 if you don't need double float.
+if you use 1.0 in the kernel, LLVM 3.6 will treat it as 1.0f, a single float, because the project
+doesn't support double float. but LLVM 3.8 will treat it as 1.0, a double float, at the last it may cause
+error. So we recommend using 1.0f instead of 1.0 if you don't need double float.
 
 For LLVM 3.4 and 3.5, Beignet still support them, but it may be limited to support the
 build and major functions.
@@ -91,7 +91,7 @@ The cmake will build the backend firstly. Please refer to:
 Once built, the run-time produces a shared object libcl.so which basically
 directly implements the OpenCL API.
 
-`> make`
+`> make utest`
 
 A set of tests are also produced. They may be found in `utests/`.
 
@@ -112,12 +112,12 @@ It installs the OCL icd vendor files to /etc/OpenCL/vendors, if the system suppo
 
 `> make package`
 
-It packages the driver binaries, you may copy&install the package to another machine with simillar system.
+It packages the driver binaries, you may copy&install the package to another machine with similar system.
 
 How to run
 ----------
 
-After build and install of beignet, you may need to check whether it works on your
+After building and installing Beignet, you may need to check whether it works on your
 platform. Beignet also produces various tests to ensure the compiler and the run-time
 consistency. This small test framework uses a simple c++ registration system to
 register all the unit tests.
@@ -173,7 +173,7 @@ Known Issues
 
   `# echo -n 0 > /sys/module/i915/parameters/enable_hangcheck`
 
-  But this command is a little bit dangerous, as if your kernel really hang, then the gpu will lock up
+  But this command is a little bit dangerous, as if your kernel really hangs, then the GPU will lock up
   forever until a reboot.
 
 * "Beignet: self-test failed" and almost all unit tests fail.
@@ -207,7 +207,7 @@ Known Issues
 
   `# export OCL_STRICT_CONFORMANCE=0`.
 
-  This would lost some precision but gain performance.
+  This loses some precision but gains performance.
 
 * cl\_khr\_gl\_sharing.
   This extension highly depends on mesa support. It seems that mesa would not provide
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
index d1e94ef..2ef0a89 100644
--- a/docs/NEWS.mdwn
+++ b/docs/NEWS.mdwn
@@ -1,5 +1,8 @@
 # News
 
+## Nov 4, 2016
+[Beignet 1.2.1](https://01.org/beignet/downloads/beignet-1.2.1-2016-11-04) is released. This is a bug-fix release.
+
 ## Aug 30, 2016
 [Beignet 1.2.0](https://01.org/beignet/downloads/beignet-1.2.0-2016-08-30) is released. This is a major release. Please see the release notes for more information.
 
diff --git a/docs/howto/cross-compiler-howto.mdwn b/docs/howto/cross-compiler-howto.mdwn
index d541816..a8a696d 100644
--- a/docs/howto/cross-compiler-howto.mdwn
+++ b/docs/howto/cross-compiler-howto.mdwn
@@ -2,7 +2,7 @@ Cross Compiler HowTo
 ====================
 
 Beignet supports both PC devices with full profile and embedded/handheld
-devices with embeded profile. This document describes how to build Beignet
+devices with embedded profile. This document describes how to build Beignet
 and OpenCL kernels for a target machine (embedded/handheld devices) in a
 host machine with the help of cross compiler, and also the large-size-reduced
 Beignet driver package for the target machine.
@@ -65,7 +65,7 @@ provide only the OpenCL runtime library without OpenCL compiler, and only the
 executable binary kernel is supported on such devices.
 
 It means that just distribute libcl.so and libgbeinterp.so (~320k in total after strip)
-are enough for OpenCL embeded profile in the target machine. The whole Beignet
+are enough for OpenCL embedded profile in the target machine. The whole Beignet
 driver set can be separated into several packages for different usage.
 
 
diff --git a/docs/howto/stand-alone-utest-howto.mdwn b/docs/howto/stand-alone-utest-howto.mdwn
index bca23d3..ddd8c5e 100644
--- a/docs/howto/stand-alone-utest-howto.mdwn
+++ b/docs/howto/stand-alone-utest-howto.mdwn
@@ -1,14 +1,14 @@
 Stand Alone Unit Test HowTo
 ====================
 
-Beignet provides an independent unit test suite covered most OpenCL language feautures,
+Beignet provides an independent unit test suite covering most OpenCL language features,
 including more than 800 cases which could run in a few minutes, it should be useful for
 testing and comparing different OpenCL implementations.
 
 Prerequisite
 ------------
 
-OpenCL ICD. Please check your OpenCL ICD existance by command
+OpenCL ICD. Please check your OpenCL ICD existence by command
 `pkg-config --libs OpenCL`.
 
 Build Stand Alone Unit Test
@@ -27,7 +27,7 @@ Basically, from the root directory of the project
 
 `> make`
 
-Once built, the 'utest_run' is generated in currenty directory.
+Once built, the 'utest_run' is generated in current directory.
 
 How to run
 ----------
@@ -42,4 +42,4 @@ Then in `utests/`:
 `> ./utest_run`
 
 if the utest_run fail to run, please check /etc/vendors/OpenCL to confirm it calls the expected
-OpenCL driver, or export LD_LIBRARAY_PATH to establish the correct link.
+OpenCL driver, or export LD_LIBRARY_PATH to establish the correct link.
diff --git a/kernels/compiler_subgroup_broadcast.cl b/kernels/compiler_subgroup_broadcast.cl
index 4f21cf5..96d38d9 100644
--- a/kernels/compiler_subgroup_broadcast.cl
+++ b/kernels/compiler_subgroup_broadcast.cl
@@ -9,7 +9,7 @@ kernel void compiler_subgroup_broadcast_imm_int(global int *src,
   uint index = get_global_id(0);
 
   int val = src[index];
-  int broadcast_val = sub_group_broadcast(val, 10);
+  int broadcast_val = sub_group_broadcast(val, 2);
   dst[index] = broadcast_val;
 }
 kernel void compiler_subgroup_broadcast_int(global int *src,
diff --git a/kernels/test_get_arg_info.cl b/kernels/test_get_arg_info.cl
index ae08887..b9201a4 100644
--- a/kernels/test_get_arg_info.cl
+++ b/kernels/test_get_arg_info.cl
@@ -3,6 +3,6 @@ typedef struct _test_arg_struct {
     int b;
 }test_arg_struct;
 
-kernel void test_get_arg_info(read_only global float const volatile *src, read_write local int *dst, test_arg_struct extra) {
+kernel void test_get_arg_info(global float const volatile *src, local int *dst, test_arg_struct extra) {
 
 }
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a002865..82be7ff 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -103,7 +103,7 @@ endif (X11_FOUND)
 
 if (CMRT_FOUND)
   set(CMAKE_CXX_FLAGS "-DHAS_CMRT ${CMAKE_CXX_FLAGS}")
-  set(CMAKE_CXX_FLAGS "-DCMRT_PATH=${CMRT_LIBRARY_DIRS}/libcmrt.so ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CXX_FLAGS "-DCMRT_PATH=${CMRT_LIBDIR}/libcmrt.so.1 ${CMAKE_CXX_FLAGS}")
   set(CMAKE_C_FLAGS "-DHAS_CMRT ${CMAKE_C_FLAGS}")
   set(OPENCL_SRC ${OPENCL_SRC} cl_cmrt.cpp)
 endif (CMRT_FOUND)
diff --git a/src/cl_api.c b/src/cl_api.c
index d0d4dc5..a7c78f0 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -3032,8 +3032,17 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
         }
         realGroupSize *= fixed_local_sz[i];
       }
-      if (realGroupSize % 8 != 0)
+
+      //in a loop of conformance test (such as test_api repeated_setup_cleanup), in each loop:
+      //create a new context, a new command queue, and uses 'globalsize[0]=1000, localsize=NULL' to enqueu kernel
+      //it triggers the following message for many times.
+      //to avoid too many messages, only print it for the first time of the process.
+      //just use static variable since it doesn't matter to print a few times at multi-thread case.
+      static int warn_no_good_localsize = 1;
+      if (realGroupSize % 8 != 0 && warn_no_good_localsize) {
+        warn_no_good_localsize = 0;
         DEBUGP(DL_WARNING, "unable to find good values for local_work_size[i], please provide local_work_size[] explicitly, you can find good values with trial-and-error method.");
+      }
     }
   }
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 6a9cf1f..a7b967d 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -272,12 +272,13 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   assert(offset >= 0);
   stack_sz *= interp_kernel_get_simd_width(ker->opaque);
   stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
-  /* Because HSW calc stack offset per thread is relative with half slice, when
-     thread schedule in half slice is not balance, would out of bound. Because
-     the max half slice is 4 in GT4, multiply stack size with 4 for safe.
+
+  /* for some hardware, part of EUs are disabled with EU id reserved,
+   * it makes the active EU id larger than count of EUs within a subslice,
+   * need to enlarge stack size for such case to avoid out of range.
    */
-  if(cl_driver_get_ver(ctx->drv) == 75)
-    stack_sz *= 4;
+  cl_driver_enlarge_stack_size(ctx->drv, &stack_sz);
+
   cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);
 }
 
@@ -350,7 +351,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   void* printf_info = NULL;
 
   /* Setup kernel */
-  kernel.name = "KERNEL";
+  kernel.name = interp_kernel_get_name(ker->opaque);
   kernel.grf_blocks = 128;
   kernel.bo = ker->bo;
   kernel.barrierID = 0;
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index f680219..4ee4ca3 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -297,10 +297,16 @@
 #define IS_SKYLAKE(devid) (IS_SKL_GT1(devid) || IS_SKL_GT2(devid) || IS_SKL_GT3(devid) || IS_SKL_GT4(devid))
 
 /* BXT */
-#define PCI_CHIP_BROXTON_P	0x5A84   /* Intel(R) BXT-P for mobile desktop */
+#define PCI_CHIP_BROXTON_0	0x5A84
+#define PCI_CHIP_BROXTON_1	0x5A85
+#define PCI_CHIP_BROXTON_2	0x1A84
+#define PCI_CHIP_BROXTON_3	0x1A85
 
 #define IS_BROXTON(devid)               \
-  (devid == PCI_CHIP_BROXTON_P)
+  (devid == PCI_CHIP_BROXTON_0 ||       \
+   devid == PCI_CHIP_BROXTON_1 ||       \
+   devid == PCI_CHIP_BROXTON_2 ||       \
+   devid == PCI_CHIP_BROXTON_3)
 
 #define PCI_CHIP_KABYLAKE_ULT_GT1     0x5906
 #define PCI_CHIP_KABYLAKE_ULT_GT2     0x5916
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index d29138d..ded2f1e 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -199,7 +199,7 @@ static struct _cl_device_id intel_skl_gt4_device = {
 #include "cl_gen9_device.h"
 };
 
-static struct _cl_device_id intel_bxt_device = {
+static struct _cl_device_id intel_bxt18eu_device = {
   INIT_ICD(dispatch)
   .max_compute_unit = 18,
   .max_thread_per_unit = 6,
@@ -210,6 +210,16 @@ static struct _cl_device_id intel_bxt_device = {
 #include "cl_gen9_device.h"
 };
 
+static struct _cl_device_id intel_bxt12eu_device = {
+  .max_compute_unit = 12,
+  .max_thread_per_unit = 6,
+  .sub_slice_count = 2,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen9_device.h"
+};
+
 static struct _cl_device_id intel_kbl_gt1_device = {
   INIT_ICD(dispatch)
   .max_compute_unit = 12,
@@ -434,6 +444,7 @@ ivb_gt1_break:
       intel_ivb_gt1_device.platform = cl_get_platform_default();
       ret = &intel_ivb_gt1_device;
       cl_intel_platform_get_default_extension(ret);
+      cl_intel_platform_enable_extension(ret, cl_intel_motion_estimation_ext_id);
       break;
 
     case PCI_CHIP_IVYBRIDGE_GT2:
@@ -447,6 +458,7 @@ ivb_gt2_break:
       intel_ivb_gt2_device.platform = cl_get_platform_default();
       ret = &intel_ivb_gt2_device;
       cl_intel_platform_get_default_extension(ret);
+      cl_intel_platform_enable_extension(ret, cl_intel_motion_estimation_ext_id);
       break;
 
     case PCI_CHIP_BAYTRAIL_T:
@@ -456,6 +468,7 @@ baytrail_t_device_break:
       intel_baytrail_t_device.platform = cl_get_platform_default();
       ret = &intel_baytrail_t_device;
       cl_intel_platform_get_default_extension(ret);
+      cl_intel_platform_enable_extension(ret, cl_intel_motion_estimation_ext_id);
       break;
 
     case PCI_CHIP_BROADWLL_M_GT1:
@@ -625,12 +638,26 @@ skl_gt4_break:
       cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
       break;
 
-    case PCI_CHIP_BROXTON_P:
-      DECL_INFO_STRING(bxt_break, intel_bxt_device, name, "Intel(R) HD Graphics Broxton-P");
-bxt_break:
-      intel_bxt_device.device_id = device_id;
-      intel_bxt_device.platform = cl_get_platform_default();
-      ret = &intel_bxt_device;
+    case PCI_CHIP_BROXTON_0:
+      DECL_INFO_STRING(bxt18eu_break, intel_bxt18eu_device, name, "Intel(R) HD Graphics Broxton 0");
+    case PCI_CHIP_BROXTON_2:
+      DECL_INFO_STRING(bxt18eu_break, intel_bxt18eu_device, name, "Intel(R) HD Graphics Broxton 2");
+bxt18eu_break:
+      intel_bxt18eu_device.device_id = device_id;
+      intel_bxt18eu_device.platform = cl_get_platform_default();
+      ret = &intel_bxt18eu_device;
+      cl_intel_platform_get_default_extension(ret);
+      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+      break;
+
+    case PCI_CHIP_BROXTON_1:
+      DECL_INFO_STRING(bxt12eu_break, intel_bxt12eu_device, name, "Intel(R) HD Graphics Broxton 1");
+    case PCI_CHIP_BROXTON_3:
+      DECL_INFO_STRING(bxt12eu_break, intel_bxt12eu_device, name, "Intel(R) HD Graphics Broxton 3");
+bxt12eu_break:
+      intel_bxt12eu_device.device_id = device_id;
+      intel_bxt12eu_device.platform = cl_get_platform_default();
+      ret = &intel_bxt12eu_device;
       cl_intel_platform_get_default_extension(ret);
       cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
       break;
@@ -757,7 +784,7 @@ kbl_gt4_break:
   /* Apply any driver-dependent updates to the device info */
   cl_driver_update_device_info(ret);
 
-  #define toMB(size) (size)&(0xfffffffffffffff<<20)
+  #define toMB(size) (size)&(UINT64_MAX<<20)
   /* Get the global_mem_size and max_mem_alloc size from
    * driver, system ram and hardware*/
   struct sysinfo info;
@@ -949,7 +976,8 @@ LOCAL cl_bool is_gen_device(cl_device_id device) {
          device == &intel_skl_gt2_device ||
          device == &intel_skl_gt3_device ||
          device == &intel_skl_gt4_device ||
-         device == &intel_bxt_device     ||
+         device == &intel_bxt18eu_device ||
+         device == &intel_bxt12eu_device ||
          device == &intel_kbl_gt1_device ||
          device == &intel_kbl_gt15_device ||
          device == &intel_kbl_gt2_device ||
@@ -1080,7 +1108,7 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
     *ver = 8;
   } else if (device == &intel_skl_gt1_device || device == &intel_skl_gt2_device
         || device == &intel_skl_gt3_device || device == &intel_skl_gt4_device
-        || device == &intel_bxt_device || device == &intel_kbl_gt1_device
+        || device == &intel_bxt18eu_device || device == &intel_bxt12eu_device || device == &intel_kbl_gt1_device
         || device == &intel_kbl_gt2_device || device == &intel_kbl_gt3_device
         || device == &intel_kbl_gt4_device || device == &intel_kbl_gt15_device) {
     *ver = 9;
@@ -1157,10 +1185,12 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
 {
   int err = CL_SUCCESS;
   int dimension = 0;
+  CHECK_KERNEL(kernel);
+  if (device == NULL)
+    device = kernel->program->ctx->device;
   if (UNLIKELY(is_gen_device(device) == CL_FALSE))
     return CL_INVALID_DEVICE;
 
-  CHECK_KERNEL(kernel);
   switch (param_name) {
     case CL_KERNEL_WORK_GROUP_SIZE:
     {
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 16730db..584be9d 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -51,6 +51,10 @@ extern cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr;
 typedef uint32_t (cl_driver_get_ver_cb)(cl_driver);
 extern cl_driver_get_ver_cb *cl_driver_get_ver;
 
+/* enlarge stack size from the driver */
+typedef void (cl_driver_enlarge_stack_size_cb)(cl_driver, int32_t*);
+extern cl_driver_enlarge_stack_size_cb *cl_driver_enlarge_stack_size;
+
 typedef enum cl_self_test_res{
   SELF_TEST_PASS = 0,
   SELF_TEST_SLM_FAIL  = 1,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 31176a4..ea4e90a 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -25,6 +25,7 @@ LOCAL cl_driver_new_cb *cl_driver_new = NULL;
 LOCAL cl_driver_delete_cb *cl_driver_delete = NULL;
 LOCAL cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr = NULL;
 LOCAL cl_driver_get_ver_cb *cl_driver_get_ver = NULL;
+LOCAL cl_driver_enlarge_stack_size_cb *cl_driver_enlarge_stack_size = NULL;
 LOCAL cl_driver_set_atomic_flag_cb *cl_driver_set_atomic_flag = NULL;
 LOCAL cl_driver_get_device_id_cb *cl_driver_get_device_id = NULL;
 LOCAL cl_driver_update_device_info_cb *cl_driver_update_device_info = NULL;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 081ffce..54c0ffa 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -204,7 +204,7 @@ cl_int cl_enqueue_read_image(enqueue_data *data)
     goto error;
   }
 
-  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+  size_t offset = image->offset + image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
   src_ptr = (char*)src_ptr + offset;
 
   if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
@@ -246,8 +246,8 @@ cl_int cl_enqueue_write_image(enqueue_data *data)
     err = CL_MAP_FAILURE;
     goto error;
   }
-  //dst need to add offset
-  cl_mem_copy_image_region(data->origin, data->region, dst_ptr,
+  cl_mem_copy_image_region(data->origin, data->region,
+                           dst_ptr + image->offset,
                            image->row_pitch, image->slice_pitch,
                            data->const_ptr, data->row_pitch,
                            data->slice_pitch, image, CL_TRUE, CL_FALSE);
@@ -311,7 +311,7 @@ cl_int cl_enqueue_map_image(enqueue_data *data)
     err = CL_MAP_FAILURE;
     goto error;
   }
-  data->ptr = ptr;
+  data->ptr = (char*)ptr + image->offset;
   if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
     row_pitch = image->slice_pitch;
   else
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 183aafc..40b6ddc 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -69,7 +69,8 @@ check_intel_extension(cl_extensions_t *extensions)
 {
   int id;
   for(id = INTEL_EXT_START_ID; id <= INTEL_EXT_END_ID; id++)
-    extensions->extensions[id].base.ext_enabled = 1;
+    if(id != EXT_ID(intel_motion_estimation))
+      extensions->extensions[id].base.ext_enabled = 1;
 }
 
 void
@@ -117,16 +118,7 @@ cl_intel_platform_enable_extension(cl_device_id device, uint32_t ext)
   cl_platform_id pf = device->platform;
   assert(pf);
 
-  for(id = OPT1_EXT_START_ID; id <= OPT1_EXT_END_ID; id++) {
-    if (id == ext) {
-      if (!pf->internal_extensions->extensions[id].base.ext_enabled)
-        ext_str = pf->internal_extensions->extensions[id].base.ext_name;
-
-      break;
-    }
-  }
-
-  for(id = BASE_EXT_START_ID; id <= BASE_EXT_END_ID; id++) {
+  for(id = BASE_EXT_START_ID; id < cl_khr_extension_id_max; id++) {
     if (id == ext) {
       if (!pf->internal_extensions->extensions[id].base.ext_enabled)
         ext_str = pf->internal_extensions->extensions[id].base.ext_name;
diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c
index 8715bbd..84b4beb 100644
--- a/src/cl_khr_icd.c
+++ b/src/cl_khr_icd.c
@@ -17,7 +17,7 @@
 #include <ocl_icd.h>
 
 #include "cl_platform_id.h"
-
+#include "CL/cl_intel.h" // for clGetKernelSubGroupInfoKHR
 /* The interop functions are not implemented in Beignet */
 #define CL_GL_INTEROP(x) NULL
 /* OpenCL 1.2 is not implemented in Beignet */
@@ -168,7 +168,24 @@ struct _cl_icd_dispatch const cl_khr_icd_dispatch = {
   (void *) NULL,
   (void *) NULL,
   (void *) NULL,
-  (void *) NULL
+  (void *) NULL,
+#if (OCL_ICD_IDENTIFIED_FUNCTIONS > 110)
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) clGetKernelSubGroupInfoKHR,
+#endif
 #endif
 };
 
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 06e7c18..ad1c8c2 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -1118,7 +1118,6 @@ _cl_mem_new_image_from_buffer(cl_context ctx,
   if (buffer->flags & CL_MEM_USE_HOST_PTR)
     image->host_ptr = buffer->host_ptr + offset;
   cl_mem_image(image)->offset = offset;
-  cl_mem_image(image)->w = image_desc->image_width;
   cl_mem_add_ref(buffer);
   cl_mem_image(image)->buffer_1d = buffer;
   return image;
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index e561725..7a46c1d 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -464,6 +464,16 @@ intel_driver_get_ver(struct intel_driver *drv)
 }
 
 static void
+intel_driver_enlarge_stack_size(struct intel_driver *drv, int32_t *stack_size)
+{
+    if (drv->gen_ver == 75)
+      *stack_size = *stack_size * 4;
+    else if (drv->device_id == PCI_CHIP_BROXTON_1 || drv->device_id == PCI_CHIP_BROXTON_3 ||
+             IS_CHERRYVIEW(drv->device_id))
+      *stack_size = *stack_size * 2;
+}
+
+static void
 intel_driver_set_atomic_flag(intel_driver_t *drv, int atomic_flag)
 {
   drv->atomic_test_result = atomic_flag;
@@ -891,14 +901,14 @@ intel_update_device_info(cl_device_id device)
 
 #ifdef HAS_POOLED_EU
   /* BXT pooled eu, 3*6 to 2*9, like sub slice count is 2 */
-  unsigned int has_pooled_eu = 0;
-  if(!drm_intel_get_pooled_eu(driver->fd, &has_pooled_eu) && has_pooled_eu)
+  int has_pooled_eu;
+  if((has_pooled_eu = drm_intel_get_pooled_eu(driver->fd)) > 0)
     device->sub_slice_count = 2;
 
 #ifdef HAS_MIN_EU_IN_POOL
-  unsigned int min_eu;
+  int min_eu;
   /* for fused down 2x6 devices, beignet don't support. */
-  if (has_pooled_eu && !drm_intel_get_min_eu_in_pool(driver->fd, &min_eu)) {
+  if (has_pooled_eu > 0 && (min_eu = drm_intel_get_min_eu_in_pool(driver->fd)) > 0) {
     assert(min_eu == 9); //don't support fuse down device.
   }
 #endif //HAS_MIN_EU_IN_POOL
@@ -921,6 +931,7 @@ intel_setup_callbacks(void)
   cl_driver_new = (cl_driver_new_cb *) cl_intel_driver_new;
   cl_driver_delete = (cl_driver_delete_cb *) cl_intel_driver_delete;
   cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver;
+  cl_driver_enlarge_stack_size = (cl_driver_enlarge_stack_size_cb *) intel_driver_enlarge_stack_size;
   cl_driver_set_atomic_flag = (cl_driver_set_atomic_flag_cb *) intel_driver_set_atomic_flag;
   cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
   cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 3314ab4..a643f5c 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -1215,12 +1215,15 @@ intel_get_surface_type(cl_mem_object_type type)
 static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_type type)
 {
   uint32_t surface_type;
-  if (((IS_IVYBRIDGE(gpgpu->drv->device_id) ||
+   //Now all platforms need it, so disable platform, re-enable it
+   //when some platform don't need this workaround
+  if (/*((IS_IVYBRIDGE(gpgpu->drv->device_id) ||
         IS_HASWELL(gpgpu->drv->device_id) ||
         IS_BROADWELL(gpgpu->drv->device_id) ||
         IS_CHERRYVIEW(gpgpu->drv->device_id) ||
         IS_SKYLAKE(gpgpu->drv->device_id) ||
-        IS_BROXTON(gpgpu->drv->device_id))) &&
+        IS_BROXTON(gpgpu->drv->device_id) ||
+        IS_KABYLAKE(gpgpu->drv_device_id))) && */
       index >= BTI_WORKAROUND_IMAGE_OFFSET + BTI_RESERVED_NUM &&
       type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
     surface_type = I965_SURFACE_2D;
@@ -1537,8 +1540,9 @@ intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
   drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
   drm_intel_bo* old = gpgpu->scratch_b.bo;
   uint32_t total = per_thread_size * gpgpu->max_threads;
-  /* Per Bspec, scratch should 2X the desired size, otherwise luxmark may hang */
-  if (IS_HASWELL(gpgpu->drv->device_id) || IS_CHERRYVIEW(gpgpu->drv->device_id))
+  /* Per Bspec, scratch should 2X the desired size when EU index is not continuous */
+  if (IS_HASWELL(gpgpu->drv->device_id) || IS_CHERRYVIEW(gpgpu->drv->device_id) ||
+      PCI_CHIP_BROXTON_1 == gpgpu->drv->device_id  || PCI_CHIP_BROXTON_3 == gpgpu->drv->device_id)
       total *= 2;
 
   gpgpu->per_thread_scratch = per_thread_size;
diff --git a/src/kernels/cl_internal_block_motion_estimate_intel.cl b/src/kernels/cl_internal_block_motion_estimate_intel.cl
index e56520a..7216229 100644
--- a/src/kernels/cl_internal_block_motion_estimate_intel.cl
+++ b/src/kernels/cl_internal_block_motion_estimate_intel.cl
@@ -54,7 +54,7 @@ void block_motion_estimate_intel(accelerator_intel_t accel,
   uint src_grf4_dw1;
   uint src_grf4_dw0;
 
-  uint8 vme_result = (0, 0, 0, 0, 0, 0, 0, 0);
+  uint8 vme_result = (0);
 
   int lgid_x = get_group_id(0);
   int lgid_y = get_group_id(1);
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d.cl b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
index a218b58..e162393 100644
--- a/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
+++ b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
@@ -1,4 +1,4 @@
-kernel void __cl_copy_buffer_to_image_2d(__read_only image2d_t image, global uchar* buffer,
+kernel void __cl_copy_buffer_to_image_2d(__write_only image2d_t image, global uchar* buffer,
                                         unsigned int region0, unsigned int region1, unsigned int region2,
                                         unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
                                         unsigned int src_offset)
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_3d.cl b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
index 84d3b27..5f0e890 100644
--- a/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
+++ b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
@@ -1,4 +1,4 @@
-kernel void __cl_copy_buffer_to_image_3d(__read_only image3d_t image, global uchar* buffer,
+kernel void __cl_copy_buffer_to_image_3d(__write_only image3d_t image, global uchar* buffer,
                                         unsigned int region0, unsigned int region1, unsigned int region2,
                                         unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
                                         unsigned int src_offset)
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 0c3cb00..4957b7c 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -26,8 +26,8 @@ if (NOT NOT_BUILD_STAND_ALONE_UTEST)
   # Threads
   Find_Package(Threads)
 
-  set (CMAKE_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -std=c++0x -Wno-invalid-offsetof")
-  set (CMAKE_C_FLAGS "${CMAKE_C_CXX_FLAGS}")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_C_CXX_FLAGS} -std=c++0x -Wno-invalid-offsetof")
+  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_CXX_FLAGS}")
   set (CMAKE_CXX_FLAGS_DEBUG          "-O0 -g -DGBE_DEBUG=1")
   set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
   set (CMAKE_CXX_FLAGS_MINSIZEREL     "-Os -DNDEBUG -DGBE_DEBUG=0")
@@ -255,7 +255,6 @@ set (utests_sources
   compiler_double_div.cpp
   compiler_double_convert.cpp
   load_program_from_gen_bin.cpp
-  load_program_from_spir.cpp
   get_arg_info.cpp
   profiling_exec.cpp
   enqueue_copy_buf.cpp
@@ -284,7 +283,8 @@ set (utests_sources
   builtin_local_linear_id.cpp
   compiler_mix.cpp
   compiler_math_3op.cpp
-  compiler_bsort.cpp)
+  compiler_bsort.cpp
+  builtin_kernel_block_motion_estimate_intel.cpp)
 
 if (LLVM_VERSION_NODOT VERSION_GREATER 34)
   SET(utests_sources
@@ -325,7 +325,6 @@ else(GEN_PCI_ID)
 endif(GEN_PCI_ID)
 
 if (NOT_BUILD_STAND_ALONE_UTEST)
-  SET(utests_sources ${utests_sources} builtin_kernel_block_motion_estimate_intel.cpp)
   ADD_CUSTOM_TARGET(kernel_bin.bin DEPENDS ${kernel_bin}.bin)
 endif (NOT_BUILD_STAND_ALONE_UTEST)
 
diff --git a/utests/builtin_global_linear_id.cpp b/utests/builtin_global_linear_id.cpp
index cda7e84..3e92518 100644
--- a/utests/builtin_global_linear_id.cpp
+++ b/utests/builtin_global_linear_id.cpp
@@ -61,8 +61,8 @@ static void builtin_global_linear_id(void)
     err = clEnqueueNDRangeKernel(queue, kernel, dim, offsets, globals, locals, 0, NULL, NULL);
     if (err != CL_SUCCESS)
     {
-      printf("Error: Failed to excute kernel! %d\n", err);
-      exit(1);
+      printf("Error: Failed to execute kernel! %d\n", err);
+      OCL_ASSERT(0);
     }
 
     clFinish(queue);
diff --git a/utests/builtin_global_size.cpp b/utests/builtin_global_size.cpp
index a2ec24a..51ad054 100644
--- a/utests/builtin_global_size.cpp
+++ b/utests/builtin_global_size.cpp
@@ -74,7 +74,7 @@ static void builtin_global_size(void)
       if (err != CL_SUCCESS)
       {
         printf("Error: Failed to write to source array!\n");
-        exit(1);
+        OCL_ASSERT(0);
       }
 
       // Run the kernel
diff --git a/utests/builtin_kernel_block_motion_estimate_intel.cpp b/utests/builtin_kernel_block_motion_estimate_intel.cpp
index 5a48753..092b1d5 100644
--- a/utests/builtin_kernel_block_motion_estimate_intel.cpp
+++ b/utests/builtin_kernel_block_motion_estimate_intel.cpp
@@ -8,6 +8,9 @@ OCLRELEASEACCELERATORINTEL * oclReleaseAcceleratorIntel = NULL;
 
 void builtin_kernel_block_motion_estimate_intel(void)
 {
+  if (!cl_check_motion_estimation()) {
+    return;
+  }
   char* built_in_kernel_names;
   size_t built_in_kernels_size;
   cl_int err = CL_SUCCESS;
@@ -21,7 +24,8 @@ void builtin_kernel_block_motion_estimate_intel(void)
   if (strstr(built_in_kernel_names, "block_motion_estimate_intel") == NULL)
   {
         free(built_in_kernel_names);
-        return;
+        fprintf(stderr, "Can't find block_motion_estimate_intel built-in kernel");
+        OCL_ASSERT(0);
   }
 
   cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
@@ -41,7 +45,7 @@ void builtin_kernel_block_motion_estimate_intel(void)
 #endif
   if(!oclCreateAcceleratorIntel){
     fprintf(stderr, "Failed to get extension clCreateImageFromLibvaIntel\n");
-    exit(1);
+    OCL_ASSERT(0);
   }
   cl_accelerator_intel accel = oclCreateAcceleratorIntel(ctx, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL,sizeof(cl_motion_estimation_desc_intel), &vmedesc, &err);
   OCL_ASSERT(accel != NULL);
@@ -123,7 +127,7 @@ void builtin_kernel_block_motion_estimate_intel(void)
 #endif
   if(!oclReleaseAcceleratorIntel){
     fprintf(stderr, "Failed to get extension clCreateImageFromLibvaIntel\n");
-    exit(1);
+    OCL_ASSERT(0);
   }
   oclReleaseAcceleratorIntel(accel);
   clReleaseProgram(built_in_prog);
diff --git a/utests/builtin_kernel_max_global_size.cpp b/utests/builtin_kernel_max_global_size.cpp
index d3e8373..ad9c028 100644
--- a/utests/builtin_kernel_max_global_size.cpp
+++ b/utests/builtin_kernel_max_global_size.cpp
@@ -10,8 +10,10 @@ void builtin_kernel_max_global_size(void)
 
 
   OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, &built_in_kernels_size);
-  if(built_in_kernels_size == 0)
+  if(built_in_kernels_size <= 1) { //the size of empty string is 1
+    printf(" no built in kernel, Skip!");
     return;
+  }
 
   built_in_kernel_names = (char* )malloc(built_in_kernels_size * sizeof(char) );
   OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, built_in_kernels_size, (void*)built_in_kernel_names, &ret_sz);
diff --git a/utests/builtin_local_size.cpp b/utests/builtin_local_size.cpp
index 491175d..a55769b 100644
--- a/utests/builtin_local_size.cpp
+++ b/utests/builtin_local_size.cpp
@@ -59,7 +59,7 @@ static void builtin_local_size(void)
       if (err != CL_SUCCESS)
       {
         printf("Error: Failed to write to source array!\n");
-        exit(1);
+        OCL_ASSERT(0);
       }
 
       // Run the kernel
diff --git a/utests/builtin_num_groups.cpp b/utests/builtin_num_groups.cpp
index 832766e..764c70b 100644
--- a/utests/builtin_num_groups.cpp
+++ b/utests/builtin_num_groups.cpp
@@ -56,7 +56,7 @@ static void builtin_num_groups(void)
       if (err != CL_SUCCESS)
       {
         printf("Error: Failed to write to source array!\n");
-        exit(1);
+        OCL_ASSERT(0);
       }
 
       // Run the kernel
diff --git a/utests/compiler_sub_group_shuffle_up.cpp b/utests/compiler_sub_group_shuffle_up.cpp
index 6c32ca4..d2e054b 100644
--- a/utests/compiler_sub_group_shuffle_up.cpp
+++ b/utests/compiler_sub_group_shuffle_up.cpp
@@ -12,7 +12,7 @@ void compiler_sub_group_shuffle_up(void)
   OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
 
-  int c = 13;
+  int c = 3;
   OCL_SET_ARG(1, sizeof(int), &c);
 
   globals[0] = n;
diff --git a/utests/compiler_subgroup_broadcast.cpp b/utests/compiler_subgroup_broadcast.cpp
index 2835161..2e3fabb 100644
--- a/utests/compiler_subgroup_broadcast.cpp
+++ b/utests/compiler_subgroup_broadcast.cpp
@@ -103,7 +103,7 @@ static void subgroup_generic(T* input,
   size_t SIMD_SIZE = 0;
   OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
 
-  cl_uint SIMD_ID = 10;
+  cl_uint SIMD_ID = 2;
   /* input and expected data */
   generate_data(input, expected, SIMD_ID, SIMD_SIZE);
 
diff --git a/utests/image_1D_buffer.cpp b/utests/image_1D_buffer.cpp
index 66eb6e7..fefb241 100644
--- a/utests/image_1D_buffer.cpp
+++ b/utests/image_1D_buffer.cpp
@@ -3,8 +3,8 @@
 
 void image_1D_buffer(void)
 {
-  size_t buffer_sz = 65536;
-  char *buf_content = (char *)malloc(buffer_sz * sizeof(int));
+  size_t buffer_sz = 8192 * 2 + 32;
+  int *buf_content = (int *)malloc(buffer_sz * sizeof(int));
   int error;
   cl_image_desc image_desc;
   cl_image_format image_format;
@@ -13,7 +13,7 @@ void image_1D_buffer(void)
   OCL_CREATE_KERNEL("image_1D_buffer");
 
   for (int32_t i = 0; i < (int32_t)buffer_sz; ++i)
-    buf_content[i] = (rand() & 0xFFFFFFFF);
+    buf_content[i] = rand();
 
   OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, buffer_sz * sizeof(int), buf_content);
   OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, buffer_sz * sizeof(int), NULL);
diff --git a/utests/runtime_climage_from_boname.cpp b/utests/runtime_climage_from_boname.cpp
index 2160886..a228c97 100644
--- a/utests/runtime_climage_from_boname.cpp
+++ b/utests/runtime_climage_from_boname.cpp
@@ -161,7 +161,7 @@ void runtime_climage_from_boname(void)
 #endif
   if(!oclCreateImageFromLibvaIntel){
     fprintf(stderr, "Failed to get extension clCreateImageFromLibvaIntel\n");
-    exit(1);
+    OCL_ASSERT(0);
   }
   cl_mem dst = oclCreateImageFromLibvaIntel(ctx, &imageParam, NULL);
 
diff --git a/utests/runtime_cmrt.cpp b/utests/runtime_cmrt.cpp
index 837f09a..92bd368 100644
--- a/utests/runtime_cmrt.cpp
+++ b/utests/runtime_cmrt.cpp
@@ -236,8 +236,8 @@ void runtime_cmrt(void)
   OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
   OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
 
-  OCL_MAP_BUFFER(0);
-  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER_GTT(0);
+  OCL_MAP_BUFFER_GTT(1);
   uint8_t* src = (uint8_t*)buf_data[0];
   uint8_t* dst = (uint8_t*)buf_data[1];
   for (uint32_t j = 0; j < h; ++j)
@@ -245,8 +245,8 @@ void runtime_cmrt(void)
       src[j * w * 4 + i] = i;
       dst[j * w * 4 + i] = 0;
     }
-  OCL_UNMAP_BUFFER(0);
-  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER_GTT(0);
+  OCL_UNMAP_BUFFER_GTT(1);
 
   unsigned int d = 3;
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -259,16 +259,16 @@ void runtime_cmrt(void)
   //if kernel uses cm_linear_global_id, locals must be not NULL to invoke pCmQueue->EnqueueWithGroup
   OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, globals, NULL, 0, NULL, NULL);
 
-  OCL_MAP_BUFFER(0);
-  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER_GTT(0);
+  OCL_MAP_BUFFER_GTT(1);
   src = (uint8_t*)buf_data[0];
   dst = (uint8_t*)buf_data[1];
   for (uint32_t j = 0; j < h; ++j)
     for (uint32_t i = 0; i < w*4; i++) {
       OCL_ASSERT(src[j * w * 4 + i] / d == dst[j * w * 4 + i]);
     }
-  OCL_UNMAP_BUFFER(0);
-  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER_GTT(0);
+  OCL_UNMAP_BUFFER_GTT(1);
 }
 
 MAKE_UTEST_FROM_FUNCTION(runtime_cmrt);
diff --git a/utests/runtime_flat_address_space.cpp b/utests/runtime_flat_address_space.cpp
index cf94cf5..c2d25de 100644
--- a/utests/runtime_flat_address_space.cpp
+++ b/utests/runtime_flat_address_space.cpp
@@ -59,7 +59,7 @@ main(int argc, char *argv[])
     for (uint32_t i = 0; dst_buffer && i < n; ++i)
       if (dst_buffer[i] != int(i)) {
         fprintf(stderr, "run-time flat address space failed\n");
-        exit(-1);
+        OCL_ASSERT(0);
       }
     clEnqueueUnmapMemObject(queue, dst[j], dst_buffer, 0, NULL, NULL);
   }
diff --git a/utests/utest_generator.py b/utests/utest_generator.py
index 2c02ad6..f8fe35e 100644
--- a/utests/utest_generator.py
+++ b/utests/utest_generator.py
@@ -247,7 +247,7 @@ which can print more values and information to assist debuging the issue.
   def argvector(self,paraN,index):
     vector=re.findall(r"[0-9]+",self.inputtype[paraN][index])
     if vector:
-      vector=string.atoi(vector[0])
+      vector=int(vector[0])
     else:
       vector=1
     return vector
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 70a69cc..4f50f3f 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -875,6 +875,24 @@ int cl_check_beignet(void)
   return 1;
 }
 
+int cl_check_motion_estimation(void)
+{
+  std::string extStr;
+  size_t param_value_size;
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, &param_value_size);
+  std::vector<char> param_value(param_value_size);
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+           param_value.empty() ? NULL : &param_value.front(), &param_value_size);
+  if (!param_value.empty())
+    extStr = std::string(&param_value.front(), param_value_size-1);
+
+  if (std::strstr(extStr.c_str(), "cl_intel_motion_estimation") == NULL) {
+    printf("No cl_intel_motion_estimation, Skip!");
+    return 0;
+  }
+  return 1;
+}
+
 int cl_check_subgroups(void)
 {
   std::string extStr;
@@ -891,7 +909,11 @@ int cl_check_subgroups(void)
     return 0;
   }
   if(utestclGetKernelSubGroupInfoKHR == NULL)
-    utestclGetKernelSubGroupInfoKHR  = (clGetKernelSubGroupInfoKHR_cb*) clGetExtensionFunctionAddress("clGetKernelSubGroupInfoKHR");
+    utestclGetKernelSubGroupInfoKHR  = (clGetKernelSubGroupInfoKHR_cb*) clGetExtensionFunctionAddressForPlatform(platform,"clGetKernelSubGroupInfoKHR");
+  if(utestclGetKernelSubGroupInfoKHR == NULL) {
+    printf("Can't find clGetKernelSubGroupInfoKHR");
+    OCL_ASSERT(0);
+  }
   return 1;
 }
 
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 5f2fea6..a6e8180 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -302,6 +302,9 @@ typedef cl_int(clGetKernelSubGroupInfoKHR_cb)(cl_kernel, cl_device_id,
                                               size_t *);
 extern clGetKernelSubGroupInfoKHR_cb* utestclGetKernelSubGroupInfoKHR;
 
+/* Check if cl_intel_motion_estimation enabled. */
+extern int cl_check_motion_estimation(void);
+
 /* Check is cl version 2.0. */
 extern int cl_check_ocl20(void);
 
diff --git a/utests/utest_math_gen.py b/utests/utest_math_gen.py
index a4bfd51..6771421 100755
--- a/utests/utest_math_gen.py
+++ b/utests/utest_math_gen.py
@@ -447,14 +447,20 @@ static float minmag(float x, float y){
   nextafterUtests = func('nextafter','nextafterf',[nextafter_input_type1,nextafter_input_type2],nextafter_output_type,[nextafter_input_values1,nextafter_input_values2],'0 * FLT_ULP')
   
   ##### gentype pow(gentype x, gentype y)
-  pow_base_values = base_input_values1
+  pow_base_values = [80, -80, 3.14, -3.14, 0.5, 1, -3,-4,2,0.0,-0.0,1500.24,-1500.24]
   pow_input_values1 = []
   pow_input_values2 = []
   pow_input_values1,pow_input_values2=gene2ValuesLoop(pow_input_values1,pow_input_values2,pow_base_values)
   pow_input_type1 = ['float','float2','float4','float8','float16']
   pow_input_type2 = ['float','float2','float4','float8','float16']
   pow_output_type = ['float','float2','float4','float8','float16']
-  powUtests = func('pow','powf',[pow_input_type1,pow_input_type2],pow_output_type,[pow_input_values1,pow_input_values2],'16 * FLT_ULP')
+  pow_cpu_func='''
+static float pow_utest(float x, float y){
+    if ((x == 0.0f) && (y == -INFINITY))
+        return INFINITY;
+    return pow(x,y);
+} '''
+  powUtests = func('pow','pow_utest',[pow_input_type1,pow_input_type2],pow_output_type,[pow_input_values1,pow_input_values2],'16 * FLT_ULP',pow_cpu_func)
   
   ##### floatn pown(floatn x, intn y)
   pown_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, 0.5, 1, 0.0,1500.24,-1500.24]
@@ -469,7 +475,7 @@ static float pown(float x, int y){
   pownUtests = func('pown','pown',[pown_input_type1,pown_input_type2],pown_output_type,[pown_input_values1,pown_input_values2],'16 * FLT_ULP', pown_cpu_func)
   
   ##### gentype powr(gentype x, gentype y)
-  powr_input_values1 = [80, -80, 3.14, 1, 1.257, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +1, +1, -80, +0.0, -0.0, +0.0, -0.0, 'INFINITY','INFINITY', +1, +1, +0.0, 2.5,' NAN', 'NAN', 'NAN']
+  powr_input_values1 = [80, 80, 3.14, 1, 1.257, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +1, +1, 80, +0.0, -0.0, +0.0, -0.0, 'INFINITY','INFINITY', +1, +1, +0.0, 2.5,' NAN', 'NAN', 'NAN']
   powr_input_values2 = [5.5, 6,7, +0.0, -0.0, -1, -15.67, '-INFINITY', '-INFINITY', 1,  -2.7, 10.5, 3.1415, 3.5, -0.0, -0.0, +0.0, +0.0, +0.0, -0.0, 'INFINITY', '-INFINITY', 'NAN', 'NAN', -1.5, +0.0, 1.5]
   powr_input_type1 = ['float','float2','float4','float8','float16']
   powr_input_type2 = ['float','float2','float4','float8','float16']

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git



More information about the Pkg-opencl-commits mailing list