[beignet] 01/09: New upstream version 1.3.2

Rebecca Palmer rnpalmer-guest at moszumanska.debian.org
Sat Oct 28 07:34:13 UTC 2017


This is an automated email from the git hooks/post-receive script.

rnpalmer-guest pushed a commit to branch master
in repository beignet.

commit e55975147457d5a666c2744da5caef738b44842d
Author: Rebecca N. Palmer <rebecca_palmer at zoho.com>
Date:   Thu Oct 26 22:48:43 2017 +0100

    New upstream version 1.3.2
---
 CMakeLists.txt                                     |   2 +-
 backend/src/backend/context.cpp                    |   6 +-
 backend/src/backend/gen75_encoder.cpp              |  10 +-
 backend/src/backend/gen7_encoder.cpp               |  10 +-
 backend/src/backend/gen7_instruction.hpp           |   5 +-
 backend/src/backend/gen9_context.cpp               |   4 +
 backend/src/backend/gen9_context.hpp               |  14 ++
 backend/src/backend/gen_context.cpp                |  38 +++--
 backend/src/backend/gen_insn_selection.cpp         |  80 +++++++++
 backend/src/backend/gen_insn_selection.hpp         |   7 +
 .../src/backend/gen_insn_selection_optimize.cpp    |  67 +++++++-
 backend/src/backend/gen_insn_selection_output.cpp  |  42 +++++
 backend/src/backend/gen_program.cpp                | 116 ++++++++-----
 backend/src/backend/program.cpp                    | 163 +++++++++---------
 backend/src/backend/program.h                      |  10 +-
 backend/src/backend/program.hpp                    |   4 +-
 backend/src/ir/function.hpp                        |   2 +-
 backend/src/ir/half.cpp                            |  20 +++
 backend/src/ir/lowering.cpp                        |  17 +-
 backend/src/ir/profiling.cpp                       |   1 +
 backend/src/ir/unit.hpp                            |   4 +-
 backend/src/libocl/CMakeLists.txt                  |   4 +-
 backend/src/libocl/include/ocl.h                   |   4 +
 backend/src/libocl/include/ocl_enqueue.h           |  41 +++--
 backend/src/libocl/include/ocl_image.h             |   2 +
 backend/src/libocl/include/ocl_misc.h              |   2 +
 backend/src/libocl/src/ocl_enqueue.cl              |  54 +++---
 backend/src/libocl/src/ocl_memcpy.cl               |   3 +-
 backend/src/libocl/src/ocl_misc.cl                 |  62 +++----
 backend/src/libocl/src/ocl_sampler.ll              |  10 ++
 backend/src/libocl/src/ocl_sampler_20.ll           |  10 ++
 backend/src/llvm/ExpandLargeIntegers.cpp           |   6 +-
 backend/src/llvm/ExpandUtils.cpp                   |   4 +
 backend/src/llvm/PromoteIntegers.cpp               |   5 +
 backend/src/llvm/StripAttributes.cpp               |  15 +-
 backend/src/llvm/llvm_barrier_nodup.cpp            |  11 +-
 backend/src/llvm/llvm_bitcode_link.cpp             |  67 ++++++--
 backend/src/llvm/llvm_device_enqueue.cpp           |  52 +++---
 backend/src/llvm/llvm_gen_backend.cpp              | 188 ++++++++++++++-------
 backend/src/llvm/llvm_gen_backend.hpp              |  13 +-
 backend/src/llvm/llvm_gen_ocl_function.hxx         |   4 +
 backend/src/llvm/llvm_includes.hpp                 |  20 ++-
 backend/src/llvm/llvm_intrinsic_lowering.cpp       |   7 +-
 backend/src/llvm/llvm_loadstore_optimization.cpp   |  14 +-
 backend/src/llvm/llvm_passes.cpp                   |  45 +++--
 backend/src/llvm/llvm_printf_parser.cpp            |  16 +-
 backend/src/llvm/llvm_profiling.cpp                |  46 +++--
 backend/src/llvm/llvm_sampler_fix.cpp              |  51 +++++-
 backend/src/llvm/llvm_scalarize.cpp                |   8 +-
 backend/src/llvm/llvm_to_gen.cpp                   |  96 +++++------
 backend/src/llvm/llvm_to_gen.hpp                   |   8 +-
 backend/src/llvm/llvm_unroll.cpp                   |  21 ++-
 docs/Beignet.mdwn                                  |   2 +-
 docs/NEWS.mdwn                                     |   3 +
 docs/howto/gl-buffer-sharing-howto.mdwn            |  14 +-
 include/CL/cl_intel.h                              |   6 +
 kernels/compiler_if_else.cl                        |   2 -
 kernels/compiler_remove_negative_add.cl            |   4 +
 kernels/compiler_reqd_sub_group_size.cl            |   5 +
 kernels/compiler_sqrt_div.cl                       |   8 +
 kernels/test_fill_gl_image.cl                      |   2 +-
 src/CMakeLists.txt                                 |   4 +-
 src/cl_context.c                                   |  58 +++----
 src/cl_context.h                                   |   2 -
 src/cl_device_data.h                               |  15 +-
 src/cl_device_id.c                                 |  78 ++++++++-
 src/cl_device_id.h                                 |   2 +
 src/cl_extensions.c                                |  10 +-
 src/cl_extensions.h                                |   3 +-
 src/cl_gbe_loader.cpp                              |   5 +
 src/cl_gbe_loader.h                                |   1 +
 src/cl_gl_api.c                                    | 150 ++++++++++++++++
 src/cl_gt_device.h                                 |   2 +
 src/cl_program.c                                   |  57 ++-----
 src/intel/intel_driver.c                           |  31 ++--
 src/intel/intel_gpgpu.c                            |   2 +
 utests/CMakeLists.txt                              |   5 +-
 utests/compiler_if_else.cpp                        |  15 +-
 utests/compiler_remove_negative_add.cpp            |  40 +++++
 utests/compiler_reqd_sub_group_size.cpp            |  46 +++++
 utests/compiler_sqrt_div.cpp                       |  61 +++++++
 utests/enqueue_built_in_kernels.cpp                |   1 +
 utests/utest_helper.cpp                            |  20 +++
 utests/utest_helper.hpp                            |   3 +
 84 files changed, 1569 insertions(+), 599 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index face3ce..c11acbb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -276,7 +276,7 @@ ENDIF(CAN_OPENCL_20)
 
 set (LIBCL_DRIVER_VERSION_MAJOR 1)
 set (LIBCL_DRIVER_VERSION_MINOR 3)
-set (LIBCL_DRIVER_VERSION_PATCH 1)
+set (LIBCL_DRIVER_VERSION_PATCH 2)
 if (ENABLE_OPENCL_20)
   set (LIBCL_C_VERSION_MAJOR 2)
   set (LIBCL_C_VERSION_MINOR 0)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index e9ddd17..c9500c8 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -340,7 +340,6 @@ namespace gbe
   ///////////////////////////////////////////////////////////////////////////
   // Generic Context (shared by the simulator and the HW context)
   ///////////////////////////////////////////////////////////////////////////
-  IVAR(OCL_SIMD_WIDTH, 8, 15, 16);
 
   Context::Context(const ir::Unit &unit, const std::string &name) :
     unit(unit), fn(*unit.getFunction(name)), name(name), liveness(NULL), dag(NULL), useDWLabel(false)
@@ -361,10 +360,7 @@ namespace gbe
   }
 
   void Context::startNewCG(uint32_t simdWidth) {
-    if (simdWidth == 0 || OCL_SIMD_WIDTH != 15)
-      this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
-    else
-      this->simdWidth = simdWidth;
+    this->simdWidth = simdWidth;
     GBE_SAFE_DELETE(this->registerAllocator);
     GBE_SAFE_DELETE(this->scratchAllocator);
     GBE_ASSERT(dag != NULL && liveness != NULL);
diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
index b82cc43..06cca3c 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -53,8 +53,14 @@ namespace gbe
     gen7_insn->header.quarter_control = this->curr.quarterControl;
     gen7_insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
     gen7_insn->header.mask_control = this->curr.noMask;
-    gen7_insn->bits2.ia1.flag_reg_nr = this->curr.flag;
-    gen7_insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+    if (insn->header.opcode == GEN_OPCODE_MAD || insn->header.opcode == GEN_OPCODE_LRP)
+    {
+      gen7_insn->bits1.da3src.flag_reg_nr = this->curr.flag;
+      gen7_insn->bits1.da3src.flag_sub_reg_nr = this->curr.subFlag;
+    } else {
+      gen7_insn->bits2.ia1.flag_reg_nr = this->curr.flag;
+      gen7_insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+    }
     if (this->curr.predicate != GEN_PREDICATE_NONE) {
       gen7_insn->header.predicate_control = this->curr.predicate;
       gen7_insn->header.predicate_inverse = this->curr.inversePredicate;
diff --git a/backend/src/backend/gen7_encoder.cpp b/backend/src/backend/gen7_encoder.cpp
index 4b2cd9a..d526f5d 100644
--- a/backend/src/backend/gen7_encoder.cpp
+++ b/backend/src/backend/gen7_encoder.cpp
@@ -46,8 +46,14 @@ namespace gbe
     gen7_insn->header.quarter_control = this->curr.quarterControl;
     gen7_insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
     gen7_insn->header.mask_control = this->curr.noMask;
-    gen7_insn->bits2.ia1.flag_reg_nr = this->curr.flag;
-    gen7_insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+    if (insn->header.opcode == GEN_OPCODE_MAD || insn->header.opcode == GEN_OPCODE_LRP)
+    {
+      gen7_insn->bits1.da3src.flag_reg_nr = this->curr.flag;
+      gen7_insn->bits1.da3src.flag_sub_reg_nr = this->curr.subFlag;
+    } else {
+      gen7_insn->bits2.ia1.flag_reg_nr = this->curr.flag;
+      gen7_insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+    }
     if (this->curr.predicate != GEN_PREDICATE_NONE) {
       gen7_insn->header.predicate_control = this->curr.predicate;
       gen7_insn->header.predicate_inverse = this->curr.inversePredicate;
diff --git a/backend/src/backend/gen7_instruction.hpp b/backend/src/backend/gen7_instruction.hpp
index 7d7eada..c985fb8 100644
--- a/backend/src/backend/gen7_instruction.hpp
+++ b/backend/src/backend/gen7_instruction.hpp
@@ -142,8 +142,9 @@ union Gen7NativeInstruction
 
       struct {
         uint32_t dest_reg_file:1;
-        uint32_t flag_subreg_num:1;
-        uint32_t pad0:2;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad0:1;
         uint32_t src0_abs:1;
         uint32_t src0_negate:1;
         uint32_t src1_abs:1;
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index 483b2c3..2ce53b6 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -236,4 +236,8 @@ namespace gbe
     this->sel = GBE_NEW(SelectionKbl, *this);
   }
 
+  void GlkContext::newSelection(void) {
+    this->sel = GBE_NEW(SelectionGlk, *this);
+  }
+
 }
diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp
index 9977e9a..0476661 100644
--- a/backend/src/backend/gen9_context.hpp
+++ b/backend/src/backend/gen9_context.hpp
@@ -82,5 +82,19 @@ namespace gbe
     private:
       virtual void newSelection(void);
   };
+
+  /* This class is used to implement the geminilake
+     specific logic for context. */
+  class GlkContext : public BxtContext
+  {
+    public:
+      virtual ~GlkContext(void) { };
+      GlkContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+        : BxtContext(unit, name, deviceID, relaxMath) {
+        };
+
+    private:
+      virtual void newSelection(void);
+  };
 }
 #endif /* __GBE_GEN9_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index bb104cf..874e79f 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1949,23 +1949,33 @@ namespace gbe
     indirect_src = GenRegister::indirect(dst.type, 0, GEN_WIDTH_1,
                                          GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL, GEN_HORIZONTAL_STRIDE_0);
 
-    p->push();
-      p->curr.execWidth = 8;
-      p->curr.quarterControl = GEN_COMPRESSION_Q1;
-      p->MOV(a0, tmp);
-      p->MOV(dst, indirect_src);
-    p->pop();
-
-    if (simdWidth == 16) {
+    if (sel->isScalarReg(dst.reg())) {
+      p->push();
+        p->curr.execWidth = 1;
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
+        p->MOV(a0, tmp);
+        p->MOV(dst, indirect_src);
+      p->pop();
+    } else {
       p->push();
         p->curr.execWidth = 8;
-        p->curr.quarterControl = GEN_COMPRESSION_Q2;
-
-        const GenRegister nextDst = GenRegister::Qn(dst, 1);
-        const GenRegister nextOffset = GenRegister::Qn(tmp, 1);
-        p->MOV(a0, nextOffset);
-        p->MOV(nextDst, indirect_src);
+        p->curr.quarterControl = GEN_COMPRESSION_Q1;
+        p->MOV(a0, tmp);
+        p->MOV(dst, indirect_src);
       p->pop();
+
+      if (simdWidth == 16) {
+        p->push();
+          p->curr.execWidth = 8;
+          p->curr.quarterControl = GEN_COMPRESSION_Q2;
+
+          const GenRegister nextDst = GenRegister::Qn(dst, 1);
+          const GenRegister nextOffset = GenRegister::Qn(tmp, 1);
+          p->MOV(a0, nextOffset);
+          p->MOV(nextDst, indirect_src);
+        p->pop();
+      }
     }
   }
 
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 22b0ddc..e91a14b 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2859,6 +2859,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     opt_features = SIOF_LOGICAL_SRCMOD;
   }
 
+  SelectionGlk::SelectionGlk(GenContext &ctx) : Selection(ctx) {
+    this->opaque->setHas32X32Mul(true);
+    this->opaque->setHasLongType(true);
+    this->opaque->setLongRegRestrict(true);
+    this->opaque->setHasDoubleType(true);
+    this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
+    this->opaque->setSlowByteGather(false);
+    this->opaque->setHasHalfType(true);
+    opt_features = SIOF_LOGICAL_SRCMOD | SIOF_OP_MOV_LONG_REG_RESTRICT;
+  }
+
   void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
                                       uint32_t bti, bool is3D) {
     uint32_t elemID = 0;
@@ -3741,6 +3752,74 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     }
   };
 
+  /*! there some patterns like:
+    sqrt r1, r2;
+    load r4, 1.0;       ===> rqrt r3, r2
+    div r3, r4, r1; */
+  class SqrtDivInstructionPattern : public SelectionPattern {
+  public:
+    /*! Register the pattern for all opcodes of the family */
+    SqrtDivInstructionPattern(void) : SelectionPattern(1, 1) { this->opcodes.push_back(ir::OP_DIV); }
+
+    /*! Implements base class */
+    virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+      using namespace ir;
+
+      // We are good to try. We need a MUL for one of the two sources
+      const ir::BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
+      if (insn.getType() != TYPE_FLOAT)
+        return false;
+      SelectionDAG *child0 = dag.child[0];
+      SelectionDAG *child1 = dag.child[1];
+      const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_FLOAT);
+
+      if (child1 && child1->insn.getOpcode() == OP_SQR) {
+        GBE_ASSERT(cast<ir::UnaryInstruction>(child1->insn).getType() == TYPE_FLOAT);
+        GenRegister srcSQR = sel.selReg(child1->insn.getSrc(0), TYPE_FLOAT);
+        const GenRegister tmp = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_FLOAT);
+        const GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_FLOAT);
+        float immVal = 0.0f;
+
+        if (child0 && child0->insn.getOpcode() == OP_LOADI) {
+          const auto &loadimm = cast<LoadImmInstruction>(child0->insn);
+          const Immediate imm = loadimm.getImmediate();
+          const Type type = imm.getType();
+          if (type == TYPE_FLOAT)
+            immVal = imm.getFloatValue();
+          else if (type == TYPE_S32 || type == TYPE_U32)
+            immVal = imm.getIntegerValue();
+        }
+
+        sel.push();
+        if (sel.isScalarReg(insn.getDst(0)))
+          sel.curr.execWidth = 1;
+
+        if (immVal == 1.0f) {
+          sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, srcSQR);
+          if (child1->child[0])
+            child1->child[0]->isRoot = 1;
+        } else {
+          sel.MATH(tmp, GEN_MATH_FUNCTION_RSQ, srcSQR);
+          if (immVal != 0.0f) {
+            GenRegister isrc = GenRegister::immf(immVal);
+            sel.MUL(dst, tmp, isrc);
+          } else {
+            sel.MUL(dst, src0, tmp);
+            if (child0)
+              child0->isRoot = 1;
+          }
+
+          if (child1->child[0])
+            child1->child[0]->isRoot = 1;
+        }
+        sel.pop();
+
+        return true;
+      }
+      return false;
+    }
+  };
+
   /*! sel.{le,l,ge...} like patterns */
   class SelectModifierInstructionPattern : public SelectionPattern
   {
@@ -8078,6 +8157,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
 
   SelectionLibrary::SelectionLibrary(void) {
     this->insert<UnaryInstructionPattern>();
+    this->insert<SqrtDivInstructionPattern>();
     this->insert<BinaryInstructionPattern>();
     this->insert<TypedWriteInstructionPattern>();
     this->insert<SyncInstructionPattern>();
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index a99b8a9..8f34678 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -374,6 +374,13 @@ namespace gbe
       SelectionKbl(GenContext &ctx);
   };
 
+  class SelectionGlk: public Selection
+  {
+    public:
+      /*! Initialize internal structures used for the selection */
+      SelectionGlk(GenContext &ctx);
+  };
+
 } /* namespace gbe */
 
 #endif /*  __GEN_INSN_SELECTION_HPP__ */
diff --git a/backend/src/backend/gen_insn_selection_optimize.cpp b/backend/src/backend/gen_insn_selection_optimize.cpp
index d2e0fb9..2b2f8bc 100644
--- a/backend/src/backend/gen_insn_selection_optimize.cpp
+++ b/backend/src/backend/gen_insn_selection_optimize.cpp
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <climits>
 #include <map>
+#include <math.h>
 
 namespace gbe
 {
@@ -74,8 +75,7 @@ namespace gbe
                   const GenRegister& replacement) :
                   insn(insn), intermedia(intermedia), replacement(replacement)
       {
-        assert(insn.opcode == SEL_OP_MOV);
-        assert(&(insn.src(0)) == &replacement);
+        assert(insn.opcode == SEL_OP_MOV || insn.opcode == SEL_OP_ADD);
         assert(&(insn.dst(0)) == &intermedia);
         this->elements = CalculateElements(intermedia, insn.state.execWidth);
         replacementOverwritten = false;
@@ -102,6 +102,7 @@ namespace gbe
     void doReplacement(ReplaceInfo* info);
     bool CanBeReplaced(const ReplaceInfo* info, const SelectionInstruction& insn, const GenRegister& var);
     void cleanReplaceInfoMap();
+    void doNegAddOptimization(SelectionInstruction &insn);
 
     SelectionBlock &bb;
     const ir::Liveness::LiveOut& liveout;
@@ -159,8 +160,13 @@ namespace gbe
 
   void SelBasicBlockOptimizer::addToReplaceInfoMap(SelectionInstruction& insn)
   {
-    assert(insn.opcode == SEL_OP_MOV);
-    const GenRegister& src = insn.src(0);
+    assert(insn.opcode == SEL_OP_MOV || insn.opcode == SEL_OP_ADD);
+    GenRegister &src = insn.src(0);
+    if (insn.opcode == SEL_OP_ADD) {
+      if (src.file == GEN_IMMEDIATE_VALUE)
+        src = insn.src(1);
+    }
+
     const GenRegister& dst = insn.dst(0);
     if (src.type != dst.type || src.file != dst.file)
       return;
@@ -184,6 +190,40 @@ namespace gbe
     if (insn.opcode == SEL_OP_BSWAP) //should remove once bswap issue is fixed
       return false;
 
+    //the src modifier is not supported by the following instructions
+    if(info->replacement.negation || info->replacement.absolute)
+    {
+      switch(insn.opcode)
+      {
+        case SEL_OP_MATH:
+        {
+          switch(insn.extra.function)
+          {
+            case GEN_MATH_FUNCTION_INT_DIV_QUOTIENT:
+            case GEN_MATH_FUNCTION_INT_DIV_REMAINDER:
+            case GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
+              return false;
+            default:
+              break;
+          }
+
+          break;
+        }
+        case SEL_OP_CBIT:
+        case SEL_OP_FBH:
+        case SEL_OP_FBL:
+        case SEL_OP_BRC:
+        case SEL_OP_BRD:
+        case SEL_OP_BFREV:
+        case SEL_OP_LZD:
+        case SEL_OP_HADD:
+        case SEL_OP_RHADD:
+          return false;
+        default:
+          break;
+      }
+    }
+
     if (insn.isWrite() || insn.isRead()) //register in selection vector
       return false;
 
@@ -249,10 +289,29 @@ namespace gbe
 
       if (insn.opcode == SEL_OP_MOV)
         addToReplaceInfoMap(insn);
+
+      doNegAddOptimization(insn);
     }
     cleanReplaceInfoMap();
   }
 
+  /* LLVM transform Mad(a, -b, c) to
+     Add b, -b, 0
+     Mad val, a, b, c
+     for Gen support negtive modifier, mad(a, -b, c) is native suppoted.
+     Also it can be used for the same like instruction sequence.
+     Do it just like a:  mov b, -b, so it is a Mov operation like LocalCopyPropagation
+  */
+  void SelBasicBlockOptimizer::doNegAddOptimization(SelectionInstruction &insn) {
+    if (insn.opcode == SEL_OP_ADD) {
+      GenRegister src0 = insn.src(0);
+      GenRegister src1 = insn.src(1);
+      if ((src0.negation && src1.file == GEN_IMMEDIATE_VALUE && src1.value.f == 0.0f) ||
+          (src1.negation && src0.file == GEN_IMMEDIATE_VALUE && src0.value.f == 0.0f))
+        addToReplaceInfoMap(insn);
+    }
+  }
+
   void SelBasicBlockOptimizer::run()
   {
     for (size_t i = 0; i < MaxTries; ++i) {
diff --git a/backend/src/backend/gen_insn_selection_output.cpp b/backend/src/backend/gen_insn_selection_output.cpp
index f23e8c8..33f0d15 100644
--- a/backend/src/backend/gen_insn_selection_output.cpp
+++ b/backend/src/backend/gen_insn_selection_output.cpp
@@ -142,6 +142,48 @@ namespace gbe
       }
     }
 
+    if (insn.opcode == SEL_OP_MATH) {
+      switch (insn.extra.function) {
+        case GEN_MATH_FUNCTION_INV:
+          strcat(opname, ".inv");
+          break;
+        case GEN_MATH_FUNCTION_LOG:
+          strcat(opname, ".log");
+          break;
+        case GEN_MATH_FUNCTION_EXP:
+          strcat(opname, ".exp");
+          break;
+        case GEN_MATH_FUNCTION_SQRT:
+          strcat(opname, ".sqrt");
+          break;
+        case GEN_MATH_FUNCTION_RSQ:
+          strcat(opname, ".rsq");
+          break;
+        case GEN_MATH_FUNCTION_SIN:
+          strcat(opname, ".sin");
+          break;
+        case GEN_MATH_FUNCTION_COS:
+          strcat(opname, ".cos");
+          break;
+        case GEN_MATH_FUNCTION_FDIV:
+          strcat(opname, ".fdiv");
+          break;
+        case GEN_MATH_FUNCTION_POW:
+          strcat(opname, ".pow");
+          break;
+        case GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
+          strcat(opname, ".intdivmod");
+          break;
+        case GEN_MATH_FUNCTION_INT_DIV_QUOTIENT:
+          strcat(opname, ".intdiv");
+          break;
+        case GEN_MATH_FUNCTION_INT_DIV_REMAINDER:
+          strcat(opname, ".intmod");
+          break;
+      }
+    }
+
+
     int n = strlen(opname);
     if(n >= OP_NAME_LENGTH - 20) {
       cout << "opname too long: " << opname << endl;
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 073ede6..e06ed40 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -24,18 +24,18 @@
 
 #ifdef GBE_COMPILER_AVAILABLE
 #include "llvm/Config/llvm-config.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/DataLayout.h"
-#else
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/DataLayout.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm-c/Linker.h"
+#include "llvm-c/BitReader.h"
+#include "llvm-c/BitWriter.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+#include "llvm/Bitcode/BitcodeWriter.h"
+#else
 #include "llvm/Bitcode/ReaderWriter.h"
+#endif /* LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40 */
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -61,6 +61,7 @@
 #include <clang/CodeGen/CodeGenAction.h>
 #endif
 
+#include "sys/cvar.hpp"
 #include <cstring>
 #include <sstream>
 #include <memory>
@@ -127,30 +128,46 @@ namespace gbe {
 
   void GenProgram::CleanLlvmResource(void){
 #ifdef GBE_COMPILER_AVAILABLE
+    llvm::LLVMContext* ctx = NULL;
     if(module){
+      ctx = &((llvm::Module*)module)->getContext();
+      (void)ctx;
       delete (llvm::Module*)module;
       module = NULL;
     }
-
+//llvm's version < 3.9, ctx is global ctx, can't be deleted.
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+    //each module's context is individual, just delete it, ignaor llvm_ctx.
+    if (ctx != NULL)
+      delete ctx;
+#else
     if(llvm_ctx){
       delete (llvm::LLVMContext*)llvm_ctx;
       llvm_ctx = NULL;
     }
 #endif
+#endif
   }
 
   /*! We must avoid spilling at all cost with Gen */
-  static const struct CodeGenStrategy {
+  struct CodeGenStrategy {
     uint32_t simdWidth;
     uint32_t reservedSpillRegs;
     bool limitRegisterPressure;
-  } codeGenStrategy[] = {
+  };
+  static const struct CodeGenStrategy codeGenStrategyDefault[] = {
     {16, 0, false},
     {8, 0, false},
     {8, 8, false},
     {8, 16, false},
   };
+  static const struct CodeGenStrategy codeGenStrategySimd16[] = {
+    {16, 0, false},
+    {16, 8, false},
+    {16, 16, false},
+  };
 
+  IVAR(OCL_SIMD_WIDTH, 8, 15, 16);
   Kernel *GenProgram::compileKernel(const ir::Unit &unit, const std::string &name,
                                     bool relaxMath, int profiling) {
 #ifdef GBE_COMPILER_AVAILABLE
@@ -158,19 +175,23 @@ namespace gbe {
     // when the function already provides the simd width we need to use (i.e.
     // non zero)
     const ir::Function *fn = unit.getFunction(name);
+    const struct CodeGenStrategy* codeGenStrategy = codeGenStrategyDefault;
     if(fn == NULL)
       GBE_ASSERT(0);
-    uint32_t codeGenNum = sizeof(codeGenStrategy) / sizeof(codeGenStrategy[0]);
+    uint32_t codeGenNum = sizeof(codeGenStrategyDefault) / sizeof(codeGenStrategyDefault[0]);
     uint32_t codeGen = 0;
     GenContext *ctx = NULL;
-    if (fn->getSimdWidth() == 8) {
+    if ( fn->getSimdWidth() != 0 && OCL_SIMD_WIDTH != 15) {
+      GBE_ASSERTM(0, "unsupported SIMD width!");
+    }else if (fn->getSimdWidth() == 8 || OCL_SIMD_WIDTH == 8) {
       codeGen = 1;
-    } else if (fn->getSimdWidth() == 16) {
-      codeGenNum = 1;
-    } else if (fn->getSimdWidth() == 0) {
+    } else if (fn->getSimdWidth() == 16 || OCL_SIMD_WIDTH == 16){
+      codeGenStrategy = codeGenStrategySimd16;
+      codeGenNum = sizeof(codeGenStrategySimd16) / sizeof(codeGenStrategySimd16[0]);
+    } else if (fn->getSimdWidth() == 0 && OCL_SIMD_WIDTH == 15) {
       codeGen = 0;
     } else
-      GBE_ASSERT(0);
+      GBE_ASSERTM(0, "unsupported SIMD width!");
     Kernel *kernel = NULL;
 
     // Stop when compilation is successful
@@ -188,6 +209,8 @@ namespace gbe {
       ctx = GBE_NEW(BxtContext, unit, name, deviceID, relaxMath);
     } else if (IS_KABYLAKE(deviceID)) {
       ctx = GBE_NEW(KblContext, unit, name, deviceID, relaxMath);
+    } else if (IS_GEMINILAKE(deviceID)) {
+      ctx = GBE_NEW(GlkContext, unit, name, deviceID, relaxMath);
     }
     GBE_ASSERTM(ctx != NULL, "Fail to create the gen context\n");
 
@@ -243,6 +266,7 @@ namespace gbe {
     GBHI_SKL = 5,
     GBHI_BXT = 6,
     GBHI_KBL = 7,
+    GBHI_GLK = 8,
     GBHI_MAX,
   };
 #define GEN_BINARY_VERSION  1
@@ -254,7 +278,8 @@ namespace gbe {
                                               {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'B', 'D', 'W'},
                                               {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'S', 'K', 'L'},
                                               {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'B', 'X', 'T'},
-                                              {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'K', 'B', 'T'}
+                                              {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'K', 'B', 'T'},
+                                              {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'G', 'L', 'K'}
                                               };
 
 #define FILL_GEN_HEADER(binary, index)  do {int i = 0; do {*(binary+i) = gen_binary_header[index][i]; i++; }while(i < GEN_BINARY_HEADER_LENGTH);}while(0)
@@ -266,6 +291,7 @@ namespace gbe {
 #define FILL_SKL_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_SKL)
 #define FILL_BXT_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_BXT)
 #define FILL_KBL_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_KBL)
+#define FILL_GLK_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_GLK)
 
   static bool genHeaderCompare(const unsigned char *BufPtr, GEN_BINARY_HEADER_INDEX index)
   {
@@ -291,6 +317,7 @@ namespace gbe {
 #define MATCH_SKL_HEADER(binary) genHeaderCompare(binary, GBHI_SKL)
 #define MATCH_BXT_HEADER(binary) genHeaderCompare(binary, GBHI_BXT)
 #define MATCH_KBL_HEADER(binary) genHeaderCompare(binary, GBHI_KBL)
+#define MATCH_GLK_HEADER(binary) genHeaderCompare(binary, GBHI_GLK)
 
 #define MATCH_DEVICE(deviceID, binary) ((IS_IVYBRIDGE(deviceID) && MATCH_IVB_HEADER(binary)) ||  \
                                       (IS_IVYBRIDGE(deviceID) && MATCH_IVB_HEADER(binary)) ||  \
@@ -300,7 +327,8 @@ namespace gbe {
                                       (IS_CHERRYVIEW(deviceID) && MATCH_CHV_HEADER(binary)) ||  \
                                       (IS_SKYLAKE(deviceID) && MATCH_SKL_HEADER(binary)) || \
                                       (IS_BROXTON(deviceID) && MATCH_BXT_HEADER(binary)) || \
-                                      (IS_KABYLAKE(deviceID) && MATCH_KBL_HEADER(binary)) \
+                                      (IS_KABYLAKE(deviceID) && MATCH_KBL_HEADER(binary)) || \
+                                      (IS_GEMINILAKE(deviceID) && MATCH_GLK_HEADER(binary)) \
                                       )
 
   static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char *binary, size_t size) {
@@ -335,20 +363,20 @@ namespace gbe {
     //the first byte stands for binary_type.
     binary_content.assign(binary+1, size-1);
     llvm::StringRef llvm_bin_str(binary_content);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
-    llvm::LLVMContext& c = GBEGetLLVMContext();
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+    llvm::LLVMContext *c = new llvm::LLVMContext;
 #else
-    llvm::LLVMContext& c = llvm::getGlobalContext();
+    llvm::LLVMContext *c = &llvm::getGlobalContext();
 #endif
     llvm::SMDiagnostic Err;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
     std::unique_ptr<llvm::MemoryBuffer> memory_buffer = llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
     acquireLLVMContextLock();
-    llvm::Module* module = llvm::parseIR(memory_buffer->getMemBufferRef(), Err, c).release();
+    llvm::Module* module = llvm::parseIR(memory_buffer->getMemBufferRef(), Err, *c).release();
 #else
     llvm::MemoryBuffer* memory_buffer = llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
     acquireLLVMContextLock();
-    llvm::Module* module = llvm::ParseIR(memory_buffer, Err, c);
+    llvm::Module* module = llvm::ParseIR(memory_buffer, Err, *c);
 #endif
     // if load 32 bit spir binary, the triple should be spir-unknown-unknown.
     llvm::Triple triple(module->getTargetTriple());
@@ -408,6 +436,8 @@ namespace gbe {
         FILL_BXT_HEADER(*binary);
       }else if(IS_KABYLAKE(prog->deviceID)){
         FILL_KBL_HEADER(*binary);
+      }else if(IS_GEMINILAKE(prog->deviceID)){
+        FILL_GLK_HEADER(*binary);
       }else {
         free(*binary);
         *binary = NULL;
@@ -436,7 +466,6 @@ namespace gbe {
   }
 
   static gbe_program genProgramNewFromLLVM(uint32_t deviceID,
-                                           const char *fileName,
                                            const void* module,
                                            const void* llvm_ctx,
                                            const char* asm_file_name,
@@ -456,7 +485,7 @@ namespace gbe {
 #ifdef GBE_COMPILER_AVAILABLE
     std::string error;
     // Try to compile the program
-    if (program->buildFromLLVMFile(fileName, module, error, optLevel) == false) {
+    if (program->buildFromLLVMModule(module, error, optLevel) == false) {
       if (err != NULL && errSize != NULL && stringSize > 0u) {
         const size_t msgSize = std::min(error.size(), stringSize-1u);
         std::memcpy(err, error.c_str(), msgSize);
@@ -486,31 +515,39 @@ namespace gbe {
   {
 #ifdef GBE_COMPILER_AVAILABLE
     using namespace gbe;
-    char* errMsg;
+    char* errMsg = NULL;
     if(((GenProgram*)dst_program)->module == NULL){
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+      LLVMModuleRef modRef;
+      LLVMParseBitcodeInContext2(wrap(new llvm::LLVMContext()),
+                                 LLVMWriteBitcodeToMemoryBuffer(wrap((llvm::Module*)((GenProgram*)src_program)->module)),
+                                 &modRef);
+      ((GenProgram*)dst_program)->module = llvm::unwrap(modRef);
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
       ((GenProgram*)dst_program)->module = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module).release();
 #else
       ((GenProgram*)dst_program)->module = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module);
 #endif
       errSize = 0;
-    }else{
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
-      // Src now will be removed automatically. So clone it.
-      llvm::Module* src = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module).release();
-#else
+    } else {
       llvm::Module* src = (llvm::Module*)((GenProgram*)src_program)->module;
-#endif
       llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)->module;
-
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
-      if (LLVMLinkModules2(wrap(dst), wrap(src))) {
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+      if (&src->getContext() != &dst->getContext()) {
+        LLVMModuleRef modRef;
+        LLVMParseBitcodeInContext2(wrap(&dst->getContext()),
+                                    LLVMWriteBitcodeToMemoryBuffer(wrap(src)),
+                                    &modRef);
+        src = llvm::unwrap(modRef);
+      }
+      llvm::Module* clone = llvm::CloneModule(src).release();
+      if (LLVMLinkModules2(wrap(dst), wrap(clone))) {
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
       if (LLVMLinkModules(wrap(dst), wrap(src), LLVMLinkerPreserveSource_Removed, &errMsg)) {
 #else
       if (LLVMLinkModules(wrap(dst), wrap(src), LLVMLinkerPreserveSource, &errMsg)) {
 #endif
-        if (err != NULL && errSize != NULL && stringSize > 0u) {
+        if (err != NULL && errSize != NULL && stringSize > 0u && errMsg) {
           strncpy(err, errMsg, stringSize-1);
           err[stringSize-1] = '\0';
           *errSize = strlen(err);
@@ -518,7 +555,6 @@ namespace gbe {
         return true;
       }
     }
-    // Everything run fine
 #endif
     return false;
   }
@@ -579,7 +615,7 @@ namespace gbe {
     acquireLLVMContextLock();
     llvm::Module* module = (llvm::Module*)p->module;
 
-    if (p->buildFromLLVMFile(NULL, module, error, optLevel) == false) {
+    if (p->buildFromLLVMModule(module, error, optLevel) == false) {
       if (err != NULL && errSize != NULL && stringSize > 0u) {
         const size_t msgSize = std::min(error.size(), stringSize-1u);
         std::memcpy(err, error.c_str(), msgSize);
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 09c79d8..c37c595 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -40,6 +40,7 @@
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IRReader/IRReader.h"
 #endif
 
 #include <cstring>
@@ -52,34 +53,23 @@
 #include <mutex>
 
 #ifdef GBE_COMPILER_AVAILABLE
-/* Not defined for LLVM 3.0 */
-#if !defined(LLVM_VERSION_MAJOR)
-#define LLVM_VERSION_MAJOR 3
-#endif /* !defined(LLVM_VERSION_MAJOR) */
-
-/* Not defined for LLVM 3.0 */
-#if !defined(LLVM_VERSION_MINOR)
-#define LLVM_VERSION_MINOR 0
-#endif /* !defined(LLVM_VERSION_MINOR) */
 
 #include <clang/CodeGen/CodeGenAction.h>
 #include <clang/Frontend/CompilerInstance.h>
 #include <clang/Frontend/CompilerInvocation.h>
-#if LLVM_VERSION_MINOR <= 1
-#include <clang/Frontend/DiagnosticOptions.h>
-#else
 #include <clang/Basic/DiagnosticOptions.h>
-#endif  /* LLVM_VERSION_MINOR <= 1 */
 #include <clang/Frontend/TextDiagnosticPrinter.h>
 #include <clang/Basic/TargetInfo.h>
 #include <clang/Basic/TargetOptions.h>
 #include <llvm/ADT/IntrusiveRefCntPtr.h>
-#if LLVM_VERSION_MINOR <= 2
-#include <llvm/Module.h>
-#else
 #include <llvm/IR/Module.h>
-#endif  /* LLVM_VERSION_MINOR <= 2 */
+
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+#include <llvm/Bitcode/BitcodeWriter.h>
+#include <clang/Lex/PreprocessorOptions.h>
+#else
 #include <llvm/Bitcode/ReaderWriter.h>
+#endif
 #include <llvm/Support/raw_ostream.h>
 #endif
 
@@ -124,32 +114,17 @@ namespace gbe {
   IVAR(OCL_PROFILING_LOG, 0, 0, 1); // Int for different profiling types.
   BVAR(OCL_OUTPUT_BUILD_LOG, false);
 
-  bool Program::buildFromLLVMFile(const char *fileName,
-                                         const void* module,
-                                         std::string &error,
-                                         int optLevel) {
+  bool Program::buildFromLLVMModule(const void* module,
+                                              std::string &error,
+                                              int optLevel) {
     ir::Unit *unit = new ir::Unit();
-    llvm::Module * cloned_module = NULL;
     bool ret = false;
-    if(module){
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
-      cloned_module = llvm::CloneModule((llvm::Module*)module).release();
-#else
-      cloned_module = llvm::CloneModule((llvm::Module*)module);
-#endif
-    }
+
     bool strictMath = true;
     if (fast_relaxed_math || !OCL_STRICT_CONFORMANCE)
       strictMath = false;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
-    llvm::Module * linked_module = module ? llvm::CloneModule((llvm::Module*)module).release() : NULL;
-    // Src now will be removed automatically. So clone it.
-    if (llvmToGen(*unit, fileName, linked_module, optLevel, strictMath, OCL_PROFILING_LOG, error) == false) {
-#else
-    if (llvmToGen(*unit, fileName, module, optLevel, strictMath, OCL_PROFILING_LOG, error) == false) {
-#endif
-      if (fileName)
-        error = std::string(fileName) + " not found";
+
+    if (llvmToGen(*unit, module, optLevel, strictMath, OCL_PROFILING_LOG, error) == false) {
       delete unit;
       return false;
     }
@@ -158,13 +133,8 @@ namespace gbe {
     if(!unit->getValid()) {
       delete unit;   //clear unit
       unit = new ir::Unit();
-      if(cloned_module){
-        //suppose file exists and llvmToGen will not return false.
-        llvmToGen(*unit, fileName, cloned_module, 0, strictMath, OCL_PROFILING_LOG, error);
-      }else{
-        //suppose file exists and llvmToGen will not return false.
-        llvmToGen(*unit, fileName, module, 0, strictMath, OCL_PROFILING_LOG, error);
-      }
+      //suppose file exists and llvmToGen will not return false.
+      llvmToGen(*unit, module, 0, strictMath, OCL_PROFILING_LOG, error);
     }
     if(unit->getValid()){
       std::string error2;
@@ -174,9 +144,6 @@ namespace gbe {
       error = error + error2;
     }
     delete unit;
-    if(cloned_module){
-      delete (llvm::Module*) cloned_module;
-    }
     return ret;
   }
 
@@ -668,7 +635,7 @@ namespace gbe {
     // The ParseCommandLineOptions used for mllvm args can not be used with multithread
     // and GVN now have a 100 inst limit on block scan. Now only pass a bigger limit
     // for each context only once, this can also fix multithread bug.
-#if LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
     static bool ifsetllvm = false;
     if(!ifsetllvm) {
       args.push_back("-mllvm");
@@ -686,10 +653,6 @@ namespace gbe {
     args.push_back("-disable-llvm-optzns");
     if(bFastMath)
       args.push_back("-D __FAST_RELAXED_MATH__=1");
-#if LLVM_VERSION_MINOR <= 2
-    args.push_back("-triple");
-    args.push_back("nvptx");
-#else
     args.push_back("-x");
     args.push_back("cl");
     args.push_back("-triple");
@@ -698,7 +661,6 @@ namespace gbe {
       args.push_back("-fblocks");
     } else
       args.push_back("spir");
-#endif /* LLVM_VERSION_MINOR <= 2 */
     args.push_back("stringInput.cl");
     args.push_back("-ffp-contract=on");
     if(OCL_DEBUGINFO) args.push_back("-g");
@@ -716,24 +678,33 @@ namespace gbe {
     llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> DiagID(new clang::DiagnosticIDs());
     clang::DiagnosticsEngine Diags(DiagID, &*DiagOpts, DiagClient);
 
+    llvm::StringRef srcString(source);
     // Create the compiler invocation
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    auto CI = std::make_shared<clang::CompilerInvocation>();
+    CI->getPreprocessorOpts().addRemappedFile("stringInput.cl",
+#else
     std::unique_ptr<clang::CompilerInvocation> CI(new clang::CompilerInvocation);
-    clang::CompilerInvocation::CreateFromArgs(*CI,
-                                              &args[0],
-                                              &args[0] + args.size(),
-                                              Diags);
-    llvm::StringRef srcString(source);
     (*CI).getPreprocessorOpts().addRemappedFile("stringInput.cl",
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#endif
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
                 llvm::MemoryBuffer::getMemBuffer(srcString)
 #else
                 llvm::MemoryBuffer::getMemBuffer(srcString).release()
 #endif
                 );
 
+    clang::CompilerInvocation::CreateFromArgs(*CI,
+                                              &args[0],
+                                              &args[0] + args.size(),
+                                              Diags);
     // Create the compiler instance
     clang::CompilerInstance Clang;
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    Clang.setInvocation(std::move(CI));
+#else
     Clang.setInvocation(CI.release());
+#endif
     // Get ready to report problems
     Clang.createDiagnostics(DiagClient, false);
 
@@ -777,7 +748,7 @@ namespace gbe {
     if (!retVal)
       return false;
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
     llvm::Module *module = Act->takeModule();
 #else
     llvm::Module *module = Act->takeModule().release();
@@ -786,16 +757,12 @@ namespace gbe {
     *out_module = module;
 
 // Dump the LLVM if requested.
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 6)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 36
     if (!dumpLLVMFileName.empty()) {
       std::string err;
       llvm::raw_fd_ostream ostream (dumpLLVMFileName.c_str(),
                                     err,
-      #if LLVM_VERSION_MINOR == 3
-                                    0
-      #else
                                     llvm::sys::fs::F_None
-      #endif
                                     );
 
       if (err.empty()) {
@@ -807,11 +774,7 @@ namespace gbe {
       std::string err;
       llvm::raw_fd_ostream ostream (dumpSPIRBinaryName.c_str(),
                                     err,
-      #if LLVM_VERSION_MINOR == 3
-                                    0
-      #else
                                     llvm::sys::fs::F_None
-      #endif
                                     );
       if (err.empty())
         llvm::WriteBitcodeToFile(*out_module, ostream);
@@ -1109,13 +1072,15 @@ EXTEND_QUOTE:
           fclose(asmDumpStream);
       }
 
-      p = gbe_program_new_from_llvm(deviceID, NULL, out_module, llvm_ctx,
+      p = gbe_program_new_from_llvm(deviceID, out_module, llvm_ctx,
                                     dumpASMFileName.empty() ? NULL : dumpASMFileName.c_str(),
                                     stringSize, err, errSize, optLevel, options);
       if (err != NULL)
         *errSize += clangErrSize;
       if (OCL_OUTPUT_BUILD_LOG && options)
-        llvm::errs() << options;
+        llvm::errs() << "options:" << options << "\n";
+      if (OCL_OUTPUT_BUILD_LOG && err && *errSize)
+        llvm::errs() << err << "\n";
     } else
       p = NULL;
 
@@ -1128,6 +1093,45 @@ EXTEND_QUOTE:
 
 #ifdef GBE_COMPILER_AVAILABLE
 
+  static gbe_program programNewFromLLVMFile(uint32_t deviceID,
+                                            const char *fileName,
+                                            size_t string_size,
+                                            char *err,
+                                            size_t *err_size)
+  {
+    gbe_program p = NULL;
+    if (fileName == NULL)
+      return NULL;
+
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+    llvm::LLVMContext *c = new llvm::LLVMContext;
+#else
+    llvm::LLVMContext *c = &llvm::getGlobalContext();
+#endif
+    // Get the module from its file
+    llvm::SMDiagnostic errDiag;
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
+    llvm::Module *module = parseIRFile(fileName, errDiag, *c).release();
+#else
+    llvm::Module *module = ParseIRFile(fileName, errDiag, *c);
+#endif
+
+    int optLevel = 1;
+
+    //module will be delete in programCleanLlvmResource
+    p = gbe_program_new_from_llvm(deviceID, module, c, NULL,
+                                  string_size, err, err_size, optLevel, NULL);
+    if (OCL_OUTPUT_BUILD_LOG && err && *err_size)
+      llvm::errs() << err << "\n";
+
+    return p;
+  }
+#endif
+
+
+
+#ifdef GBE_COMPILER_AVAILABLE
+
   static gbe_program programCompileFromSource(uint32_t deviceID,
                                           const char *source,
                                           const char *temp_header_path,
@@ -1148,11 +1152,10 @@ EXTEND_QUOTE:
 
     gbe_program p;
     acquireLLVMContextLock();
-    //FIXME: if use new allocated context to link two modules there would be context mismatch
-    //for some functions, so we use global context now, need switch to new context later.
+
     llvm::Module * out_module;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
-    llvm::LLVMContext* llvm_ctx = &GBEGetLLVMContext();
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+    llvm::LLVMContext* llvm_ctx = new llvm::LLVMContext;
 #else
     llvm::LLVMContext* llvm_ctx = &llvm::getGlobalContext();
 #endif
@@ -1169,7 +1172,9 @@ EXTEND_QUOTE:
       p = gbe_program_new_gen_program(deviceID, out_module, NULL, NULL);
 
       if (OCL_OUTPUT_BUILD_LOG && options)
-        llvm::errs() << options;
+        llvm::errs() << "options:" << options << "\n";
+      if (OCL_OUTPUT_BUILD_LOG && err && *errSize)
+        llvm::errs() << err << "\n";
     } else
       p = NULL;
     releaseLLVMContextLock();
@@ -1513,6 +1518,7 @@ void releaseLLVMContextLock()
 }
 
 GBE_EXPORT_SYMBOL gbe_program_new_from_source_cb *gbe_program_new_from_source = NULL;
+GBE_EXPORT_SYMBOL gbe_program_new_from_llvm_file_cb *gbe_program_new_from_llvm_file = NULL;
 GBE_EXPORT_SYMBOL gbe_program_compile_from_source_cb *gbe_program_compile_from_source = NULL;
 GBE_EXPORT_SYMBOL gbe_program_link_program_cb *gbe_program_link_program = NULL;
 GBE_EXPORT_SYMBOL gbe_program_check_opt_cb *gbe_program_check_opt = NULL;
@@ -1575,6 +1581,7 @@ namespace gbe
   {
     CallBackInitializer(void) {
       gbe_program_new_from_source = gbe::programNewFromSource;
+      gbe_program_new_from_llvm_file = gbe::programNewFromLLVMFile;
       gbe_program_compile_from_source = gbe::programCompileFromSource;
       gbe_program_link_program = gbe::programLinkProgram;
       gbe_program_check_opt = gbe::programCheckOption;
@@ -1625,7 +1632,7 @@ namespace gbe
     }
 
     ~CallBackInitializer() {
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR > 3)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 34
       llvm::llvm_shutdown();
 #endif
     }
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index e601c97..2017845 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -180,6 +180,15 @@ extern gbe_dup_printfset_cb *gbe_dup_printfset;
 typedef void (gbe_output_printf_cb) (void* printf_info, void* buf_addr);
 extern gbe_output_printf_cb* gbe_output_printf;
 
+
+/*! Create a new program from the llvm file (zero terminated string) */
+typedef gbe_program (gbe_program_new_from_llvm_file_cb)(uint32_t deviceID,
+                                                        const char *fileName,
+                                                        size_t stringSize,
+                                                        char *err,
+                                                        size_t *err_size);
+extern gbe_program_new_from_llvm_file_cb *gbe_program_new_from_llvm_file;
+
 /*! Create a new program from the given source code (zero terminated string) */
 typedef gbe_program (gbe_program_new_from_source_cb)(uint32_t deviceID,
                                                      const char *source,
@@ -231,7 +240,6 @@ extern gbe_program_serialize_to_binary_cb *gbe_program_serialize_to_binary;
 
 /*! Create a new program from the given LLVM file */
 typedef gbe_program (gbe_program_new_from_llvm_cb)(uint32_t deviceID,
-                                                   const char *fileName,
                                                    const void *module,
                                                    const void *llvm_ctx,
                                                    const char *asm_file_name,
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 1aff8b9..4a68e33 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -305,8 +305,8 @@ namespace gbe {
     }
     /*! Build a program from a ir::Unit */
     bool buildFromUnit(const ir::Unit &unit, std::string &error);
-    /*! Buils a program from a LLVM source code */
-    bool buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel);
+    /*! Buils a program from a LLVM Module */
+    bool buildFromLLVMModule(const void* module, std::string &error, int optLevel);
     /*! Buils a program from a OCL string */
     bool buildFromSource(const char *source, std::string &error);
     /*! Get size of the global constant arrays */
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 5fcb14a..64d9727 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -186,7 +186,7 @@ namespace ir {
 
 
       // only llvm-3.6 or later has kernel_arg_base_type in metadata.
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR <= 5)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
       bool isImage1dT() const {
         return typeName.compare("image1d_t") == 0;
       }
diff --git a/backend/src/ir/half.cpp b/backend/src/ir/half.cpp
index 1c0d7eb..0abc6cb 100644
--- a/backend/src/ir/half.cpp
+++ b/backend/src/ir/half.cpp
@@ -29,7 +29,11 @@ namespace ir {
   {
     uint64_t v64 = static_cast<uint64_t>(v);
     llvm::APInt apInt(16, v64, false);
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    return llvm::APFloat(llvm::APFloat::IEEEhalf(), apInt);
+#else
     return llvm::APFloat(llvm::APFloat::IEEEhalf, apInt);
+#endif
   }
 
   static uint16_t convAPFloatToU16(const llvm::APFloat& apf)
@@ -42,14 +46,22 @@ namespace ir {
   half::operator float(void) const {
     bool loseInfo;
     llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    apf_self.convert(llvm::APFloat::IEEEsingle(), llvm::APFloat::rmNearestTiesToEven, &loseInfo);
+#else
     apf_self.convert(llvm::APFloat::IEEEsingle, llvm::APFloat::rmNearestTiesToEven, &loseInfo);
+#endif
     return apf_self.convertToFloat();
   }
 
   half::operator double(void) const {
     bool loseInfo;
     llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    apf_self.convert(llvm::APFloat::IEEEdouble(), llvm::APFloat::rmNearestTiesToEven, &loseInfo);
+#else
     apf_self.convert(llvm::APFloat::IEEEdouble, llvm::APFloat::rmNearestTiesToEven, &loseInfo);
+#endif
     return apf_self.convertToDouble();
   }
 
@@ -70,7 +82,11 @@ namespace ir {
   }
 
   half half::convToHalf(uint16_t u16) {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    llvm::APFloat res(llvm::APFloat::IEEEhalf(), llvm::APInt(16, 0, false));
+#else
     llvm::APFloat res(llvm::APFloat::IEEEhalf, llvm::APInt(16, 0, false));
+#endif
     uint64_t u64 = static_cast<uint64_t>(u16);
     llvm::APInt apInt(16, u64, false);
     res.convertFromAPInt(apInt, false, llvm::APFloat::rmNearestTiesToEven);
@@ -78,7 +94,11 @@ namespace ir {
   }
 
   half half::convToHalf(int16_t v16) {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    llvm::APFloat res(llvm::APFloat::IEEEhalf(), llvm::APInt(16, 0, true));
+#else
     llvm::APFloat res(llvm::APFloat::IEEEhalf, llvm::APInt(16, 0, true));
+#endif
     uint64_t u64 = static_cast<uint64_t>(v16);
     llvm::APInt apInt(16, u64, true);
     res.convertFromAPInt(apInt, true, llvm::APFloat::rmNearestTiesToEven);
diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index 93bd96a..bcf5940 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -199,6 +199,7 @@ namespace ir {
     GBE_SAFE_DELETE(liveness);
     this->liveness = GBE_NEW(ir::Liveness, *fn);
     this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
+    bool needRefreshDag = false;
 
     // Process all structure arguments and find all the direct loads we can
     // replace
@@ -207,13 +208,27 @@ namespace ir {
     for (uint32_t argID = 0; argID < argNum; ++argID) {
       FunctionArgument &arg = fn->getArg(argID);
       if (arg.type != FunctionArgument::STRUCTURE) continue;
-      if(this->lower(argID) == ARG_INDIRECT_READ)
+      if(this->lower(argID) == ARG_INDIRECT_READ) {
         indirctReadArgs.push_back(argID);
+        //when the return value is ARG_INDIRECT_READ, there is still possible
+        //that some IRs read it directly, and will be handled in buildConstantPush()
+        //so we need to refresh the dag afer function buildConstantPush
+        for (const auto &loadAddImm : seq) {
+          if (loadAddImm.argID == argID)
+            needRefreshDag = true;
+        }
+      }
     }
 
     // Build the constant push description and remove the instruction that
     // therefore become useless
     this->buildConstantPush();
+    if (needRefreshDag) {
+      GBE_SAFE_DELETE(dag);
+      GBE_SAFE_DELETE(liveness);
+      this->liveness = GBE_NEW(ir::Liveness, *fn);
+      this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
+    }
     for (uint32_t i = 0; i < indirctReadArgs.size(); ++i){
       lowerIndirectRead(indirctReadArgs[i]);
     }
diff --git a/backend/src/ir/profiling.cpp b/backend/src/ir/profiling.cpp
index ac61e9b..3289e76 100644
--- a/backend/src/ir/profiling.cpp
+++ b/backend/src/ir/profiling.cpp
@@ -24,6 +24,7 @@
 #include <stdlib.h>
 #include "ir/profiling.hpp"
 #include "src/cl_device_data.h"
+#include <inttypes.h>
 
 namespace gbe
 {
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index 46d7be7..d7a2a67 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -32,8 +32,6 @@
 #include "sys/map.hpp"
 #include <string.h>
 
-#include "llvm/IR/Instructions.h"
-
 namespace gbe {
 namespace ir {
 
@@ -46,7 +44,7 @@ namespace ir {
   public:
     typedef map<std::string, Function*> FunctionSet;
     /*! Moved from printf pass */
-    map<llvm::CallInst*, PrintfSet::PrintfFmt*> printfs;
+    map<void *, PrintfSet::PrintfFmt*> printfs;
     vector<std::string> blockFuncs;
     /*! Create an empty unit */
     Unit(PointerSize pointerSize = POINTER_32_BITS);
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index c68ecb0..2917e6d 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -211,7 +211,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
 	)
 ENDMACRO(ADD_LL_TO_BC_TARGET)
 
-SET (OCL_LL_MODULES_12 ocl_barrier ocl_clz ocl_ctz)
+SET (OCL_LL_MODULES_12 ocl_barrier ocl_clz ocl_ctz ocl_sampler)
 FOREACH(f ${OCL_LL_MODULES_12})
     COPY_THE_LL(${f})
     ADD_LL_TO_BC_TARGET(${f})
@@ -255,7 +255,7 @@ if (ENABLE_OPENCL_20)
     ADD_CL_TO_BC_TARGET(${f} ${bc_name} "${CLANG_OCL_FLAGS_20}")
   ENDFOREACH(f)
 
-  SET (OCL_LL_MODULES_20 ocl_barrier_20 ocl_clz_20 ocl_ctz_20 ocl_atomic_20)
+  SET (OCL_LL_MODULES_20 ocl_barrier_20 ocl_clz_20 ocl_ctz_20 ocl_atomic_20 ocl_sampler_20)
   FOREACH(f ${OCL_LL_MODULES_20})
     COPY_THE_LL(${f})
     ADD_LL_TO_BC_TARGET(${f})
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index 2548cb7..5408048 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -124,6 +124,10 @@
 #define cl_intel_subgroups
 #define cl_intel_subgroups_short
 
+#if __clang_major__*10 + __clang_minor__ > 40
+#define cl_intel_required_subgroup_size
+#endif
+
 #pragma OPENCL EXTENSION cl_khr_fp64 : disable
 #pragma OPENCL EXTENSION cl_khr_fp16 : disable
 #endif
diff --git a/backend/src/libocl/include/ocl_enqueue.h b/backend/src/libocl/include/ocl_enqueue.h
index 6479df7..8e9fd32 100644
--- a/backend/src/libocl/include/ocl_enqueue.h
+++ b/backend/src/libocl/include/ocl_enqueue.h
@@ -38,19 +38,35 @@ struct Block_literal {
   void *isa; // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
   int flags;
   int reserved;
-  __global void (*invoke)(void *, ...);
+  __global void* invoke;
   struct Block_descriptor_1 {
     unsigned long int reserved;         // NULL
     unsigned long int size;         // sizeof(struct Block_literal_1)
     // optional helper functions
-    void (*copy_helper)(void *dst, void *src);     // IFF (1<<25)
-    void (*dispose_helper)(void *src);             // IFF (1<<25)
+    void *copy_helper;                // IFF (1<<25)
+    void *dispose_helper;             // IFF (1<<25)
     // required ABI.2010.3.16
     const char *signature;                         // IFF (1<<30)
   } *descriptor;
   // imported variables
 };
 
+#if __clang_major__*10 + __clang_minor__ >= 50
+typedef struct ndrange_info_t ndrange_t;
+#endif
+
+#if __clang_major__*10 + __clang_minor__ >= 50
+#define BLOCK_TYPE void*
+#else
+#define BLOCK_TYPE __private void*
+#endif
+
+#if __clang_major__*10 + __clang_minor__ >= 40
+#define EVENT_TYPE clk_event_t*
+#else
+#define EVENT_TYPE __private clk_event_t*
+#endif
+
 clk_event_t create_user_event(void);
 void retain_event(clk_event_t event);
 void release_event(clk_event_t event);
@@ -58,21 +74,16 @@ void set_user_event_status(clk_event_t event, int status);
 bool is_valid_event(clk_event_t event);
 void capture_event_profiling_info(clk_event_t event, int name, global void *value);
 
-uint __get_kernel_work_group_size_impl(__private void *block);
-uint __get_kernel_preferred_work_group_multiple_impl(__private void *block);
+uint __get_kernel_work_group_size_impl(BLOCK_TYPE block);
+uint __get_kernel_preferred_work_group_multiple_impl(BLOCK_TYPE block);
 
-OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void));
-OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
-                                uint num_events_in_wait_list, const clk_event_t *event_wait_list,
-                                clk_event_t *event_ret, void (^block)(void));
-OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, __private void *block, uint size0, ...);
-OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
-                                uint num_events_in_wait_list, const clk_event_t *event_wait_list,
-                                clk_event_t *event_ret,  __private void *block, uint size0, ...);
+int __enqueue_kernel_basic(queue_t q, int flag, ndrange_t ndrange, BLOCK_TYPE block);
+int __enqueue_kernel_basic_events(queue_t q, int flag, ndrange_t ndrange,
+                                  uint num_events_in_wait_list, const EVENT_TYPE event_wait_list,
+                                  EVENT_TYPE event_ret, BLOCK_TYPE block);
 
 queue_t get_default_queue(void);
-int __gen_enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void), int size);
-int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private void * block, int count, __private int* slm_sizes);
+int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, BLOCK_TYPE block, int count, __private int* slm_sizes);
 
 OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size);
 OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_size);
diff --git a/backend/src/libocl/include/ocl_image.h b/backend/src/libocl/include/ocl_image.h
index 5a679aa..f816e1a 100644
--- a/backend/src/libocl/include/ocl_image.h
+++ b/backend/src/libocl/include/ocl_image.h
@@ -23,6 +23,8 @@
 #define int1 int
 #define float1 float
 
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+
 #define DECL_IMAGE_READ_SAMPLE_RETTYPE(IMG_TYPE, DATA_YPE, SUFFIX, N) \
   OVERLOADABLE DATA_YPE read_image ## SUFFIX(IMG_TYPE cl_image, const sampler_t sampler, int##N coord); \
   OVERLOADABLE DATA_YPE read_image ## SUFFIX(IMG_TYPE cl_image, const sampler_t sampler, float##N coord);
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index 2c0d700..a15cfe4 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -48,6 +48,7 @@ DEF(char)
 DEF(uchar)
 DEF(short)
 DEF(ushort)
+DEF(half)
 DEF(int)
 DEF(uint)
 DEF(float)
@@ -112,6 +113,7 @@ DEF(char)
 DEF(uchar)
 DEF(short)
 DEF(ushort)
+DEF(half)
 DEF(int)
 DEF(uint)
 DEF(float)
diff --git a/backend/src/libocl/src/ocl_enqueue.cl b/backend/src/libocl/src/ocl_enqueue.cl
index dc8fa3b..1ae43aa 100644
--- a/backend/src/libocl/src/ocl_enqueue.cl
+++ b/backend/src/libocl/src/ocl_enqueue.cl
@@ -30,7 +30,7 @@ ndrange_t __gen_ocl_set_ndrange_info(__private struct ndrange_info_t *info);
 __private struct ndrange_info_t* __gen_ocl_get_ndrange_info(ndrange_t info);
 __global int* __gen_ocl_get_enqueue_info_addr(void);
 
-OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void))
+int __enqueue_kernel_basic(queue_t q, int flag, ndrange_t ndrange, BLOCK_TYPE block)
 {
   int i;
   __private struct Block_literal *literal = (__private struct Block_literal *)block;
@@ -40,8 +40,11 @@ OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^b
   __global int* start_addr = __gen_ocl_get_enqueue_info_addr();
   int offset = atomic_add(start_addr, size + sizeof(struct ndrange_info_t));
   __global uchar* addr = (__global uchar*)start_addr + offset + sizeof(int);
+#if __clang_major__*10 + __clang_minor__ >= 50
+  __private struct ndrange_info_t *info = to_private(&ndrange);
+#else
   __private struct ndrange_info_t *info = __gen_ocl_get_ndrange_info(ndrange);
-
+#endif
   *((__global struct ndrange_info_t *)addr) = *info;
   addr += sizeof(*info);
 
@@ -51,14 +54,14 @@ OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^b
   return 0;
 }
 
-OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
-                                uint num_events_in_wait_list, const clk_event_t *event_wait_list,
-                                clk_event_t *event_ret, void (^block)(void))
+int __enqueue_kernel_basic_events(queue_t q, int flag, ndrange_t ndrange,
+                                uint num_events_in_wait_list, const EVENT_TYPE event_wait_list,
+                                EVENT_TYPE event_ret, BLOCK_TYPE block)
 {
-  return enqueue_kernel(q, flag, ndrange, block);
+  return __enqueue_kernel_basic(q, flag, ndrange, block);
 }
 
-int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private void * block, int count, __private int* slm_sizes)
+int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, BLOCK_TYPE block, int count, __private int* slm_sizes)
 {
   int i;
   __private struct Block_literal* literal = (__private struct Block_literal *)block;
@@ -69,7 +72,11 @@ int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private v
   __global int* start_addr = __gen_ocl_get_enqueue_info_addr();
   int offset = atomic_add(start_addr, size + sizeof(struct ndrange_info_t) + slm_size);
   __global uchar* addr = (__global uchar*)start_addr + offset + sizeof(int);
+#if __clang_major__*10 + __clang_minor__ >= 50
+  __private struct ndrange_info_t *info = to_private(&ndrange);
+#else
   __private struct ndrange_info_t *info = __gen_ocl_get_ndrange_info(ndrange);
+#endif
 
   *((__global struct ndrange_info_t *)addr) = *info;
   addr += sizeof(*info);
@@ -111,12 +118,12 @@ bool is_valid_event(clk_event_t event)
   return 1;
 }
 
-uint __get_kernel_work_group_size_impl(__private void *block)
+uint __get_kernel_work_group_size_impl(BLOCK_TYPE block)
 {
   return 256;
 }
 
-uint __get_kernel_preferred_work_group_multiple_impl(__private  void *block)
+uint __get_kernel_preferred_work_group_multiple_impl(BLOCK_TYPE block)
 {
   return 16;
 }
@@ -127,13 +134,19 @@ void capture_event_profiling_info(clk_event_t event, int name, global void *valu
   ((__global ulong *)value)[0] = 0x3000;
   ((__global ulong *)value)[1] = 0x6000;
 }
+
+#if __clang_major__*10 + __clang_minor__ >= 50
+#define RET_INFO  return info;
+#else
+#define RET_INFO  return __gen_ocl_set_ndrange_info(&info);
+#endif
+
 OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size)
 {
   struct ndrange_info_t info;
   info.type = 0x1;
   info.global_work_size[0] = global_work_size;
-  return __gen_ocl_set_ndrange_info(&info);
-  //return ndrange;
+  RET_INFO;
 }
 
 OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_size)
@@ -142,8 +155,7 @@ OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_siz
   info.type = 0x2;
   info.global_work_size[0] = global_work_size;
   info.local_work_size[0] = local_work_size;
-  return __gen_ocl_set_ndrange_info(&info);
- // return ndrange;
+  RET_INFO;
 }
 
 
@@ -154,8 +166,7 @@ OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_offset, size_t global_work_
   info.global_work_size[0] = global_work_size;
   info.local_work_size[0] = local_work_size;
   info.global_work_offset[0] = global_work_offset;
-  return __gen_ocl_set_ndrange_info(&info);
-  //return ndrange;
+  RET_INFO;
 }
 
 OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2])
@@ -164,8 +175,7 @@ OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2])
   info.type = 0x11;
   info.global_work_size[0] = global_work_size[0];
   info.global_work_size[1] = global_work_size[1];
-  return __gen_ocl_set_ndrange_info(&info);
-  //return ndrange;
+  RET_INFO;
 }
 
 OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2], const size_t local_work_size[2])
@@ -176,7 +186,7 @@ OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2], const size_t
   info.global_work_size[1] = global_work_size[1];
   info.local_work_size[0] = local_work_size[0];
   info.local_work_size[1] = local_work_size[1];
-  return __gen_ocl_set_ndrange_info(&info);
+  RET_INFO;
 }
 
 
@@ -190,7 +200,7 @@ OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_offset[2], const size
   info.local_work_size[1] = local_work_size[1];
   info.global_work_offset[0] = global_work_offset[0];
   info.global_work_offset[1] = global_work_offset[1];
-  return __gen_ocl_set_ndrange_info(&info);
+  RET_INFO;
 }
 
 OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3])
@@ -200,7 +210,7 @@ OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3])
   info.global_work_size[0] = global_work_size[0];
   info.global_work_size[1] = global_work_size[1];
   info.global_work_size[2] = global_work_size[2];
-  return __gen_ocl_set_ndrange_info(&info);
+  RET_INFO;
 }
 
 OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3], const size_t local_work_size[3])
@@ -213,7 +223,7 @@ OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3], const size_t
   info.local_work_size[0] = local_work_size[0];
   info.local_work_size[1] = local_work_size[1];
   info.local_work_size[2] = local_work_size[2];
-  return __gen_ocl_set_ndrange_info(&info);
+  RET_INFO;
 }
 
 OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_offset[3], const size_t global_work_size[3], const size_t local_work_size[3])
@@ -229,7 +239,7 @@ OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_offset[3], const size
   info.global_work_offset[0] = global_work_offset[0];
   info.global_work_offset[1] = global_work_offset[1];
   info.global_work_offset[2] = global_work_offset[2];
-  return __gen_ocl_set_ndrange_info(&info);
+  RET_INFO;
 }
 
 int enqueue_marker (queue_t queue, uint num_events_in_wait_list, const clk_event_t *event_wait_list, clk_event_t *event_ret)
diff --git a/backend/src/libocl/src/ocl_memcpy.cl b/backend/src/libocl/src/ocl_memcpy.cl
index 131574d..8c0409a 100644
--- a/backend/src/libocl/src/ocl_memcpy.cl
+++ b/backend/src/libocl/src/ocl_memcpy.cl
@@ -16,12 +16,13 @@
  *
  */
 #include "ocl_memcpy.h"
+typedef int __attribute__((may_alias)) AI;
 
 #define DECL_TWO_SPACE_MEMCOPY_FN(NAME, DST_SPACE, SRC_SPACE) \
 void __gen_memcpy_ ##NAME## _align (DST_SPACE uchar* dst, SRC_SPACE uchar* src, size_t size) { \
   size_t index = 0; \
   while((index + 4) <= size) { \
-    *((DST_SPACE uint *)(dst + index)) = *((SRC_SPACE uint *)(src + index)); \
+    *((DST_SPACE AI *)(dst + index)) = *((SRC_SPACE AI *)(src + index)); \
     index += 4; \
   } \
   while(index < size) { \
diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl
index 3b2eb92..9a99df9 100644
--- a/backend/src/libocl/src/ocl_misc.cl
+++ b/backend/src/libocl/src/ocl_misc.cl
@@ -87,6 +87,7 @@ DEF(char)
 DEF(uchar)
 DEF(short)
 DEF(ushort)
+DEF(half)
 DEF(int)
 DEF(uint)
 DEF(float)
@@ -107,8 +108,8 @@ DEF(ulong)
 #define DEC2X(TYPE, MASKTYPE) \
   OVERLOADABLE TYPE##2 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##2 mask) { \
     TYPE##2 z; \
-    z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
-    z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+    z.s0 = (mask.s0 & 31) < 16 ? ((TYPE *)&x)[mask.s0 & 31] : ((TYPE *)&y)[mask.s0 & 15]; \
+    z.s1 = (mask.s1 & 31) < 16 ? ((TYPE *)&x)[mask.s1 & 31] : ((TYPE *)&y)[mask.s1 & 15]; \
     return z; \
   }
 
@@ -120,10 +121,10 @@ DEF(ulong)
 #define DEC4X(TYPE, MASKTYPE) \
   OVERLOADABLE TYPE##4 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##4 mask) { \
     TYPE##4 z; \
-    z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
-    z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
-    z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
-    z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+    z.s0 = (mask.s0 & 31) < 16 ? ((TYPE *)&x)[mask.s0 & 31] : ((TYPE *)&y)[mask.s0 & 15]; \
+    z.s1 = (mask.s1 & 31) < 16 ? ((TYPE *)&x)[mask.s1 & 31] : ((TYPE *)&y)[mask.s1 & 15]; \
+    z.s2 = (mask.s2 & 31) < 16 ? ((TYPE *)&x)[mask.s2 & 31] : ((TYPE *)&y)[mask.s2 & 15]; \
+    z.s3 = (mask.s3 & 31) < 16 ? ((TYPE *)&x)[mask.s3 & 31] : ((TYPE *)&y)[mask.s3 & 15]; \
     return z; \
   }
 
@@ -135,14 +136,14 @@ DEF(ulong)
 #define DEC8X(TYPE, MASKTYPE) \
   OVERLOADABLE TYPE##8 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##8 mask) { \
     TYPE##8 z; \
-    z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
-    z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
-    z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
-    z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
-    z.s4 = mask.s4 < 16 ? ((TYPE *)&x)[mask.s4] : ((TYPE *)&y)[mask.s4 & 15]; \
-    z.s5 = mask.s5 < 16 ? ((TYPE *)&x)[mask.s5] : ((TYPE *)&y)[mask.s5 & 15]; \
-    z.s6 = mask.s6 < 16 ? ((TYPE *)&x)[mask.s6] : ((TYPE *)&y)[mask.s6 & 15]; \
-    z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
+    z.s0 = (mask.s0 & 31) < 16 ? ((TYPE *)&x)[mask.s0 & 31] : ((TYPE *)&y)[mask.s0 & 15]; \
+    z.s1 = (mask.s1 & 31) < 16 ? ((TYPE *)&x)[mask.s1 & 31] : ((TYPE *)&y)[mask.s1 & 15]; \
+    z.s2 = (mask.s2 & 31) < 16 ? ((TYPE *)&x)[mask.s2 & 31] : ((TYPE *)&y)[mask.s2 & 15]; \
+    z.s3 = (mask.s3 & 31) < 16 ? ((TYPE *)&x)[mask.s3 & 31] : ((TYPE *)&y)[mask.s3 & 15]; \
+    z.s4 = (mask.s4 & 31) < 16 ? ((TYPE *)&x)[mask.s4 & 31] : ((TYPE *)&y)[mask.s4 & 15]; \
+    z.s5 = (mask.s5 & 31) < 16 ? ((TYPE *)&x)[mask.s5 & 31] : ((TYPE *)&y)[mask.s5 & 15]; \
+    z.s6 = (mask.s6 & 31) < 16 ? ((TYPE *)&x)[mask.s6 & 31] : ((TYPE *)&y)[mask.s6 & 15]; \
+    z.s7 = (mask.s7 & 31) < 16 ? ((TYPE *)&x)[mask.s7 & 31] : ((TYPE *)&y)[mask.s7 & 15]; \
     return z; \
   }
 
@@ -154,22 +155,22 @@ DEF(ulong)
 #define DEC16X(TYPE, MASKTYPE) \
   OVERLOADABLE TYPE##16 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##16 mask) { \
     TYPE##16 z; \
-    z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
-    z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
-    z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
-    z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
-    z.s4 = mask.s4 < 16 ? ((TYPE *)&x)[mask.s4] : ((TYPE *)&y)[mask.s4 & 15]; \
-    z.s5 = mask.s5 < 16 ? ((TYPE *)&x)[mask.s5] : ((TYPE *)&y)[mask.s5 & 15]; \
-    z.s6 = mask.s6 < 16 ? ((TYPE *)&x)[mask.s6] : ((TYPE *)&y)[mask.s6 & 15]; \
-    z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
-    z.s8 = mask.s8 < 16 ? ((TYPE *)&x)[mask.s8] : ((TYPE *)&y)[mask.s8 & 15]; \
-    z.s9 = mask.s9 < 16 ? ((TYPE *)&x)[mask.s9] : ((TYPE *)&y)[mask.s9 & 15]; \
-    z.sA = mask.sA < 16 ? ((TYPE *)&x)[mask.sA] : ((TYPE *)&y)[mask.sA & 15]; \
-    z.sB = mask.sB < 16 ? ((TYPE *)&x)[mask.sB] : ((TYPE *)&y)[mask.sB & 15]; \
-    z.sC = mask.sC < 16 ? ((TYPE *)&x)[mask.sC] : ((TYPE *)&y)[mask.sC & 15]; \
-    z.sD = mask.sD < 16 ? ((TYPE *)&x)[mask.sD] : ((TYPE *)&y)[mask.sD & 15]; \
-    z.sE = mask.sE < 16 ? ((TYPE *)&x)[mask.sE] : ((TYPE *)&y)[mask.sE & 15]; \
-    z.sF = mask.sF < 16 ? ((TYPE *)&x)[mask.sF] : ((TYPE *)&y)[mask.sF & 15]; \
+    z.s0 = (mask.s0 & 31) < 16 ? ((TYPE *)&x)[mask.s0 & 31] : ((TYPE *)&y)[mask.s0 & 15]; \
+    z.s1 = (mask.s1 & 31) < 16 ? ((TYPE *)&x)[mask.s1 & 31] : ((TYPE *)&y)[mask.s1 & 15]; \
+    z.s2 = (mask.s2 & 31) < 16 ? ((TYPE *)&x)[mask.s2 & 31] : ((TYPE *)&y)[mask.s2 & 15]; \
+    z.s3 = (mask.s3 & 31) < 16 ? ((TYPE *)&x)[mask.s3 & 31] : ((TYPE *)&y)[mask.s3 & 15]; \
+    z.s4 = (mask.s4 & 31) < 16 ? ((TYPE *)&x)[mask.s4 & 31] : ((TYPE *)&y)[mask.s4 & 15]; \
+    z.s5 = (mask.s5 & 31) < 16 ? ((TYPE *)&x)[mask.s5 & 31] : ((TYPE *)&y)[mask.s5 & 15]; \
+    z.s6 = (mask.s6 & 31) < 16 ? ((TYPE *)&x)[mask.s6 & 31] : ((TYPE *)&y)[mask.s6 & 15]; \
+    z.s7 = (mask.s7 & 31) < 16 ? ((TYPE *)&x)[mask.s7 & 31] : ((TYPE *)&y)[mask.s7 & 15]; \
+    z.s8 = (mask.s8 & 31) < 16 ? ((TYPE *)&x)[mask.s8 & 31] : ((TYPE *)&y)[mask.s8 & 15]; \
+    z.s9 = (mask.s9 & 31) < 16 ? ((TYPE *)&x)[mask.s9 & 31] : ((TYPE *)&y)[mask.s9 & 15]; \
+    z.sA = (mask.sA & 31) < 16 ? ((TYPE *)&x)[mask.sA & 31] : ((TYPE *)&y)[mask.sA & 15]; \
+    z.sB = (mask.sB & 31) < 16 ? ((TYPE *)&x)[mask.sB & 31] : ((TYPE *)&y)[mask.sB & 15]; \
+    z.sC = (mask.sC & 31) < 16 ? ((TYPE *)&x)[mask.sC & 31] : ((TYPE *)&y)[mask.sC & 15]; \
+    z.sD = (mask.sD & 31) < 16 ? ((TYPE *)&x)[mask.sD & 31] : ((TYPE *)&y)[mask.sD & 15]; \
+    z.sE = (mask.sE & 31) < 16 ? ((TYPE *)&x)[mask.sE & 31] : ((TYPE *)&y)[mask.sE & 15]; \
+    z.sF = (mask.sF & 31) < 16 ? ((TYPE *)&x)[mask.sF & 31] : ((TYPE *)&y)[mask.sF & 15]; \
     return z; \
   }
 
@@ -201,6 +202,7 @@ DEF(char)
 DEF(uchar)
 DEF(short)
 DEF(ushort)
+DEF(half)
 DEF(int)
 DEF(uint)
 DEF(float)
diff --git a/backend/src/libocl/src/ocl_sampler.ll b/backend/src/libocl/src/ocl_sampler.ll
new file mode 100644
index 0000000..6d39fdb
--- /dev/null
+++ b/backend/src/libocl/src/ocl_sampler.ll
@@ -0,0 +1,10 @@
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir"
+%opencl.sampler_t = type opaque
+
+declare %opencl.sampler_t addrspace(2)*@__gen_ocl_int_to_sampler(i32)
+
+define %opencl.sampler_t addrspace(2)*@__translate_sampler_initializer(i32 %s) {
+  %call = call %opencl.sampler_t addrspace(2)*@__gen_ocl_int_to_sampler(i32 %s)
+  ret %opencl.sampler_t addrspace(2)* %call
+}
diff --git a/backend/src/libocl/src/ocl_sampler_20.ll b/backend/src/libocl/src/ocl_sampler_20.ll
new file mode 100644
index 0000000..bea6d75
--- /dev/null
+++ b/backend/src/libocl/src/ocl_sampler_20.ll
@@ -0,0 +1,10 @@
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+%opencl.sampler_t = type opaque
+
+declare %opencl.sampler_t addrspace(2)*@__gen_ocl_int_to_sampler(i32)
+
+define %opencl.sampler_t addrspace(2)*@__translate_sampler_initializer(i32 %s) {
+  %call = call %opencl.sampler_t addrspace(2)*@__gen_ocl_int_to_sampler(i32 %s)
+  ret %opencl.sampler_t addrspace(2)* %call
+}
diff --git a/backend/src/llvm/ExpandLargeIntegers.cpp b/backend/src/llvm/ExpandLargeIntegers.cpp
index 60740f5..8515dc1 100644
--- a/backend/src/llvm/ExpandLargeIntegers.cpp
+++ b/backend/src/llvm/ExpandLargeIntegers.cpp
@@ -93,7 +93,7 @@
 
 using namespace llvm;
 
-#if LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
 #define DEBUG_TYPE "nacl-expand-ints"
 #endif
 
@@ -766,7 +766,7 @@ static void convertInstruction(Instruction *Inst, ConversionState &State,
 bool ExpandLargeIntegers::runOnFunction(Function &F) {
   // Don't support changing the function arguments. Illegal function arguments
   // should not be generated by clang.
-#if LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
   for (const Argument &Arg : F.args())
 #else
   for (const Argument &Arg : F.getArgumentList())
@@ -789,7 +789,7 @@ bool ExpandLargeIntegers::runOnFunction(Function &F) {
       // Only attempt to convert an instruction if its result or any of its
       // operands are illegal.
       bool ShouldConvert = shouldConvert(&I);
-#if LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
       for (Value *Op : I.operands())
         ShouldConvert |= shouldConvert(Op);
 #else
diff --git a/backend/src/llvm/ExpandUtils.cpp b/backend/src/llvm/ExpandUtils.cpp
index a09d990..cb1736b 100644
--- a/backend/src/llvm/ExpandUtils.cpp
+++ b/backend/src/llvm/ExpandUtils.cpp
@@ -101,7 +101,11 @@ namespace llvm {
   Function *RecreateFunction(Function *Func, FunctionType *NewType) {
     Function *NewFunc = Function::Create(NewType, Func->getLinkage());
     NewFunc->copyAttributesFrom(Func);
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    Func->getParent()->getFunctionList().insert(Func->getIterator(), NewFunc);
+#else
     Func->getParent()->getFunctionList().insert(ilist_iterator<Function>(Func), NewFunc);
+#endif
     NewFunc->takeName(Func);
     NewFunc->getBasicBlockList().splice(NewFunc->begin(),
                                         Func->getBasicBlockList());
diff --git a/backend/src/llvm/PromoteIntegers.cpp b/backend/src/llvm/PromoteIntegers.cpp
index a500311..d433771 100644
--- a/backend/src/llvm/PromoteIntegers.cpp
+++ b/backend/src/llvm/PromoteIntegers.cpp
@@ -605,8 +605,13 @@ static void convertInstruction(Instruction *Inst, ConversionState &State) {
     for (SwitchInst::CaseIt I = Switch->case_begin(),
              E = Switch->case_end();
          I != E; ++I) {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 50
+      NewInst->addCase(cast<ConstantInt>(convertConstant(I->getCaseValue())),
+                       I->getCaseSuccessor());
+#else
       NewInst->addCase(cast<ConstantInt>(convertConstant(I.getCaseValue())),
                        I.getCaseSuccessor());
+#endif
     }
     Switch->eraseFromParent();
   } else {
diff --git a/backend/src/llvm/StripAttributes.cpp b/backend/src/llvm/StripAttributes.cpp
index 9d07c29..99e5a71 100644
--- a/backend/src/llvm/StripAttributes.cpp
+++ b/backend/src/llvm/StripAttributes.cpp
@@ -79,10 +79,13 @@ namespace {
   class StripAttributes : public FunctionPass {
   public:
     static char ID; // Pass identification, replacement for typeid
-    StripAttributes() : FunctionPass(ID) {
+    StripAttributes(bool lastTime) : FunctionPass(ID),
+                                     lastTime(lastTime) {
     }
 
     virtual bool runOnFunction(Function &Func);
+  private:
+    bool lastTime; //last time all StripAttributes
   };
 }
 
@@ -93,7 +96,11 @@ bool StripAttributes::runOnFunction(Function &Func) {
   Func.setLinkage(GlobalValue::ExternalLinkage);
   if (!gbe::isKernelFunction(Func)) {
     Func.addFnAttr(Attribute::AlwaysInline);
-    Func.setLinkage(GlobalValue::LinkOnceAnyLinkage);
+    if (lastTime ||
+        (Func.getName().find("__gen_mem") == std::string::npos))
+      // Memcpy and memset functions could be deleted at last inline.
+      // Delete memcpy and memset functions for output llvm ir friendly.
+      Func.setLinkage(GlobalValue::LinkOnceAnyLinkage);
   }
 
   for (Function::iterator BB = Func.begin(), E = Func.end();
@@ -109,6 +116,6 @@ bool StripAttributes::runOnFunction(Function &Func) {
   return true;
 }
 
-FunctionPass *llvm::createStripAttributesPass() {
-  return new StripAttributes();
+FunctionPass *llvm::createStripAttributesPass(bool lastTime) {
+  return new StripAttributes(lastTime);
 }
diff --git a/backend/src/llvm/llvm_barrier_nodup.cpp b/backend/src/llvm/llvm_barrier_nodup.cpp
index 727e6bd..b8ffdf4 100644
--- a/backend/src/llvm/llvm_barrier_nodup.cpp
+++ b/backend/src/llvm/llvm_barrier_nodup.cpp
@@ -48,7 +48,12 @@ namespace gbe {
 
       }
 
-      virtual const char *getPassName() const {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+      virtual StringRef getPassName() const
+#else
+      virtual const char *getPassName() const
+#endif
+      {
         return "SPIR backend: set barrier no duplicate attr";
       }
 
@@ -69,7 +74,11 @@ namespace gbe {
               if (F.hasFnAttribute(Attribute::NoDuplicate)) {
                 auto attrs = F.getAttributes();
                 F.setAttributes(attrs.removeAttribute(M.getContext(),
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 50
+                                AttributeList::FunctionIndex,
+#else
                                 AttributeSet::FunctionIndex,
+#endif
                                 Attribute::NoDuplicate));
                 changed = true;
               }
diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 89d5e7c..ef56e4c 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -60,7 +60,7 @@ namespace gbe
       return NULL;
     }
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
     oclLib = getLazyIRFileModule(FilePath, Err, ctx);
 #else
     oclLib = getLazyIRFileModule(FilePath, Err, ctx).release();
@@ -117,17 +117,28 @@ namespace gbe
 
         std::string ErrInfo;// = "Not Materializable";
         if (!fromSrc && newMF->isMaterializable()) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
-          if (newMF->Materialize(&ErrInfo)) {
-            printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), ErrInfo.c_str());
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+          if (llvm::Error EC = newMF->materialize()) {
+            std::string Msg;
+            handleAllErrors(std::move(EC), [&](ErrorInfoBase &EIB) {
+              Msg = EIB.message();
+            });
+            printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), Msg.c_str());
             return false;
           }
-#else
+          Gvs.push_back((GlobalValue *)newMF);
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
           if (std::error_code EC = newMF->materialize()) {
             printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), EC.message().c_str());
             return false;
           }
           Gvs.push_back((GlobalValue *)newMF);
+#else
+         if (newMF->Materialize(&ErrInfo)) {
+            printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), ErrInfo.c_str());
+            return false;
+          }
+
 #endif
         }
         if (!materializedFuncCall(src, lib, *newMF, MFS, Gvs))
@@ -250,21 +261,30 @@ namespace gbe
       }
       std::string ErrInfo;// = "Not Materializable";
       if (newMF->isMaterializable()) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
-        if (newMF->Materialize(&ErrInfo)) {
-          printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), ErrInfo.c_str());
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+        if (llvm::Error EC = newMF->materialize()) {
+          std::string Msg;
+          handleAllErrors(std::move(EC), [&](ErrorInfoBase &EIB) {
+            Msg = EIB.message();
+          });
+          printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), Msg.c_str());
           delete clonedLib;
           return NULL;
         }
-      }
-#else
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
         if (std::error_code EC = newMF->materialize()) {
           printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), EC.message().c_str());
           delete clonedLib;
           return NULL;
         }
-      }
+#else
+        if (newMF->Materialize(&ErrInfo)) {
+          printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), ErrInfo.c_str());
+          delete clonedLib;
+          return NULL;
+        }
 #endif
+      }
 
       if (!materializedFuncCall(*mod, *clonedLib, *newMF, materializedFuncs, Gvs)) {
         delete clonedLib;
@@ -287,12 +307,17 @@ namespace gbe
    * pass to extract the functions and values in Gvs from the library module.
    * After extract what we need and remove what we do not need, we use 
    * materializeAll to mark the module as materialized. */
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
     /* Get all GlobalValue from module. */
     Module::GlobalListType &GVlist = clonedLib->getGlobalList();
     for(Module::global_iterator GVitr = GVlist.begin();GVitr != GVlist.end();++GVitr) {
       GlobalValue * GV = &*GVitr;
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+      ExitOnError ExitOnErr("Can not materialize the clonedLib: ");
+      ExitOnErr(clonedLib->materialize(GV));
+#else
       clonedLib->materialize(GV);
+#endif
       Gvs.push_back(GV);
     }
     llvm::legacy::PassManager Extract;
@@ -300,8 +325,13 @@ namespace gbe
     Extract.add(createGVExtractionPass(Gvs, false));
     Extract.run(*clonedLib);
     /* Mark the library module as materialized for later use. */
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    ExitOnError ExitOnErr("Can not materialize the clonedLib: ");
+    ExitOnErr(clonedLib->materializeAll());
+#else
     clonedLib->materializeAll();
 #endif
+#endif
 
     /* the SPIR binary datalayout maybe different with beignet's bitcode */
     if(clonedLib->getDataLayout() != mod->getDataLayout())
@@ -309,23 +339,24 @@ namespace gbe
 
     /* We use beignet's bitcode as dst because it will have a lot of
        lazy functions which will not be loaded. */
-    char* errorMsg;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
-    if(LLVMLinkModules2(wrap(clonedLib), wrap(mod))) {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+    llvm::Module * linked_module = llvm::CloneModule((llvm::Module*)mod).release();
+    if(LLVMLinkModules2(wrap(clonedLib), wrap(linked_module))) {
 #else
+    char* errorMsg;
     if(LLVMLinkModules(wrap(clonedLib), wrap(mod), LLVMLinkerDestroySource, &errorMsg)) {
+      printf("Fatal Error: link the bitcode error:\n%s\n", errorMsg);
 #endif
       delete clonedLib;
-      printf("Fatal Error: link the bitcode error:\n%s\n", errorMsg);
       return NULL;
     }
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
     llvm::legacy::PassManager passes;
 #else
     llvm::PassManager passes;
 #endif
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
     auto PreserveKernel = [=](const GlobalValue &GV) {
       for(size_t i = 0;i < kernels.size(); ++i)
         if(strcmp(GV.getName().data(), kernels[i]))
diff --git a/backend/src/llvm/llvm_device_enqueue.cpp b/backend/src/llvm/llvm_device_enqueue.cpp
index ee236de..58aa681 100644
--- a/backend/src/llvm/llvm_device_enqueue.cpp
+++ b/backend/src/llvm/llvm_device_enqueue.cpp
@@ -29,6 +29,7 @@ namespace gbe {
     BitCastInst* bt = dyn_cast<BitCastInst>(I);
     if (bt == NULL)
       return NULL;
+//bt->dump();
 
     Type* type = bt->getOperand(0)->getType();
     if(!type->isPointerTy())
@@ -62,7 +63,7 @@ namespace gbe {
       for (Value::use_iterator iter = v->use_begin(); iter != v->use_end(); ++iter) {
         // After LLVM 3.5, use_iterator points to 'Use' instead of 'User',
         // which is more straightforward.
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
         User *theUser = *iter;
 #else
         User *theUser = iter->getUser();
@@ -84,7 +85,7 @@ namespace gbe {
 
   Function* setFunctionAsKernel(Module *mod, Function *Fn)
   {
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 9)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
     LLVMContext &Context = mod->getContext();
     Type *intTy = IntegerType::get(mod->getContext(), 32);
     SmallVector<llvm::Metadata *, 5> kernelMDArgs;
@@ -112,7 +113,8 @@ namespace gbe {
     ValueToValueMapTy VMap;
     for (Function::arg_iterator I = Fn->arg_begin(), E = Fn->arg_end(); I != E; ++I) {
       PointerType *ty = dyn_cast<PointerType>(I->getType());
-      if(ty && ty->getAddressSpace() == 0) //Foce set the address space to global
+      //Foce set the address space to global
+      if(ty && (ty->getAddressSpace() == 0 || ty->getAddressSpace() == 4))
         ty = PointerType::get(ty->getPointerElementType(), 1);
       ParamTys.push_back(ty);
     }
@@ -210,7 +212,7 @@ namespace gbe {
           }
 
           for (Value::use_iterator iter = bt->use_begin(); iter != bt->use_end(); ++iter) {
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
             User *theUser = *iter;
 #else
             User *theUser = iter->getUser();
@@ -252,12 +254,13 @@ namespace gbe {
             if(gep == NULL)
               continue;
 
-            BitCastInst* fnPointer = dyn_cast<BitCastInst>(gep->getOperand(0));
-            if(fnPointer == NULL)
+            Value *fnPointer = gep->getOperand(0)->stripPointerCasts();
+
+            if(fnPointer == gep->getOperand(0))
               continue;
 
-            if(BitCastInst* bt = dyn_cast<BitCastInst>(fnPointer->getOperand(0))) {
-              std::string fnName = blocks[bt->getOperand(0)];
+            if(blocks.find(fnPointer) != blocks.end()) {
+              std::string fnName = blocks[fnPointer];
               Function* f = mod->getFunction(fnName);
               CallInst *newCI = builder.CreateCall(f, args);
               CI->replaceAllUsesWith(newCI);
@@ -266,7 +269,7 @@ namespace gbe {
             }
 
             //the function is global variable
-            if(GlobalVariable* gv = dyn_cast<GlobalVariable>(fnPointer->getOperand(0))) {
+            if(GlobalVariable* gv = dyn_cast<GlobalVariable>(fnPointer)) {
               Constant *c = gv->getInitializer();
               ConstantExpr *expr = dyn_cast<ConstantExpr>(c->getOperand(3));
               BitCastInst *bt = dyn_cast<BitCastInst>(expr->getAsInstruction());
@@ -277,7 +280,7 @@ namespace gbe {
               continue;
             }
 
-            ld = dyn_cast<LoadInst>(fnPointer->getOperand(0));
+            ld = dyn_cast<LoadInst>(fnPointer);
             if(ld == NULL)
               continue;
 
@@ -298,15 +301,13 @@ namespace gbe {
             if(AllocaInst *ai = dyn_cast<AllocaInst>(ld->getPointerOperand())) {
               Value *v = NULL;
               for (Value::use_iterator iter = ai->use_begin(); iter != ai->use_end(); ++iter) {
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
                 User *theUser = *iter;
 #else
                 User *theUser = iter->getUser();
 #endif
                 if(StoreInst *st = dyn_cast<StoreInst>(theUser)) {
-                  bt = dyn_cast<BitCastInst>(st->getValueOperand());
-                  if(bt)
-                    v = bt->getOperand(0);
+                  v = st->getValueOperand()->stripPointerCasts();
                 }
               }
               if(blocks.find(v) == blocks.end()) {
@@ -339,23 +340,19 @@ namespace gbe {
             Type *type = CI->getArgOperand(block_index)->getType();
             if(type->isIntegerTy())
                 block_index = 6;
-            Value *block = CI->getArgOperand(block_index);
-            while(isa<BitCastInst>(block))
-               block = dyn_cast<BitCastInst>(block)->getOperand(0);
+            Value *block = CI->getArgOperand(block_index)->stripPointerCasts();
             LoadInst *ld = dyn_cast<LoadInst>(block);
             Value *v = NULL;
             if(ld) {
               Value *block = ld->getPointerOperand();
               for (Value::use_iterator iter = block->use_begin(); iter != block->use_end(); ++iter) {
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
                 User *theUser = *iter;
 #else
                 User *theUser = iter->getUser();
 #endif
                 if(StoreInst *st = dyn_cast<StoreInst>(theUser)) {
-                  BitCastInst *bt = dyn_cast<BitCastInst>(st->getValueOperand());
-                  if(bt)
-                    v = bt->getOperand(0);
+                  v = st->getValueOperand()->stripPointerCasts();
                 }
               }
               if(blocks.find(v) == blocks.end()) {
@@ -378,15 +375,20 @@ namespace gbe {
             if( fn->isVarArg() ) {
               //enqueue function with slm, convert to __gen_enqueue_kernel_slm call
               //store the slm information to a alloca address.
-              int start = block_index + 1;
+              int start = block_index + 1 + 1;  //the first is count, skip
               int count = CI->getNumArgOperands() - start;
               Type *intTy = IntegerType::get(mod->getContext(), 32);
+              Type *int64Ty = IntegerType::get(mod->getContext(), 64);
 
               AllocaInst *AI = builder.CreateAlloca(intTy, ConstantInt::get(intTy, count));
 
               for(uint32_t i = start; i < CI->getNumArgOperands(); i++) {
                 Value *ptr = builder.CreateGEP(AI, ConstantInt::get(intTy, i-start));
-                builder.CreateStore(CI->getArgOperand(i), ptr);
+                Value *argSize = CI->getArgOperand(i);
+                if (argSize->getType() == int64Ty) {
+                  argSize = builder.CreateTrunc(argSize, intTy);
+                }
+                builder.CreateStore(argSize, ptr);
               }
               SmallVector<Value*, 16> args(CI->op_begin(), CI->op_begin() + 3);
               args.push_back(CI->getArgOperand(block_index));
@@ -394,8 +396,8 @@ namespace gbe {
               args.push_back(AI);
 
               std::vector<Type *> ParamTys;
-              for (Value** I = args.begin(); I != args.end(); ++I)
-                ParamTys.push_back((*I)->getType());
+              for (Value** iter = args.begin(); iter != args.end(); ++iter)
+                ParamTys.push_back((*iter)->getType());
               CallInst* newCI = builder.CreateCall(cast<llvm::Function>(mod->getOrInsertFunction(
                               "__gen_enqueue_kernel_slm", FunctionType::get(intTy, ParamTys, false))), args);
               CI->replaceAllUsesWith(newCI);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3fefa92..c93d89c 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -95,9 +95,9 @@
 #define LLVM_VERSION_MINOR 0
 #endif /* !defined(LLVM_VERSION_MINOR) */
 
-#if (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR < 3)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 33
 #error "Only LLVM 3.3 and newer are supported"
-#endif /* (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 4) */
+#endif
 
 using namespace llvm;
 
@@ -357,6 +357,16 @@ namespace gbe
       GBE_ASSERT(! (isa<Constant>(value) && !isa<GlobalValue>(value)));
       Type *type = value->getType();
       auto typeID = type->getTypeID();
+      if (typeID == Type::PointerTyID)
+      {
+        Type *eltTy = dyn_cast<PointerType>(type)->getElementType();
+        if (eltTy->isStructTy()) {
+          StructType *strTy = dyn_cast<StructType>(eltTy);
+          if (!strTy->isLiteral() && strTy->getName().data() &&
+              strstr(strTy->getName().data(), "sampler"))
+            type = Type::getInt32Ty(value->getContext());
+        }
+      }
       switch (typeID) {
         case Type::IntegerTyID:
         case Type::FloatTyID:
@@ -565,7 +575,7 @@ namespace gbe
         has_errors(false),
         legacyMode(true)
     {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
       initializeLoopInfoWrapperPassPass(*PassRegistry::getPassRegistry());
 #else
       initializeLoopInfoPass(*PassRegistry::getPassRegistry());
@@ -573,10 +583,14 @@ namespace gbe
       pass = PASS_EMIT_REGISTERS;
     }
 
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    virtual llvm::StringRef getPassName() const { return "Gen Back-End"; }
+#else
     virtual const char *getPassName() const { return "Gen Back-End"; }
+#endif
 
     void getAnalysisUsage(AnalysisUsage &AU) const {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
       AU.addRequired<LoopInfoWrapperPass>();
 #else
       AU.addRequired<LoopInfo>();
@@ -611,7 +625,7 @@ namespace gbe
       if (legacyMode)
         analyzePointerOrigin(F);
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
       LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 #else
       LI = &getAnalysis<LoopInfo>();
@@ -726,6 +740,8 @@ namespace gbe
     DECL_VISIT_FN(AtomicCmpXchgInst, AtomicCmpXchgInst);
 #undef DECL_VISIT_FN
 
+    // Emit rounding instructions from gen native function
+    void emitRoundingCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode);
     // Emit unary instructions from gen native function
     void emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode, ir::Type = ir::TYPE_FLOAT);
     // Emit unary instructions from gen native function
@@ -745,9 +761,6 @@ namespace gbe
     void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
     void visitSwitchInst(SwitchInst &I) {NOT_SUPPORTED;}
     void visitInvokeInst(InvokeInst &I) {NOT_SUPPORTED;}
-#if LLVM_VERSION_MINOR == 0
-    void visitUnwindInst(UnwindInst &I) {NOT_SUPPORTED;}
-#endif /* __LLVM_30__ */
     void visitResumeInst(ResumeInst &I) {NOT_SUPPORTED;}
     void visitInlineAsm(CallInst &I) {NOT_SUPPORTED;}
     void visitIndirectBrInst(IndirectBrInst &I) {NOT_SUPPORTED;}
@@ -766,7 +779,7 @@ namespace gbe
     void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI);
     void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
     ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) {
-      if (unit.printfs.find(inst) == unit.printfs.end())
+      if (unit.printfs.find((void *)inst) == unit.printfs.end())
         return NULL;
       return unit.printfs[inst];
     }
@@ -837,7 +850,7 @@ namespace gbe
       for (Value::use_iterator iter = work->use_begin(); iter != work->use_end(); ++iter) {
       // After LLVM 3.5, use_iterator points to 'Use' instead of 'User',
       // which is more straightforward.
-  #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+  #if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
         User *theUser = *iter;
   #else
         User *theUser = iter->getUser();
@@ -961,7 +974,7 @@ namespace gbe
             CallInst *ci = dyn_cast<CallInst>(theUser);
             pointer = ci ? ci->getArgOperand(0) : NULL;
           } else {
-            theUser->dump();
+            //theUser->dump();
             GBE_ASSERT(0 && "Unknown instruction operating on pointers\n");
           }
 
@@ -1091,7 +1104,7 @@ namespace gbe
             if (predBB->getTerminator())
               Builder2.SetInsertPoint(predBB->getTerminator());
 
-#if (LLVM_VERSION_MAJOR== 3 && LLVM_VERSION_MINOR < 6)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 36
   // llvm 3.5 and older version don't have CreateBitOrPointerCast() define
             Type *srcTy = base->getType();
             Type *dstTy = ptr->getType();
@@ -1109,7 +1122,7 @@ namespace gbe
           pointerBaseMap.insert(std::make_pair(ptr, basePhi));
           return basePhi;
       } else {
-        ptr->dump();
+        //ptr->dump();
         GBE_ASSERT(0 && "Unhandled instruction in getPointerBase\n");
         return ptr;
       }
@@ -1190,7 +1203,7 @@ namespace gbe
           BtiValueMap.insert(std::make_pair(Val, btiPhi));
           return btiPhi;
         } else {
-          Val->dump();
+          //Val->dump();
           GBE_ASSERT(0 && "Unhandled instruction in getBtiRegister\n");
           return Val;
         }
@@ -1250,7 +1263,7 @@ namespace gbe
      uint32_t ops = clKernels->getNumOperands();
       for(uint32_t x = 0; x < ops; x++) {
         MDNode* node = clKernels->getOperand(x);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
         Value * op = node->getOperand(0);
 #else
         auto *V = cast<ValueAsMetadata>(node->getOperand(0));
@@ -1274,7 +1287,7 @@ namespace gbe
     MDNode *typeNameNode = NULL;
     MDNode *typeBaseNameNode = NULL;
     MDNode *typeQualNode = NULL;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
     typeNameNode = F.getMetadata("kernel_arg_type");
     typeBaseNameNode = F.getMetadata("kernel_arg_base_type");
     typeQualNode = F.getMetadata("kernel_arg_type_qual");
@@ -1300,7 +1313,7 @@ namespace gbe
     ir::FunctionArgument::InfoFromLLVM llvmInfo;
     for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I, argID++) {
       unsigned opID = argID;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 39
       opID += 1;
 #endif
 
@@ -1342,7 +1355,7 @@ namespace gbe
       for (Value::use_iterator iter = work->use_begin(); iter != work->use_end(); ++iter) {
       // After LLVM 3.5, use_iterator points to 'Use' instead of 'User',
       // which is more straightforward.
-  #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+  #if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
         User *theUser = *iter;
   #else
         User *theUser = iter->getUser();
@@ -1505,15 +1518,15 @@ namespace gbe
       Value *pointer = expr->getOperand(0);
       if (expr->getOpcode() == Instruction::GetElementPtr) {
         uint32_t constantOffset = 0;
-        CompositeType* CompTy = cast<CompositeType>(pointer->getType());
+        Type* EltTy = pointer->getType();
         for(uint32_t op=1; op<expr->getNumOperands(); ++op) {
             int32_t TypeIndex;
             ConstantInt* ConstOP = dyn_cast<ConstantInt>(expr->getOperand(op));
             GBE_ASSERTM(ConstOP != NULL, "must be constant index");
             TypeIndex = ConstOP->getZExtValue();
             GBE_ASSERT(TypeIndex >= 0);
-            constantOffset += getGEPConstOffset(unit, CompTy, TypeIndex);
-            CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+            constantOffset += getGEPConstOffset(unit, pointer->getType(), TypeIndex);
+            EltTy = getEltType(EltTy, TypeIndex);
         }
 
         ir::Constant cc = unit.getConstantSet().getConstant(pointer->getName());
@@ -1644,7 +1657,7 @@ namespace gbe
         }
       default:
         {
-          c->dump();
+          //c->dump();
           NOT_IMPLEMENTED;
         }
     }
@@ -1749,7 +1762,6 @@ namespace gbe
   {
     GBE_ASSERT(dyn_cast<ConstantExpr>(CPV) == NULL);
 
-#if LLVM_VERSION_MINOR > 0
     ConstantDataSequential *seq = dyn_cast<ConstantDataSequential>(CPV);
 
     if (seq) {
@@ -1772,7 +1784,6 @@ namespace gbe
         GBE_ASSERTM(0, "Const data array never be half float\n");
       }
     } else
-#endif /* LLVM_VERSION_MINOR > 0 */
 
     if (dyn_cast<ConstantAggregateZero>(CPV)) {
       Type* Ty = CPV->getType();
@@ -1898,7 +1909,7 @@ namespace gbe
   ir::ImmediateIndex GenWriter::processConstantImmIndex(Constant *CPV, int32_t index) {
     if (dyn_cast<ConstantExpr>(CPV) == NULL)
       return processConstantImmIndexImpl(CPV, index);
-    CPV->dump();
+    //CPV->dump();
     GBE_ASSERT(0 && "unsupported constant.\n");
     return ctx.newImmediate((uint32_t)0);
   }
@@ -2114,6 +2125,7 @@ namespace gbe
     // Loop over the kernel metadatas to set the required work group size.
     size_t reqd_wg_sz[3] = {0, 0, 0};
     size_t hint_wg_sz[3] = {0, 0, 0};
+    size_t reqd_sg_sz = 0;
     ir::FunctionArgument::InfoFromLLVM llvmInfo;
     MDNode *addrSpaceNode = NULL;
     MDNode *typeNameNode = NULL;
@@ -2124,7 +2136,7 @@ namespace gbe
 
     std::string functionAttributes;
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
     /* LLVM 3.9 change kernel arg info as function metadata */
     addrSpaceNode = F.getMetadata("kernel_arg_addr_space");
     accessQualNode = F.getMetadata("kernel_arg_access_qual");
@@ -2209,6 +2221,27 @@ namespace gbe
       functionAttributes += buffer;
       functionAttributes += " ";
     }
+    if ((attrNode = F.getMetadata("intel_reqd_sub_group_size"))) {
+      GBE_ASSERT(attrNode->getNumOperands() == 1);
+      ConstantInt *sz = mdconst::extract<ConstantInt>(attrNode->getOperand(0));
+      GBE_ASSERT(sz);
+      reqd_sg_sz = sz->getZExtValue();
+      if(!(reqd_sg_sz == 8 || reqd_sg_sz == 16)){
+        F.getContext().emitError("Required sub group size is illegal!");
+        ctx.getUnit().setValid(false);
+        return;
+      }
+      functionAttributes += "intel_reqd_sub_group_size";
+      std::stringstream param;
+      char buffer[100] = {0};
+      param << "(";
+      param << reqd_sg_sz;
+      param << ")";
+      param >> buffer;
+      functionAttributes += buffer;
+      functionAttributes += " ";
+    }
+
 #else
     /* First find the meta data belong to this function. */
     MDNode *node = getKernelFunctionMetadata(&F);
@@ -2226,7 +2259,7 @@ namespace gbe
 
       if (attrName->getString() == "reqd_work_group_size") {
         GBE_ASSERT(attrNode->getNumOperands() == 4);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
         ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
         ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
         ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
@@ -2268,13 +2301,13 @@ namespace gbe
       } else if (attrName->getString() == "vec_type_hint") {
         GBE_ASSERT(attrNode->getNumOperands() == 3);
         functionAttributes += attrName->getString();
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
         Value* V = attrNode->getOperand(1);
 #else
         auto *Op1 = cast<ValueAsMetadata>(attrNode->getOperand(1));
         Value *V = Op1 ? Op1->getValue() : NULL;
 #endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
         ConstantInt *sign = dyn_cast<ConstantInt>(attrNode->getOperand(2));
 #else
         ConstantInt *sign = mdconst::extract<ConstantInt>(attrNode->getOperand(2));
@@ -2303,7 +2336,7 @@ namespace gbe
         functionAttributes += " ";
       } else if (attrName->getString() == "work_group_size_hint") {
         GBE_ASSERT(attrNode->getNumOperands() == 4);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
         ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
         ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
         ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
@@ -2334,6 +2367,8 @@ namespace gbe
 #endif /* LLVM 3.9 Function metadata */
 
     ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
+    if (reqd_sg_sz)
+      ctx.setSimdWidth(reqd_sg_sz);
 
     ctx.getFunction().setFunctionAttributes(functionAttributes);
     // Loop over the arguments and output registers for them
@@ -2343,18 +2378,15 @@ namespace gbe
       Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
 
       // Insert a new register for each function argument
-#if LLVM_VERSION_MINOR <= 1
-      const AttrListPtr &PAL = F.getAttributes();
-#endif /* LLVM_VERSION_MINOR <= 1 */
       for (; I != E; ++I, ++argID) {
         uint32_t opID = argID;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 39
         opID += 1;
 #endif
         const std::string &argName = I->getName().str();
         Type *type = I->getType();
         if(addrSpaceNode) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
           llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode->getOperand(opID)))->getZExtValue();
 #else
           llvmInfo.addrSpace = (mdconst::extract<ConstantInt>(addrSpaceNode->getOperand(opID)))->getZExtValue();
@@ -2417,10 +2449,11 @@ namespace gbe
         }
 
         if (llvmInfo.isSamplerType()) {
-          ctx.input(argName, ir::FunctionArgument::SAMPLER, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
+          ctx.input(argName, ir::FunctionArgument::SAMPLER, reg, llvmInfo, 4, 4, 0);
           (void)ctx.getFunction().getSamplerSet()->append(reg, &ctx);
           continue;
         }
+
         if(llvmInfo.isPipeType()) {
           llvmInfo.typeSize = getTypeSize(F.getParent(),unit,llvmInfo.typeName);
           ctx.input(argName, ir::FunctionArgument::PIPE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), BtiMap.find(&*I)->second);
@@ -2435,11 +2468,7 @@ namespace gbe
             continue;
           Type *pointed = pointerType->getElementType();
           // By value structure
-#if LLVM_VERSION_MINOR <= 1
-          if (PAL.paramHasAttr(argID+1, Attribute::ByVal)) {
-#else
           if (I->hasByValAttr()) {
-#endif /* LLVM_VERSION_MINOR <= 1 */
             const size_t structSize = getTypeByteSize(unit, pointed);
             ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, llvmInfo, structSize, getAlignmentByte(unit, type), 0);
           }
@@ -2925,7 +2954,7 @@ namespace gbe
     const Instruction *insn = NULL;
     for(Value::const_use_iterator iter = v->use_begin(); iter != v->use_end(); ++iter) {
     // After LLVM 3.5, use_iterator points to 'Use' instead of 'User', which is more straightforward.
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
       const User *theUser = *iter;
 #else
       const User *theUser = iter->getUser();
@@ -2980,10 +3009,12 @@ namespace gbe
           this->newRegister(const_cast<GlobalVariable*>(&v));
           ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
           ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
-          ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
           if (!legacyMode) {
-            ctx.ADD(getType(ctx, v.getType()), reg, ir::ocl::constant_addrspace, reg);
-          }
+            ir::Register regload = ctx.reg(getFamily(getType(ctx, v.getType())));
+            ctx.LOADI(getType(ctx, v.getType()), regload, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
+            ctx.ADD(getType(ctx, v.getType()), reg, ir::ocl::constant_addrspace, regload);
+          } else
+            ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
         }
       } else if(addrSpace == ir::MEM_PRIVATE) {
           this->newRegister(const_cast<GlobalVariable*>(&v));
@@ -3163,15 +3194,9 @@ namespace gbe
   void GenWriter::emitFunction(Function &F)
   {
     switch (F.getCallingConv()) {
-#if LLVM_VERSION_MINOR <= 2
-      case CallingConv::PTX_Device: // we do not emit device function
-        return;
-      case CallingConv::PTX_Kernel:
-#else
       case CallingConv::C:
       case CallingConv::Fast:
       case CallingConv::SPIR_KERNEL:
-#endif
         break;
       default:
         GBE_ASSERTM(false, "Unsupported calling convention");
@@ -3401,11 +3426,12 @@ namespace gbe
     const ir::Register src0 = this->getRegister(I.getOperand(0));
     const ir::Register src1 = this->getRegister(I.getOperand(1));
     const ir::Register tmp = ctx.reg(getFamily(ctx, I.getType()));
+    const ir::Register tmp1 = ctx.reg(getFamily(ctx, I.getType()));
     Value *cv = ConstantInt::get(I.getType(), 1);
 
     switch (I.getPredicate()) {
       case ICmpInst::FCMP_OEQ: ctx.EQ(type, dst, src0, src1); break;
-      case ICmpInst::FCMP_ONE: ctx.NE(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_UNE: ctx.NE(type, dst, src0, src1); break;
       case ICmpInst::FCMP_OLE: ctx.LE(type, dst, src0, src1); break;
       case ICmpInst::FCMP_OGE: ctx.GE(type, dst, src0, src1); break;
       case ICmpInst::FCMP_OLT: ctx.LT(type, dst, src0, src1); break;
@@ -3451,9 +3477,10 @@ namespace gbe
         ctx.GT(type, tmp, src0, src1);
         ctx.XOR(insnType, dst, tmp, getRegister(cv));
         break;
-      case ICmpInst::FCMP_UNE:
-        ctx.EQ(type, tmp, src0, src1);
-        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+      case ICmpInst::FCMP_ONE:
+        ctx.LT(type, tmp, src0, src1);
+        ctx.GT(type, tmp1, src0, src1);
+        ctx.OR(insnType, dst, tmp, tmp1);
         break;
       case ICmpInst::FCMP_TRUE:
         ctx.MOV(insnType, dst, getRegister(cv));
@@ -3788,14 +3815,12 @@ namespace gbe
           break;
           case Intrinsic::stackrestore:
           break;
-#if LLVM_VERSION_MINOR >= 2
           case Intrinsic::lifetime_start:
           case Intrinsic::lifetime_end:
           break;
           case Intrinsic::fmuladd:
             this->newRegister(&I);
           break;
-#endif /* LLVM_VERSION_MINOR >= 2 */
           case Intrinsic::debugtrap:
           case Intrinsic::trap:
           case Intrinsic::dbg_value:
@@ -4081,6 +4106,15 @@ namespace gbe
         regTranslator.newValueProxy(srcValue, dst);
         break;
       }
+      case GEN_OCL_INT_TO_SAMPLER:
+      case GEN_OCL_SAMPLER_TO_INT:
+      {
+        Value *srcValue = I.getOperand(0);
+        //srcValue->dump();
+        //dst->dump();
+        regTranslator.newValueProxy(srcValue, dst);
+        break;
+      }
       case GEN_OCL_ENQUEUE_GET_ENQUEUE_INFO_ADDR:
         regTranslator.newScalarProxy(ir::ocl::enqueuebufptr, dst);
         break;
@@ -4122,6 +4156,21 @@ namespace gbe
     };
   }
 
+  void GenWriter::emitRoundingCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode) {
+    if (I.getType()->isHalfTy()) {
+      const ir::Register src = this->getRegister(I.getOperand(0));
+      const ir::Register srcFloat = ctx.reg(ir::FAMILY_DWORD);
+      const ir::Register dstFloat = ctx.reg(ir::FAMILY_DWORD);
+      const ir::Register dst = this->getRegister(&I);
+      ctx.F16TO32(ir::TYPE_FLOAT, ir::TYPE_U16, srcFloat, src);
+      ctx.ALU1(opcode, ir::TYPE_FLOAT, dstFloat, srcFloat);
+      ctx.F32TO16(ir::TYPE_U16, ir::TYPE_FLOAT, dst, dstFloat);
+    } else {
+      GBE_ASSERT(I.getType()->isFloatTy());
+      this->emitUnaryCallInst(I,CS,opcode);
+    }
+  }
+
   void GenWriter::emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode, ir::Type type) {
     CallSite::arg_iterator AI = CS.arg_begin();
 #if GBE_DEBUG
@@ -4499,10 +4548,19 @@ namespace gbe
   /* append a new sampler. should be called before any reference to
    * a sampler_t value. */
   uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    CallInst *TC = dyn_cast<CallInst>(*AI);
+    Constant *CPV = TC ? dyn_cast<Constant>(TC->getOperand(0)) : NULL;
+#else
     Constant *CPV = dyn_cast<Constant>(*AI);
+#endif
     uint8_t index;
     if (CPV != NULL)
     {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+      // Check if the Callee is sampler convert function
+      GBE_ASSERT(TC->getCalledFunction()->getName().str() == "__gen_ocl_int_to_sampler");
+#endif
       // This is not a kernel argument sampler, we need to append it to sampler set,
       // and allocate a sampler slot for it.
       const ir::Immediate &x = processConstantImm(CPV);
@@ -4549,11 +4607,9 @@ namespace gbe
             ctx.MOV(ir::getType(family), dst, src);
           }
           break;
-#if LLVM_VERSION_MINOR >= 2
           case Intrinsic::lifetime_start:
           case Intrinsic::lifetime_end:
           break;
-#endif /* LLVM_VERSION_MINOR >= 2 */
           case Intrinsic::debugtrap:
           case Intrinsic::trap:
           case Intrinsic::dbg_value:
@@ -4699,10 +4755,10 @@ namespace gbe
           }
           break;
           case Intrinsic::sqrt: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
-          case Intrinsic::ceil: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
-          case Intrinsic::trunc: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
-          case Intrinsic::rint: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
-          case Intrinsic::floor: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
+          case Intrinsic::ceil: this->emitRoundingCallInst(I,CS,ir::OP_RNDU); break;
+          case Intrinsic::trunc: this->emitRoundingCallInst(I,CS,ir::OP_RNDZ); break;
+          case Intrinsic::rint: this->emitRoundingCallInst(I,CS,ir::OP_RNDE); break;
+          case Intrinsic::floor: this->emitRoundingCallInst(I,CS,ir::OP_RNDD); break;
           case Intrinsic::sin: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
           case Intrinsic::cos: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
           case Intrinsic::log2: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
@@ -5484,15 +5540,21 @@ namespace gbe
           case GEN_OCL_GET_PIPE:
           case GEN_OCL_MAKE_RID:
           case GEN_OCL_GET_RID:
+          case GEN_OCL_INT_TO_SAMPLER:
+          case GEN_OCL_SAMPLER_TO_INT:
           {
             break;
           }
           case GEN_OCL_ENQUEUE_SET_NDRANGE_INFO:
           {
             GBE_ASSERT(AI != AE);
+            Value *dstValue;
+            if(I.hasStructRetAttr())
+              dstValue = *AI++;
+            else
+              dstValue = &I;
             Value *srcValue = *AI;
             ++AI;
-            Value *dstValue = &I;
             regTranslator.newValueProxy(srcValue, dstValue);
             break;
           }
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 1ab77c9..b4715b1 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -46,7 +46,7 @@ namespace llvm {
   FunctionPass *createExpandConstantExprPass();
   FunctionPass *createExpandLargeIntegersPass();
   FunctionPass *createPromoteIntegersPass();
-  FunctionPass *createStripAttributesPass();
+  FunctionPass *createStripAttributesPass(bool lastTime);
   // Copy debug information from Original to New, and return New.
   template <typename T> T *CopyDebug(T *New, llvm::Instruction *Original) {
     New->setDebugLoc(Original->getDebugLoc());
@@ -82,9 +82,9 @@ namespace gbe
       auto it = map.find(symbol);
 
       if (it == map.end()) {
-        int status;
+        int status = 0; /* set for libcxxrt */
         char *realName = abi::__cxa_demangle(symbol.c_str(), NULL, NULL, &status);
-        if (status == 0) {
+        if (realName) {
           std::string realFnName(realName), stripName;
           stripName = realFnName.substr(0, realFnName.find("("));
           it = map.find(stripName);
@@ -118,7 +118,10 @@ namespace gbe
   uint32_t getTypeByteSize(const ir::Unit &unit, llvm::Type* Ty);
 
   /*! Get GEP constant offset for the specified operand.*/
-  int32_t getGEPConstOffset(const ir::Unit &unit, llvm::CompositeType *CompTy, int32_t TypeIndex);
+  int32_t getGEPConstOffset(const ir::Unit &unit, llvm::Type *eltTy, int32_t TypeIndex);
+
+  /*! Get element type for a type.*/
+  llvm::Type* getEltType(llvm::Type *eltTy, uint32_t index = 0);
 
   /*! whether this is a kernel function */
   bool isKernelFunction(const llvm::Function &f);
@@ -146,7 +149,7 @@ namespace gbe
   /*! Insert the time stamp for profiling. */
   llvm::FunctionPass* createProfilingInserterPass(int profilingType, ir::Unit &unit);
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
   /* customized loop unrolling pass. */
   llvm::LoopPass *createCustomLoopUnrollPass();
 #endif
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 86485da..08087cb 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -266,3 +266,7 @@ DECL_LLVM_GEN_FUNCTION(MAKE_RID, __gen_ocl_make_rid)
 DECL_LLVM_GEN_FUNCTION(ENQUEUE_SET_NDRANGE_INFO, __gen_ocl_set_ndrange_info)
 DECL_LLVM_GEN_FUNCTION(ENQUEUE_GET_NDRANGE_INFO, __gen_ocl_get_ndrange_info)
 DECL_LLVM_GEN_FUNCTION(ENQUEUE_GET_ENQUEUE_INFO_ADDR, __gen_ocl_get_enqueue_info_addr)
+
+// sampler helper functions
+DECL_LLVM_GEN_FUNCTION(SAMPLER_TO_INT, __gen_ocl_sampler_to_int)
+DECL_LLVM_GEN_FUNCTION(INT_TO_SAMPLER, __gen_ocl_int_to_sampler)
diff --git a/backend/src/llvm/llvm_includes.hpp b/backend/src/llvm/llvm_includes.hpp
index 0b80979..184553a 100644
--- a/backend/src/llvm/llvm_includes.hpp
+++ b/backend/src/llvm/llvm_includes.hpp
@@ -24,6 +24,7 @@
 #ifndef __GBE_IR_LLVM_INCLUDES_HPP__
 #define __GBE_IR_LLVM_INCLUDES_HPP__
 
+#ifdef GBE_COMPILER_AVAILABLE
 #include "llvm/Config/llvm-config.h"
 
 #include "llvm/IR/BasicBlock.h"
@@ -75,7 +76,12 @@
 
 #include "llvm-c/Linker.h"
 #include "llvm/IRReader/IRReader.h"
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+#include <llvm/Bitcode/BitcodeWriter.h>
+//#include <llvm/Bitcode/BitcodeReader.h>
+#else
 #include "llvm/Bitcode/ReaderWriter.h"
+#endif
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 
@@ -91,7 +97,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CFG.h"
@@ -111,7 +117,7 @@
 #include "llvm/Target/Mangler.h"
 #endif
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/LegacyPassManager.h"
 #else
@@ -122,14 +128,20 @@
 
 #include <clang/CodeGen/CodeGenAction.h>
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
 #endif
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #endif
 
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+#include "llvm/Support/Error.h"
+#endif
+
+#endif /*GBE_COMPILER_AVAILABLE */
+
 #endif /* __GBE_IR_LLVM_INCLUDES_HPP__ */
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
index f01bb51..57c933f 100644
--- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -40,7 +40,12 @@ namespace gbe {
 
       }
 
-      virtual const char *getPassName() const {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+      virtual StringRef getPassName() const
+#else
+      virtual const char *getPassName() const
+#endif
+      {
         return "SPIR backend: lowering instrinsics";
       }
       static char convertSpaceToName(Value *val) {
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
index e797e98..5aa38be 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -35,7 +35,7 @@ namespace gbe {
     GenLoadStoreOptimization() : BasicBlockPass(ID) {}
 
     void getAnalysisUsage(AnalysisUsage &AU) const {
-#if LLVM_VERSION_MAJOR == 3 &&  LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
       AU.addRequired<ScalarEvolutionWrapperPass>();
       AU.addPreserved<ScalarEvolutionWrapperPass>();
 #else
@@ -46,12 +46,12 @@ namespace gbe {
     }
 
     virtual bool runOnBasicBlock(BasicBlock &BB) {
-#if LLVM_VERSION_MAJOR == 3 &&  LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
       SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
 #else
       SE = &getAnalysis<ScalarEvolution>();
 #endif
-      #if LLVM_VERSION_MINOR >= 7
+      #if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
         TD = &BB.getModule()->getDataLayout();
       #elif LLVM_VERSION_MINOR >= 5
         DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
@@ -75,8 +75,12 @@ namespace gbe {
                                   const BasicBlock::iterator &start,
                                   unsigned maxVecSize,
                                   bool isLoad);
-
-    virtual const char *getPassName() const {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    virtual StringRef getPassName() const
+#else
+    virtual const char *getPassName() const
+#endif
+    {
       return "Merge compatible Load/stores for Gen";
     }
   };
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index c5f3ffe..10752a3 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -42,7 +42,7 @@ namespace gbe
 {
   bool isKernelFunction(const llvm::Function &F) {
     bool bKernel = false;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
     bKernel = F.getMetadata("kernel_arg_name") != NULL;
 #else
     const Module *module = F.getParent();
@@ -53,7 +53,7 @@ namespace gbe
       uint32_t ops = md.getNumOperands();
       for(uint32_t x = 0; x < ops; x++) {
         MDNode* node = md.getOperand(x);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
         Value * op = node->getOperand(0);
 #else
         Value * op = cast<ValueAsMetadata>(node->getOperand(0))->getValue();
@@ -74,7 +74,7 @@ namespace gbe
     if(ops > 0) {
       uint32_t major = 0, minor = 0;
       MDNode* node = version->getOperand(0);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
       major = mdconst::extract<ConstantInt>(node->getOperand(0))->getZExtValue();
       minor = mdconst::extract<ConstantInt>(node->getOperand(1))->getZExtValue();
 #else
@@ -180,12 +180,23 @@ namespace gbe
     return size_bit/8;
   }
 
-  int32_t getGEPConstOffset(const ir::Unit &unit, CompositeType *CompTy, int32_t TypeIndex) {
+  Type* getEltType(Type* eltTy, uint32_t index) {
+    Type *elementType = NULL;
+    if (PointerType* ptrType = dyn_cast<PointerType>(eltTy))
+      elementType = ptrType->getElementType();
+    else if(SequentialType * seqType = dyn_cast<SequentialType>(eltTy))
+      elementType = seqType->getElementType();
+    else if(CompositeType * compTy= dyn_cast<CompositeType>(eltTy))
+      elementType = compTy->getTypeAtIndex(index);
+    GBE_ASSERT(elementType);
+    return elementType;
+  }
+
+  int32_t getGEPConstOffset(const ir::Unit &unit, Type *eltTy, int32_t TypeIndex) {
     int32_t offset = 0;
-    SequentialType * seqType = dyn_cast<SequentialType>(CompTy);
-    if (seqType != NULL) {
+    if (!eltTy->isStructTy()) {
       if (TypeIndex != 0) {
-        Type *elementType = seqType->getElementType();
+        Type *elementType = getEltType(eltTy);
         uint32_t elementSize = getTypeByteSize(unit, elementType);
         uint32_t align = getAlignmentByte(unit, elementType);
         elementSize += getPadding(elementSize, align);
@@ -193,17 +204,16 @@ namespace gbe
       }
     } else {
       int32_t step = TypeIndex > 0 ? 1 : -1;
-      GBE_ASSERT(CompTy->isStructTy());
       for(int32_t ty_i=0; ty_i != TypeIndex; ty_i += step)
       {
-        Type* elementType = CompTy->getTypeAtIndex(ty_i);
+        Type* elementType = getEltType(eltTy, ty_i);
         uint32_t align = getAlignmentByte(unit, elementType);
         offset += getPadding(offset, align * step);
         offset += getTypeByteSize(unit, elementType) * step;
       }
 
       //add getPaddingding for accessed type
-      const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
+      const uint32_t align = getAlignmentByte(unit, getEltType(eltTy ,TypeIndex));
       offset += getPadding(offset, align * step);
     }
     return offset;
@@ -222,7 +232,11 @@ namespace gbe
       AU.setPreservesCFG();
     }
 
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    virtual StringRef getPassName() const {
+#else
     virtual const char *getPassName() const {
+#endif
       return "SPIR backend: insert special spir instructions";
     }
 
@@ -247,8 +261,8 @@ namespace gbe
   {
     const uint32_t ptrSize = unit.getPointerSize();
     Value* parentPointer = GEPInst->getOperand(0);
-    CompositeType* CompTy = parentPointer ? cast<CompositeType>(parentPointer->getType()) : NULL;
-    if(!CompTy)
+    Type* eltTy = parentPointer ? parentPointer->getType() : NULL;
+    if(!eltTy)
       return false;
 
     Value* currentAddrInst = 
@@ -262,14 +276,15 @@ namespace gbe
       ConstantInt* ConstOP = dyn_cast<ConstantInt>(GEPInst->getOperand(op));
       if (ConstOP != NULL) {
         TypeIndex = ConstOP->getZExtValue();
-        constantOffset += getGEPConstOffset(unit, CompTy, TypeIndex);
+        constantOffset += getGEPConstOffset(unit, eltTy, TypeIndex);
       }
       else {
         // we only have array/vectors here, 
         // therefore all elements have the same size
         TypeIndex = 0;
 
-        Type* elementType = CompTy->getTypeAtIndex(TypeIndex);
+        Type* elementType = getEltType(eltTy);
+
         uint32_t size = getTypeByteSize(unit, elementType);
 
         //add padding
@@ -326,7 +341,7 @@ namespace gbe
       }
 
       //step down in type hirachy
-      CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+      eltTy = getEltType(eltTy, TypeIndex);
     }
 
     //insert addition of new offset before GEPInst when it is not zero
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index 800f343..6bb7c52 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -309,7 +309,11 @@ error:
     bool parseOnePrintfInstruction(CallInst * call);
     bool generateOneParameterInst(PrintfSlot& slot, Value* arg, Value*& new_arg);
 
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    virtual StringRef getPassName() const
+#else
     virtual const char *getPassName() const
+#endif
     {
       return "Printf Parser";
     }
@@ -381,7 +385,7 @@ error:
     }
 
     GBE_ASSERT(unit.printfs.find(call) == unit.printfs.end());
-    unit.printfs.insert(std::pair<llvm::CallInst*, PrintfSet::PrintfFmt*>(call, printf_fmt));
+    unit.printfs.insert(std::pair<void *, PrintfSet::PrintfFmt*>((void *)call, printf_fmt));
     return true;
   }
 
@@ -389,15 +393,9 @@ error:
   {
     bool hasPrintf = false;
     switch (F.getCallingConv()) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
-      case CallingConv::PTX_Device:
-        return false;
-      case CallingConv::PTX_Kernel:
-#else
       case CallingConv::C:
       case CallingConv::Fast:
       case CallingConv::SPIR_KERNEL:
-#endif
         break;
       default:
         GBE_ASSERTM(false, "Unsupported calling convention");
@@ -521,7 +519,11 @@ error:
       case Type::FloatTyID: {
         /* llvm 3.6 will give a undef value for NAN. */
         if (dyn_cast<llvm::UndefValue>(arg)) {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+          APFloat nan = APFloat::getNaN(APFloat::IEEEsingle(), false);
+#else
           APFloat nan = APFloat::getNaN(APFloat::IEEEsingle, false);
+#endif
           new_arg = ConstantFP::get(module->getContext(), nan);
         }
 
diff --git a/backend/src/llvm/llvm_profiling.cpp b/backend/src/llvm/llvm_profiling.cpp
index 96c95ee..2d2ee11 100644
--- a/backend/src/llvm/llvm_profiling.cpp
+++ b/backend/src/llvm/llvm_profiling.cpp
@@ -26,29 +26,15 @@
 #include <stdlib.h>
 
 #include "llvm/Config/llvm-config.h"
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/Function.h"
-#include "llvm/InstrTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#else
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Pass.h"
-#if LLVM_VERSION_MINOR <= 1
-#include "llvm/Support/IRBuilder.h"
-#elif LLVM_VERSION_MINOR == 2
-#include "llvm/IRBuilder.h"
-#else
 #include "llvm/IR/IRBuilder.h"
-#endif /* LLVM_VERSION_MINOR <= 1 */
 
-#if LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CFG.h"
 #else
@@ -97,7 +83,11 @@ namespace gbe
     {
     }
 
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+    virtual StringRef getPassName() const
+#else
     virtual const char *getPassName() const
+#endif
     {
       return "Timestamp Parser";
     }
@@ -111,15 +101,9 @@ namespace gbe
     int pointNum = 0;
 
     switch (F.getCallingConv()) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
-      case CallingConv::PTX_Device:
-        return false;
-      case CallingConv::PTX_Kernel:
-#else
       case CallingConv::C:
       case CallingConv::Fast:
       case CallingConv::SPIR_KERNEL:
-#endif
         break;
       default:
         GBE_ASSERTM(false, "Unsupported calling convention");
@@ -178,12 +162,19 @@ namespace gbe
       /* Add the timestamp store function call. */
       // __gen_ocl_store_timestamp(int nth, int type);
       Value *Args[2] = {ConstantInt::get(intTy, pointNum++), ConstantInt::get(intTy, profilingType)};
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 50
       builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
               "__gen_ocl_calc_timestamp", Type::getVoidTy(module->getContext()),
               IntegerType::getInt32Ty(module->getContext()),
+              IntegerType::getInt32Ty(module->getContext()))),
+              ArrayRef<Value*>(Args));
+#else
+      builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
+              "__gen_ocl_calc_timestamp", Type::getVoidTy(module->getContext()),
               IntegerType::getInt32Ty(module->getContext()),
-              NULL)),
+              IntegerType::getInt32Ty(module->getContext()), nullptr)),
               ArrayRef<Value*>(Args));
+#endif
     }
     /* We insert one store_profiling at the end of the last block to hold the place. */
     llvm::Function::iterator BE = F.end();
@@ -193,12 +184,19 @@ namespace gbe
     builder->SetInsertPoint(&*retInst);
     Value *Args2[2] = {profilingBuf, ConstantInt::get(intTy, profilingType)};
 
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 50
+    builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
+            "__gen_ocl_store_profiling", Type::getVoidTy(module->getContext()),
+            ptrTy,
+            IntegerType::getInt32Ty(module->getContext()))),
+            ArrayRef<Value*>(Args2));
+#else
     builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
             "__gen_ocl_store_profiling", Type::getVoidTy(module->getContext()),
             ptrTy,
-            IntegerType::getInt32Ty(module->getContext()),
-            NULL)),
+            IntegerType::getInt32Ty(module->getContext()), nullptr)),
             ArrayRef<Value*>(Args2));
+#endif
 
     delete builder;
     return changed;
diff --git a/backend/src/llvm/llvm_sampler_fix.cpp b/backend/src/llvm/llvm_sampler_fix.cpp
index de7ebdb..c9ec817 100644
--- a/backend/src/llvm/llvm_sampler_fix.cpp
+++ b/backend/src/llvm/llvm_sampler_fix.cpp
@@ -33,7 +33,7 @@ namespace gbe {
   class SamplerFix : public FunctionPass {
   public:
     SamplerFix() : FunctionPass(ID) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
       initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
 #else
       initializeDominatorTreePass(*PassRegistry::getPassRegistry());
@@ -55,9 +55,17 @@ namespace gbe {
         //          ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST));
         bool needFix = true;
         Value *needFixVal;
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+        CallInst *init = dyn_cast<CallInst>(I->getOperand(0));
+        if (init && init->getCalledValue()->getName().compare("__translate_sampler_initializer"))
+        {
+          const ConstantInt *ci = dyn_cast<ConstantInt>(init->getOperand(0));
+          uint32_t samplerInt = ci->getZExtValue();
+#else
         if (dyn_cast<ConstantInt>(I->getOperand(0))) {
           const ConstantInt *ci = dyn_cast<ConstantInt>(I->getOperand(0));
           uint32_t samplerInt = ci->getZExtValue();
+#endif
           needFix = ((samplerInt & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP &&
                      (samplerInt & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST);
           needFixVal = ConstantInt::get(boolTy, needFix);
@@ -65,14 +73,28 @@ namespace gbe {
           IRBuilder<> Builder(I->getParent());
 
           Builder.SetInsertPoint(I);
+
           Value *addressMask = ConstantInt::get(i32Ty, __CLK_ADDRESS_MASK);
-          Value *addressMode = Builder.CreateAnd(I->getOperand(0), addressMask);
           Value *clampInt =  ConstantInt::get(i32Ty, CLK_ADDRESS_CLAMP);
-          Value *isClampMode = Builder.CreateICmpEQ(addressMode, clampInt);
           Value *filterMask = ConstantInt::get(i32Ty, __CLK_FILTER_MASK);
-          Value *filterMode = Builder.CreateAnd(I->getOperand(0), filterMask);
           Value *nearestInt = ConstantInt::get(i32Ty, CLK_FILTER_NEAREST);
+
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+          Module *M = I->getParent()->getParent()->getParent();
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 50
+          Value* samplerCvt = M->getOrInsertFunction("__gen_ocl_sampler_to_int", i32Ty, I->getOperand(0)->getType());
+#else
+          Value* samplerCvt = M->getOrInsertFunction("__gen_ocl_sampler_to_int", i32Ty, I->getOperand(0)->getType(), nullptr);
+#endif
+          Value *samplerVal = Builder.CreateCall(samplerCvt, {I->getOperand(0)});
+#else
+          Value *samplerVal = I->getOperand(0);
+#endif
+          Value *addressMode = Builder.CreateAnd(samplerVal, addressMask);
+          Value *isClampMode = Builder.CreateICmpEQ(addressMode, clampInt);
+          Value *filterMode = Builder.CreateAnd(samplerVal, filterMask);
           Value *isNearestMode = Builder.CreateICmpEQ(filterMode, nearestInt);
+
           needFixVal = Builder.CreateAnd(isClampMode, isNearestMode);
         }
 
@@ -83,16 +105,35 @@ namespace gbe {
         //  return ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0);
         bool needFix = true;
         Value *needFixVal;
+ #if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+        CallInst *init = dyn_cast<CallInst>(I->getOperand(0));
+        if (init && init->getCalledValue()->getName().compare("__translate_sampler_initializer"))
+        {
+          const ConstantInt *ci = dyn_cast<ConstantInt>(init->getOperand(0));
+          uint32_t samplerInt = ci->getZExtValue();
+#else
         if (dyn_cast<ConstantInt>(I->getOperand(0))) {
           const ConstantInt *ci = dyn_cast<ConstantInt>(I->getOperand(0));
           uint32_t samplerInt = ci->getZExtValue();
+#endif
           needFix = samplerInt & CLK_NORMALIZED_COORDS_TRUE;
           needFixVal = ConstantInt::get(boolTy, needFix);
         } else {
           IRBuilder<> Builder(I->getParent());
           Builder.SetInsertPoint(I);
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+          Module *M = I->getParent()->getParent()->getParent();
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 50
+          Value* samplerCvt = M->getOrInsertFunction("__gen_ocl_sampler_to_int", i32Ty, I->getOperand(0)->getType());
+#else
+          Value* samplerCvt = M->getOrInsertFunction("__gen_ocl_sampler_to_int", i32Ty, I->getOperand(0)->getType(), nullptr);
+#endif
+          Value *samplerVal = Builder.CreateCall(samplerCvt, {I->getOperand(0)});
+#else
+          Value *samplerVal = I->getOperand(0);
+#endif
           Value *normalizeMask = ConstantInt::get(i32Ty, CLK_NORMALIZED_COORDS_TRUE);
-          Value *normalizeMode = Builder.CreateAnd(I->getOperand(0), normalizeMask);
+          Value *normalizeMode = Builder.CreateAnd(samplerVal, normalizeMask);
           needFixVal = Builder.CreateICmpEQ(normalizeMode, ConstantInt::get(i32Ty, 0));
         }
         I->replaceAllUsesWith(needFixVal);
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 8850abb..e9a2a66 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -96,7 +96,7 @@ namespace gbe {
 
     Scalarize() : FunctionPass(ID)
     {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
       initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
 #else
       initializeDominatorTreePass(*PassRegistry::getPassRegistry());
@@ -873,15 +873,9 @@ namespace gbe {
   bool Scalarize::runOnFunction(Function& F)
   {
     switch (F.getCallingConv()) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
-    case CallingConv::PTX_Device:
-      return false;
-    case CallingConv::PTX_Kernel:
-#else
     case CallingConv::C:
     case CallingConv::Fast:
     case CallingConv::SPIR_KERNEL:
-#endif
       break;
     default:
       GBE_ASSERTM(false, "Unsupported calling convention");
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index bef4df1..7f7deff 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -46,14 +46,7 @@ namespace gbe
   BVAR(OCL_OUTPUT_CFG_GEN_IR, false);
   using namespace llvm;
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
-  llvm::LLVMContext& GBEGetLLVMContext() {
-    static llvm::LLVMContext GBEContext;
-    return GBEContext;
-  }
-#endif
-
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
   #define TARGETLIBRARY  TargetLibraryInfoImpl
 #else
   #define TARGETLIBRARY  TargetLibraryInfo
@@ -61,32 +54,32 @@ namespace gbe
 
   void runFuntionPass(Module &mod, TARGETLIBRARY *libraryInfo, const DataLayout &DL)
   {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
     legacy::FunctionPassManager FPM(&mod);
 #else
     FunctionPassManager FPM(&mod);
 #endif
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
     FPM.add(new DataLayoutPass());
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR == 35
     FPM.add(new DataLayoutPass(DL));
 #else
     FPM.add(new DataLayout(DL));
 #endif
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
     FPM.add(createVerifierPass(true));
 #else
     FPM.add(createVerifierPass());
 #endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
     FPM.add(new TargetLibraryInfoWrapperPass(*libraryInfo));
 #else
     FPM.add(new TargetLibraryInfo(*libraryInfo));
 #endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
     FPM.add(createTypeBasedAAWrapperPass());
     FPM.add(createBasicAAWrapperPass());
 #else
@@ -108,27 +101,27 @@ namespace gbe
 
   void runModulePass(Module &mod, TARGETLIBRARY *libraryInfo, const DataLayout &DL, int optLevel, bool strictMath)
   {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
     legacy::PassManager MPM;
 #else
     PassManager MPM;
 #endif
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
     MPM.add(new DataLayoutPass());
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR == 35
     MPM.add(new DataLayoutPass(DL));
 #else
     MPM.add(new DataLayout(DL));
 #endif
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
     MPM.add(new TargetLibraryInfoWrapperPass(*libraryInfo));
 #else
     MPM.add(new TargetLibraryInfo(*libraryInfo));
 #endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
     MPM.add(createTypeBasedAAWrapperPass());
     MPM.add(createBasicAAWrapperPass());
 #else
@@ -139,7 +132,7 @@ namespace gbe
     MPM.add(createBarrierNodupPass(false));   // remove noduplicate fnAttr before inlining.
     MPM.add(createFunctionInliningPass(20000));
     MPM.add(createBarrierNodupPass(true));    // restore noduplicate fnAttr after inlining.
-    MPM.add(createStripAttributesPass());     // Strip unsupported attributes and calling conventions.
+    MPM.add(createStripAttributesPass(false));     // Strip unsupported attributes and calling conventions.
     MPM.add(createSamplerFixPass());
     MPM.add(createGlobalOptimizerPass());     // Optimize out global vars
 
@@ -149,9 +142,9 @@ namespace gbe
     MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
     MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
     MPM.add(createPruneEHPass());             // Remove dead EH info
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
     MPM.add(createPostOrderFunctionAttrsLegacyPass());
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
     MPM.add(createPostOrderFunctionAttrsPass());       // Set readonly/readnone attrs
 #else
     MPM.add(createFunctionAttrsPass());       // Set readonly/readnone attrs
@@ -159,7 +152,7 @@ namespace gbe
 
     //MPM.add(createScalarReplAggregatesPass(64, true, -1, -1, 64))
     if(optLevel > 0)
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
       MPM.add(createSROAPass());
 #else
       MPM.add(createSROAPass(/*RequiresDomTree*/ false));
@@ -182,14 +175,14 @@ namespace gbe
     MPM.add(createLoopDeletionPass());          // Delete dead loops
     MPM.add(createLoopUnrollPass(640)); //1024, 32, 1024, 512)); //Unroll loops
     if(optLevel > 0) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
       MPM.add(createSROAPass());
 #else
       MPM.add(createSROAPass(/*RequiresDomTree*/ false));
 #endif
       MPM.add(createGVNPass());                 // Remove redundancies
     }
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
     // FIXME Workaround: we find that CustomLoopUnroll may increase register pressure greatly,
     // and it may even make som cl kernel cannot compile because of limited scratch memory for spill.
     // As we observe this under strict math. So we disable CustomLoopUnroll if strict math is enabled.
@@ -199,7 +192,7 @@ namespace gbe
 #endif
       MPM.add(createLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll loops
       if(optLevel > 0) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
         MPM.add(createSROAPass());
 #else
         MPM.add(createSROAPass(/*RequiresDomTree*/ false));
@@ -230,7 +223,7 @@ namespace gbe
   }
 
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
 #define OUTPUT_BITCODE(STAGE, MOD)  do {         \
   legacy::PassManager passes__;           \
    if (OCL_OUTPUT_LLVM_##STAGE) {                \
@@ -238,7 +231,7 @@ namespace gbe
      passes__.run(MOD);                          \
    }                                             \
  }while(0)
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
 #define OUTPUT_BITCODE(STAGE, MOD)  do {         \
    PassManager passes__;           \
    if (OCL_OUTPUT_LLVM_##STAGE) {                \
@@ -288,7 +281,7 @@ namespace gbe
     dc->process(diagnostic);
   }
 
-  bool llvmToGen(ir::Unit &unit, const char *fileName,const void* module,
+  bool llvmToGen(ir::Unit &unit, const void* module,
                  int optLevel, bool strictMath, int profiling, std::string &errors)
   {
     std::string errInfo;
@@ -296,29 +289,15 @@ namespace gbe
     if (OCL_OUTPUT_LLVM_BEFORE_LINK || OCL_OUTPUT_LLVM_AFTER_LINK || OCL_OUTPUT_LLVM_AFTER_GEN)
       o = std::unique_ptr<llvm::raw_fd_ostream>(new llvm::raw_fd_ostream(fileno(stdout), false));
 
-    // Get the module from its file
-    llvm::SMDiagnostic Err;
-
     Module* cl_mod = NULL;
     if (module) {
       cl_mod = reinterpret_cast<Module*>(const_cast<void*>(module));
-    } else if (fileName){
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
-      llvm::LLVMContext& c = GBEGetLLVMContext();
-#else
-      llvm::LLVMContext& c = llvm::getGlobalContext();
-#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
-      cl_mod = parseIRFile(fileName, Err, c).release();
-#else
-      cl_mod = ParseIRFile(fileName, Err, c);
-#endif
     }
 
     if (!cl_mod) return false;
 
     OUTPUT_BITCODE(BEFORE_LINK, (*cl_mod));
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
     legacy::PassManager passes__;
 #else
     PassManager passes__;
@@ -335,8 +314,7 @@ namespace gbe
     /* Before do any thing, we first filter in all CL functions in bitcode. */
     /* Also set unit's pointer size in runBitCodeLinker */
     M.reset(runBitCodeLinker(cl_mod, strictMath, unit));
-    if (!module)
-      delete cl_mod;
+
     if (M.get() == 0)
       return true;
 
@@ -346,7 +324,7 @@ namespace gbe
     gbeDiagnosticContext dc;
     mod.getContext().setDiagnosticHandler(&gbeDiagnosticHandler,&dc);
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
     mod.setDataLayout(DL);
 #endif
     Triple TargetTriple(mod.getTargetTriple());
@@ -357,24 +335,24 @@ namespace gbe
 
     runFuntionPass(mod, libraryInfo, DL);
     runModulePass(mod, libraryInfo, DL, optLevel, strictMath);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
     legacy::PassManager passes;
 #else
     PassManager passes;
 #endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
     passes.add(new DataLayoutPass());
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR == 35
     passes.add(new DataLayoutPass(DL));
 #else
     passes.add(new DataLayout(DL));
 #endif
     // Print the code before further optimizations
     passes.add(createIntrinsicLoweringPass());
-    passes.add(createStripAttributesPass());     // Strip unsupported attributes and calling conventions.
+    passes.add(createStripAttributesPass(true));     // Strip unsupported attributes and calling conventions.
     passes.add(createFunctionInliningPass(20000));
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
     passes.add(createSROAPass());
 #else
     passes.add(createScalarReplAggregatesPass(64, true, -1, -1, 64));
@@ -402,9 +380,17 @@ namespace gbe
     passes.add(createScalarizePass());             // Expand all vector ops
 
     if(OCL_OUTPUT_CFG)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+      passes.add(createCFGPrinterLegacyPassPass());
+#else
       passes.add(createCFGPrinterPass());
+#endif
     if(OCL_OUTPUT_CFG_ONLY)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+      passes.add(createCFGOnlyPrinterLegacyPassPass());
+#else
       passes.add(createCFGOnlyPrinterPass());
+#endif
     passes.add(createGenPass(unit));
     passes.run(mod);
     errors = dc.str();
diff --git a/backend/src/llvm/llvm_to_gen.hpp b/backend/src/llvm/llvm_to_gen.hpp
index d3928c6..d2247bb 100644
--- a/backend/src/llvm/llvm_to_gen.hpp
+++ b/backend/src/llvm/llvm_to_gen.hpp
@@ -23,7 +23,7 @@
  */
 #ifndef __GBE_IR_LLVM_TO_GEN_HPP__
 #define __GBE_IR_LLVM_TO_GEN_HPP__
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
 #include "llvm/IR/LLVMContext.h"
 #endif
 
@@ -35,12 +35,8 @@ namespace gbe {
 
   /*! Convert the LLVM IR code to a GEN IR code,
 		  optLevel 0 equal to clang -O1 and 1 equal to clang -O2*/
-  bool llvmToGen(ir::Unit &unit, const char *fileName, const void* module,
+  bool llvmToGen(ir::Unit &unit, const void* module,
                  int optLevel, bool strictMath, int profiling, std::string &errors);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
-  extern llvm::LLVMContext& GBEGetLLVMContext();
-#endif
-
 } /* namespace gbe */
 
 #endif /* __GBE_IR_LLVM_TO_GEN_HPP__ */
diff --git a/backend/src/llvm/llvm_unroll.cpp b/backend/src/llvm/llvm_unroll.cpp
index e24dc4f..107d793 100644
--- a/backend/src/llvm/llvm_unroll.cpp
+++ b/backend/src/llvm/llvm_unroll.cpp
@@ -16,7 +16,7 @@
  */
 
 #include "llvm/Config/llvm-config.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
 #include <set>
 
 #include "llvm_includes.hpp"
@@ -36,7 +36,7 @@ namespace gbe {
        LoopPass(ID) {}
 
       void getAnalysisUsage(AnalysisUsage &AU) const {
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 7)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
         AU.addRequired<LoopInfoWrapperPass>();
         AU.addPreserved<LoopInfoWrapperPass>();
 #else
@@ -47,7 +47,7 @@ namespace gbe {
         AU.addPreservedID(LoopSimplifyID);
         AU.addRequiredID(LCSSAID);
         AU.addPreservedID(LCSSAID);
-#if LLVM_VERSION_MAJOR == 3 &&  LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
         AU.addRequired<ScalarEvolutionWrapperPass>();
         AU.addPreserved<ScalarEvolutionWrapperPass>();
 #else
@@ -91,7 +91,7 @@ namespace gbe {
           assert(MD->getNumOperands() == 2 &&
                  "Unroll count hint metadata should have two operands.");
           unsigned Count;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
           Count = mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
 #else
           Count = cast<ConstantInt>(MD->getOperand(1))->getZExtValue();
@@ -105,7 +105,7 @@ namespace gbe {
       void setUnrollID(Loop *L, bool enable) {
         assert(enable);
         LLVMContext &Context = L->getHeader()->getContext();
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
         SmallVector<Metadata *, 2> forceUnroll;
         forceUnroll.push_back(MDString::get(Context, "llvm.loop.unroll.enable"));
         MDNode *forceUnrollNode = MDNode::get(Context, forceUnroll);
@@ -169,7 +169,7 @@ namespace gbe {
       // be unrolled.
       bool handleParentLoops(Loop *L, LPPassManager &LPM) {
         Loop *currL = L;
-#if LLVM_VERSION_MAJOR == 3 &&  LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
         ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
         LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 #else
@@ -205,7 +205,7 @@ namespace gbe {
           if (parentTripCount != 0 && currTripCount * parentTripCount > 32) {
             //Don't change the unrollID if doesn't force unroll.
             //setUnrollID(parentL, false);
-#if LLVM_VERSION_MAJOR == 3 &&  LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
             loopInfo.markAsRemoved(parentL);
 #else
             LPM.deleteLoopFromQueue(parentL);
@@ -238,7 +238,12 @@ namespace gbe {
         return true;
       }
 
-      virtual const char *getPassName() const {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+      virtual StringRef getPassName() const
+#else
+      virtual const char *getPassName() const
+#endif
+      {
         return "SPIR backend: custom loop unrolling pass";
       }
 
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index b1019da..75039d4 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -283,7 +283,7 @@ Documents for OpenCL application developers
 - [[OpenGL Buffer Sharing|Beignet/howto/gl-buffer-sharing-howto]]
 - [[Video Motion Estimation|Beignet/howto/video-motion-estimation-howto]]
 - [[Stand Alone Unit Test|Beignet/howto/stand-alone-utest-howto]]
-- [[Android build|Beignet/android-build-howto]]
+- [[Android build|Beignet/howto/android-build-howto]]
 
 The wiki URL is as below:
 [http://www.freedesktop.org/wiki/Software/Beignet/](http://www.freedesktop.org/wiki/Software/Beignet/)
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
index 601a07f..5ec3a45 100644
--- a/docs/NEWS.mdwn
+++ b/docs/NEWS.mdwn
@@ -1,5 +1,8 @@
 # News
 
+## Oct 26, 2017
+[Beignet 1.3.2](https://01.org/beignet/downloads/beignet-1.3.2-2017-10-26) is released. This is a bug-fix release.
+
 ## Mar 13, 2017
 [Beignet 1.3.1](https://01.org/beignet/downloads/beignet-1.3.1-2017-03-13) is released. This is a bug-fix release.
 
diff --git a/docs/howto/gl-buffer-sharing-howto.mdwn b/docs/howto/gl-buffer-sharing-howto.mdwn
index 6b3a751..5fb3d87 100644
--- a/docs/howto/gl-buffer-sharing-howto.mdwn
+++ b/docs/howto/gl-buffer-sharing-howto.mdwn
@@ -1,16 +1,16 @@
 GL Buffer Sharing HowTo
 =========================
 
-Beignet now support cl_khr_gl_sharing partially(the most commonly used part), which is an offcial
+Beignet now supports cl_khr_gl_sharing partially (the most commonly used part), which is an official
 extension of Khronos OpenCL. With this extension, Beignet can create memory object from OpenGL/OpenGL
-ES buffer, texture or renderbuffer object with zero-copy. Currently, we just support create memory
-object from GL buffer object or 2d texture(the most common target type). We will support creating
+ES buffer, texture or renderbuffer object with zero-copy. Currently, we just support creating memory
+object from GL buffer object or 2d texture (the most common target type). We will support creating
 from other GL target type if necessary.
 
 Prerequisite
 ------------
 
-Mesa GL library and Mesa EGL libray are required. Both version should be greater or equal than
+Mesa GL library and Mesa EGL library are required. Both version should be greater or equal than
 13.0.0.
 
 Steps
@@ -18,7 +18,7 @@ Steps
 
 A typical procedure of using cl_khr_gl_sharing is as below:
 
-- Basic egl routine(eglGetDisplay, eglInitialize, eglCreateContext...).
+- Basic egl routine (eglGetDisplay, eglInitialize, eglCreateContext...).
 
 - Create GL 2d texture in normal OpenGL way.
 
@@ -44,7 +44,7 @@ A typical procedure of using cl_khr_gl_sharing is as below:
 
 - Access this cl image object as an usual cl image object.
 
-- Relase cl image object by calling clEnqueueReleaseGLObjects.
+- Release cl image object by calling clEnqueueReleaseGLObjects.
 
 - Ensure any pending OpenCL operations which access this cl image object have completed by clFinish.
 
@@ -54,7 +54,7 @@ Sample code
 -----------
 
 We have developed an example showing how to utilize cl_khr_gl_sharing in examples/gl_buffer_sharing
-directory. A cl image object is created from a gl 2d texutre and processed by OpenCL kernel, then
+directory. A cl image object is created from a gl 2d texture and processed by OpenCL kernel, then
 is shown on screen.
 
 Steps to build and run this example:
diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h
index 47bae46..3cb8515 100644
--- a/include/CL/cl_intel.h
+++ b/include/CL/cl_intel.h
@@ -197,6 +197,12 @@ typedef CL_API_ENTRY cl_int
 						      void* /*param_value*/,
 						      size_t* /*param_value_size_ret*/ );
 #endif
+
+/* cl_intel_required_subgroup_size extension*/
+#define CL_DEVICE_SUB_GROUP_SIZES_INTEL                 0x4108
+#define CL_KERNEL_SPILL_MEM_SIZE_INTEL                  0x4109
+#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL          0x410A
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/kernels/compiler_if_else.cl b/kernels/compiler_if_else.cl
index 7ae8f99..c8b2cb8 100644
--- a/kernels/compiler_if_else.cl
+++ b/kernels/compiler_if_else.cl
@@ -5,10 +5,8 @@ compiler_if_else(__global int *src, __global int *dst)
   dst[id] = src[id];
   if (dst[id] >= 0) {
     dst[id] = src[id+1];
-    src[id] = 1;
   } else {
     dst[id]--;
-    src[id] = 2;
   }
 }
 
diff --git a/kernels/compiler_remove_negative_add.cl b/kernels/compiler_remove_negative_add.cl
new file mode 100644
index 0000000..d6f7270
--- /dev/null
+++ b/kernels/compiler_remove_negative_add.cl
@@ -0,0 +1,4 @@
+kernel void compiler_remove_negative_add(global float *src, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = exp2(-src[i]);
+};
diff --git a/kernels/compiler_reqd_sub_group_size.cl b/kernels/compiler_reqd_sub_group_size.cl
new file mode 100644
index 0000000..0ce70e9
--- /dev/null
+++ b/kernels/compiler_reqd_sub_group_size.cl
@@ -0,0 +1,5 @@
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
+__kernel void compiler_reqd_sub_group_size(global int* src)
+{
+
+}
diff --git a/kernels/compiler_sqrt_div.cl b/kernels/compiler_sqrt_div.cl
new file mode 100644
index 0000000..7d5a2f0
--- /dev/null
+++ b/kernels/compiler_sqrt_div.cl
@@ -0,0 +1,8 @@
+kernel void compiler_sqrt_div(global float *src, global float *dst) {
+  int i = get_global_id(0);
+  float tmp = sqrt(src[i]);
+  dst[i*4] = 1.0f/tmp;
+  dst[i*4+1] = (float)i/tmp;
+  dst[i*4+2] = 2.0f/tmp;
+  dst[i*4+3] = 1.0f/tmp  + tmp;
+};
diff --git a/kernels/test_fill_gl_image.cl b/kernels/test_fill_gl_image.cl
index 4250a57..7b5dce7 100644
--- a/kernels/test_fill_gl_image.cl
+++ b/kernels/test_fill_gl_image.cl
@@ -1,5 +1,5 @@
 __kernel void
-test_fill_gl_image(image2d_t img, int color)
+test_fill_gl_image(write_only image2d_t img, int color)
 {
 	int2 coord;
         float4 color_v4;
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f3c4632..0cd41fc 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,13 +18,13 @@ foreach (KF ${KERNEL_FILES})
       OUTPUT ${output_file}
       COMMAND rm -rf ${output_file}
       COMMAND ${GBE_BIN_GENERATER} -s -o${output_file} -t${GEN_PCI_ID} ${input_file}
-      DEPENDS ${input_file} ${GBE_BIN_FILE})
+      DEPENDS ${input_file} ${GBE_BIN_FILE} beignet_bitcode)
   else(GEN_PCI_ID)
     add_custom_command(
       OUTPUT ${output_file}
       COMMAND rm -rf ${output_file}
       COMMAND ${GBE_BIN_GENERATER} -s -o${output_file} ${input_file}
-      DEPENDS ${input_file} ${GBE_BIN_FILE})
+      DEPENDS ${input_file} ${GBE_BIN_FILE} beignet_bitcode)
   endif(GEN_PCI_ID)
 endforeach (KF)
 endmacro (MakeKernelBinStr)
diff --git a/src/cl_context.c b/src/cl_context.c
index 1ba2302..fef36f8 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -358,29 +358,37 @@ cl_context_delete(cl_context ctx)
   if (UNLIKELY(ctx == NULL))
     return;
 
+  int internal_ctx_refs = 1;
+  // determine how many ctx refs are held by internal_prgs and built_in_prgs
+  for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
+    if (ctx->internal_kernels[i] && ctx->internal_prgs[i])
+      ++internal_ctx_refs;
+  }
+
   /* We are not done yet */
-  if (CL_OBJECT_DEC_REF(ctx) > 1)
+  if (CL_OBJECT_DEC_REF(ctx) > internal_ctx_refs)
     return;
 
+  // create a temporary extra ref here so cl_program_delete doesn't
+  // attempt a recursive full cl_context_delete when cleaning up
+  // our internal programs
+  CL_OBJECT_INC_REF(ctx);
+
   /* delete the internal programs. */
   for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
     if (ctx->internal_kernels[i]) {
-      cl_kernel_delete(ctx->internal_kernels[i]);
+      cl_kernel k = ctx->internal_kernels[i];
       ctx->internal_kernels[i] = NULL;
+      cl_kernel_delete(k);
 
       assert(ctx->internal_prgs[i]);
-      cl_program_delete(ctx->internal_prgs[i]);
+      cl_program p = ctx->internal_prgs[i];
       ctx->internal_prgs[i] = NULL;
-    }
-
-    if (ctx->built_in_kernels[i]) {
-      cl_kernel_delete(ctx->built_in_kernels[i]);
-      ctx->built_in_kernels[i] = NULL;
+      cl_program_delete(p);
     }
   }
 
-  cl_program_delete(ctx->built_in_prgs);
-  ctx->built_in_prgs = NULL;
+  CL_OBJECT_DEC_REF(ctx);
 
   cl_free(ctx->prop_user);
   cl_free(ctx->devices);
@@ -427,32 +435,18 @@ cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
 
     ctx->internal_prgs[index]->is_built = 1;
 
-    /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, different kernel. */
-    if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
-      int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
-      for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
-        if (index != i) {
-          assert(ctx->internal_prgs[i] == NULL);
-          assert(ctx->internal_kernels[i] == NULL);
-          cl_program_add_ref(ctx->internal_prgs[index]);
-          ctx->internal_prgs[i] = ctx->internal_prgs[index];
-        }
-
-        if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
-          ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+    if (index == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+      ctx->internal_kernels[index] = cl_program_create_kernel(ctx->internal_prgs[index],
                                                               "__cl_fill_region_align8_2", NULL);
-        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
-          ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+    } else if (index == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+      ctx->internal_kernels[index] = cl_program_create_kernel(ctx->internal_prgs[index],
                                                               "__cl_fill_region_align8_4", NULL);
-        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
-          ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+    } else if (index == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+      ctx->internal_kernels[index] = cl_program_create_kernel(ctx->internal_prgs[index],
                                                               "__cl_fill_region_align8_8", NULL);
-        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
-          ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+    } else if (index == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+      ctx->internal_kernels[index] = cl_program_create_kernel(ctx->internal_prgs[index],
                                                               "__cl_fill_region_align8_16", NULL);
-        } else
-          assert(0);
-      }
     } else {
       ctx->internal_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
     }
diff --git a/src/cl_context.h b/src/cl_context.h
index 4812afd..a46f2f5 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -120,8 +120,6 @@ struct _cl_context {
                                     /* All programs internal used, for example clEnqueuexxx api use */
   cl_kernel  internal_kernels[CL_INTERNAL_KERNEL_MAX];
                                     /* All kernels  for clenqueuexxx api, for example clEnqueuexxx api use */
-  cl_program built_in_prgs;  /*all built-in kernels belongs to this program only*/
-  cl_kernel  built_in_kernels[CL_INTERNAL_KERNEL_MAX];
   uint32_t ver;                     /* Gen version */
   struct _cl_context_prop props;
   cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index f3c5204..123b619 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -247,7 +247,9 @@
 /* SKL */
 #define PCI_CHIP_SKYLAKE_ULT_GT1	0x1906   /* Intel(R) Skylake ULT - GT1 */
 #define PCI_CHIP_SKYLAKE_ULT_GT2	0x1916   /* Intel(R) Skylake ULT - GT2 */
-#define PCI_CHIP_SKYLAKE_ULT_GT3	0x1926   /* Intel(R) Skylake ULT - GT3 */
+#define PCI_CHIP_SKYLAKE_ULT_GT3	0x1923   /* Intel(R) Skylake ULT - GT3 */
+#define PCI_CHIP_SKYLAKE_ULT_GT3E1	0x1926   /* Intel(R) Skylake ULT - GT3E */
+#define PCI_CHIP_SKYLAKE_ULT_GT3E2	0x1927   /* Intel(R) Skylake ULT - GT3E */
 #define PCI_CHIP_SKYLAKE_ULT_GT2F	0x1921   /* Intel(R) Skylake ULT - GT2F */
 #define PCI_CHIP_SKYLAKE_ULX_GT1	0x190E   /* Intel(R) Skylake ULX - GT1 */
 #define PCI_CHIP_SKYLAKE_ULX_GT2	0x191E   /* Intel(R) Skylake ULX - GT2 */
@@ -284,6 +286,8 @@
 
 #define IS_SKL_GT3(devid)               \
   (devid == PCI_CHIP_SKYLAKE_ULT_GT3 ||   \
+   devid == PCI_CHIP_SKYLAKE_ULT_GT3E1 ||   \
+   devid == PCI_CHIP_SKYLAKE_ULT_GT3E2 ||   \
    devid == PCI_CHIP_SKYLAKE_HALO_GT3 || \
    devid == PCI_CHIP_SKYLAKE_SRV_GT3 || \
    devid == PCI_CHIP_SKYLAKE_MEDIA_SRV_GT3)
@@ -361,7 +365,14 @@
 
 #define IS_KABYLAKE(devid) (IS_KBL_GT1(devid) || IS_KBL_GT15(devid) || IS_KBL_GT2(devid) || IS_KBL_GT3(devid) || IS_KBL_GT4(devid))
 
-#define IS_GEN9(devid)     (IS_SKYLAKE(devid) || IS_BROXTON(devid) || IS_KABYLAKE(devid))
+#define PCI_CHIP_GLK_3x6     0x3184
+#define PCI_CHIP_GLK_2x6     0x3185
+
+#define IS_GEMINILAKE(devid)      \
+  (devid == PCI_CHIP_GLK_3x6 ||   \
+   devid == PCI_CHIP_GLK_2x6)
+
+#define IS_GEN9(devid)     (IS_SKYLAKE(devid) || IS_BROXTON(devid) || IS_KABYLAKE(devid) || IS_GEMINILAKE(devid))
 
 #define MAX_OCLVERSION(devid) (IS_GEN9(devid) ? 200 : 120)
 
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index d4f4208..76e77d7 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -254,6 +254,26 @@ static struct _cl_device_id intel_kbl_gt4_device = {
 #include "cl_gen9_device.h"
 };
 
+static struct _cl_device_id intel_glk18eu_device = {
+  .max_compute_unit = 18,
+  .max_thread_per_unit = 6,
+  .sub_slice_count = 3,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen9_device.h"
+};
+
+static struct _cl_device_id intel_glk12eu_device = {
+  .max_compute_unit = 12,
+  .max_thread_per_unit = 6,
+  .sub_slice_count = 2,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen9_device.h"
+};
+
 LOCAL cl_device_id
 cl_get_gt_device(cl_device_type device_type)
 {
@@ -585,6 +605,10 @@ skl_gt2_break:
 
     case PCI_CHIP_SKYLAKE_ULT_GT3:
       DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake ULT GT3");
+    case PCI_CHIP_SKYLAKE_ULT_GT3E1:
+      DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake ULT GT3E");
+    case PCI_CHIP_SKYLAKE_ULT_GT3E2:
+      DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake ULT GT3E");
     case PCI_CHIP_SKYLAKE_HALO_GT3:
       DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake Halo GT3");
     case PCI_CHIP_SKYLAKE_SRV_GT3:
@@ -737,6 +761,26 @@ kbl_gt4_break:
       cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
       break;
 
+    case PCI_CHIP_GLK_3x6:
+      DECL_INFO_STRING(glk18eu_break, intel_bxt18eu_device, name, "Intel(R) HD Graphics Geminilake(3x6)");
+glk18eu_break:
+      intel_glk18eu_device.device_id = device_id;
+      intel_glk18eu_device.platform = cl_get_platform_default();
+      ret = &intel_glk18eu_device;
+      cl_intel_platform_get_default_extension(ret);
+      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+      break;
+
+    case PCI_CHIP_GLK_2x6:
+      DECL_INFO_STRING(glk12eu_break, intel_bxt12eu_device, name, "Intel(R) HD Graphics Geminilake(2x6)");
+glk12eu_break:
+      intel_glk12eu_device.device_id = device_id;
+      intel_glk12eu_device.platform = cl_get_platform_default();
+      ret = &intel_glk12eu_device;
+      cl_intel_platform_get_default_extension(ret);
+      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+      break;
+
     case PCI_CHIP_SANDYBRIDGE_BRIDGE:
     case PCI_CHIP_SANDYBRIDGE_GT1:
     case PCI_CHIP_SANDYBRIDGE_GT2:
@@ -942,7 +986,9 @@ LOCAL cl_bool is_gen_device(cl_device_id device) {
          device == &intel_kbl_gt15_device ||
          device == &intel_kbl_gt2_device ||
          device == &intel_kbl_gt3_device ||
-         device == &intel_kbl_gt4_device;
+         device == &intel_kbl_gt4_device ||
+         device == &intel_glk18eu_device ||
+         device == &intel_glk12eu_device;
 }
 
 LOCAL cl_int
@@ -1333,6 +1379,10 @@ cl_get_device_info(cl_device_id     device,
       src_ptr = device->driver_version;
       src_size = device->driver_version_sz;
       break;
+    case CL_DEVICE_SUB_GROUP_SIZES_INTEL:
+      src_ptr = device->sub_group_sizes;
+      src_size = device->sub_group_sizes_sz;
+      break;
 
     default:
       return CL_INVALID_VALUE;
@@ -1363,7 +1413,8 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
         || device == &intel_skl_gt3_device || device == &intel_skl_gt4_device
         || device == &intel_bxt18eu_device || device == &intel_bxt12eu_device || device == &intel_kbl_gt1_device
         || device == &intel_kbl_gt2_device || device == &intel_kbl_gt3_device
-        || device == &intel_kbl_gt4_device || device == &intel_kbl_gt15_device) {
+        || device == &intel_kbl_gt4_device || device == &intel_kbl_gt15_device
+        || device == &intel_glk18eu_device || device == &intel_glk12eu_device) {
     *ver = 9;
   } else
     return CL_INVALID_VALUE;
@@ -1475,6 +1526,7 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
     DECL_FIELD(COMPILE_WORK_GROUP_SIZE, kernel->compile_wg_sz)
     DECL_FIELD(PRIVATE_MEM_SIZE, kernel->stack_size)
     case CL_KERNEL_GLOBAL_WORK_SIZE:
+    {
       dimension = cl_check_builtin_kernel_dimension(kernel, device);
       if ( !dimension ) return CL_INVALID_VALUE;
       if (param_value_size_ret != NULL)
@@ -1492,6 +1544,18 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
         return CL_SUCCESS;
       }
       return CL_SUCCESS;
+    }
+    case CL_KERNEL_SPILL_MEM_SIZE_INTEL:
+    {
+      if (param_value && param_value_size < sizeof(cl_ulong))
+        return CL_INVALID_VALUE;
+      if (param_value_size_ret != NULL)
+        *param_value_size_ret = sizeof(cl_ulong);
+      if (param_value)
+        *(cl_ulong*)param_value = (cl_ulong)interp_kernel_get_scratch_size(kernel->opaque);
+      return CL_SUCCESS;
+    }
+
     default:
       return CL_INVALID_VALUE;
   };
@@ -1575,6 +1639,16 @@ cl_get_kernel_subgroup_info(cl_kernel kernel,
       }
       break;
     }
+    case CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL:
+    {
+      if (param_value && param_value_size < sizeof(size_t))
+        return CL_INVALID_VALUE;
+      if (param_value_size_ret != NULL)
+        *param_value_size_ret = sizeof(size_t);
+      if (param_value)
+        *(size_t*)param_value = interp_kernel_get_simd_width(kernel->opaque);
+      return CL_SUCCESS;
+    }
     default:
       return CL_INVALID_VALUE;
   };
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index 6b8f2eb..93bd2f1 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -136,6 +136,8 @@ struct _cl_device_id {
   uint32_t atomic_test_result;
   cl_uint image_pitch_alignment;
   cl_uint image_base_address_alignment;
+  size_t sub_group_sizes[2];
+  size_t sub_group_sizes_sz;
 
   //inited as NULL, created only when cmrt kernel is used
   void* cmrt_device;  //realtype: CmDevice*
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index a3c71ca..56099ad 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -42,7 +42,7 @@ void check_opt1_extension(cl_extensions_t *extensions)
   {
     if (id == EXT_ID(khr_icd))
       extensions->extensions[id].base.ext_enabled = 1;
-#if  LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
     if (id == EXT_ID(khr_spir))
       extensions->extensions[id].base.ext_enabled = 1;
 #endif
@@ -69,8 +69,16 @@ check_intel_extension(cl_extensions_t *extensions)
 {
   int id;
   for(id = INTEL_EXT_START_ID; id <= INTEL_EXT_END_ID; id++)
+  {
     if(id != EXT_ID(intel_motion_estimation))
       extensions->extensions[id].base.ext_enabled = 1;
+    if(id == EXT_ID(intel_required_subgroup_size))
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR > 40
+      extensions->extensions[id].base.ext_enabled = 1;
+#else
+      extensions->extensions[id].base.ext_enabled = 0;
+#endif
+  }
 }
 
 void
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index 52a4953..e6c64ba 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -29,7 +29,8 @@
   DECL_EXT(intel_accelerator) \
   DECL_EXT(intel_motion_estimation) \
   DECL_EXT(intel_subgroups) \
-  DECL_EXT(intel_subgroups_short)
+  DECL_EXT(intel_subgroups_short) \
+  DECL_EXT(intel_required_subgroup_size)
 
 #define DECL_GL_EXTENSIONS \
   DECL_EXT(khr_gl_sharing)\
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
index f190b0d..0379b3e 100644
--- a/src/cl_gbe_loader.cpp
+++ b/src/cl_gbe_loader.cpp
@@ -24,6 +24,7 @@
 
 //function pointer from libgbe.so
 gbe_program_new_from_source_cb *compiler_program_new_from_source = NULL;
+gbe_program_new_from_llvm_file_cb *compiler_program_new_from_llvm_file = NULL;
 gbe_program_compile_from_source_cb *compiler_program_compile_from_source = NULL;
 gbe_program_new_gen_program_cb *compiler_program_new_gen_program = NULL;
 gbe_program_link_program_cb *compiler_program_link_program = NULL;
@@ -298,6 +299,10 @@ struct GbeLoaderInitializer
       if (compiler_program_new_from_source == NULL)
         return;
 
+      compiler_program_new_from_llvm_file = *(gbe_program_new_from_llvm_file_cb **)dlsym(dlhCompiler, "gbe_program_new_from_llvm_file");
+      if (compiler_program_new_from_llvm_file == NULL)
+        return;
+
       compiler_program_compile_from_source = *(gbe_program_compile_from_source_cb **)dlsym(dlhCompiler, "gbe_program_compile_from_source");
       if (compiler_program_compile_from_source == NULL)
         return;
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
index df885d2..df85f1e 100644
--- a/src/cl_gbe_loader.h
+++ b/src/cl_gbe_loader.h
@@ -25,6 +25,7 @@
 extern "C" {
 #endif
 extern gbe_program_new_from_source_cb *compiler_program_new_from_source;
+extern gbe_program_new_from_llvm_file_cb *compiler_program_new_from_llvm_file;
 extern gbe_program_compile_from_source_cb *compiler_program_compile_from_source;
 extern gbe_program_new_gen_program_cb *compiler_program_new_gen_program;
 extern gbe_program_link_program_cb *compiler_program_link_program;
diff --git a/src/cl_gl_api.c b/src/cl_gl_api.c
index 897edb4..61b3ab8 100644
--- a/src/cl_gl_api.c
+++ b/src/cl_gl_api.c
@@ -35,6 +35,8 @@
 #include "cl_sampler.h"
 #include "cl_alloc.h"
 #include "cl_utils.h"
+#include "cl_enqueue.h"
+#include "cl_event.h"
 
 #include "CL/cl.h"
 #include "CL/cl_gl.h"
@@ -134,6 +136,80 @@ cl_int clEnqueueAcquireGLObjects (cl_command_queue command_queue,
                                   cl_event *event)
 {
   cl_int err = CL_SUCCESS;
+  cl_int e_status, i;
+  cl_event e = NULL;
+  enqueue_data *data = NULL;
+
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+
+    if (UNLIKELY(command_queue->ctx->props.gl_type == CL_GL_NOSHARE)) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+
+    if ((num_objects == 0 && mem_objects != NULL) ||
+        (num_objects > 0 && mem_objects == NULL)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    for (i = 0; i < num_objects; i++) {
+      if (!cl_mem_image(mem_objects[i])) {
+        err = CL_INVALID_MEM_OBJECT;
+        break;
+      }
+      if (!IS_GL_IMAGE(mem_objects[i])) {
+        err = CL_INVALID_GL_OBJECT;
+        break;
+      }
+    }
+    if (err != CL_SUCCESS) {
+      break;
+    }
+
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_ACQUIRE_GL_OBJECTS, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+
+    e_status = cl_event_is_ready(e);
+
+    data = &e->exec_data;
+    data->type = EnqueueReturnSuccesss;
+
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_SUBMITTED, CL_TRUE); // Submit to get the address.
+      if (err != CL_SUCCESS) {
+        break;
+      }
+
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while (0);
+
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+
   return err;
 }
 
@@ -146,5 +222,79 @@ cl_int clEnqueueReleaseGLObjects (cl_command_queue command_queue,
                                   cl_event *event)
 {
   cl_int err = CL_SUCCESS;
+  cl_int e_status, i;
+  cl_event e = NULL;
+  enqueue_data *data = NULL;
+
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+
+    if (UNLIKELY(command_queue->ctx->props.gl_type == CL_GL_NOSHARE)) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+
+    if ((num_objects == 0 && mem_objects != NULL) ||
+        (num_objects > 0 && mem_objects == NULL)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    for (i = 0; i < num_objects; i++) {
+      if (!cl_mem_image(mem_objects[i])) {
+        err = CL_INVALID_MEM_OBJECT;
+        break;
+      }
+      if (!IS_GL_IMAGE(mem_objects[i])) {
+        err = CL_INVALID_GL_OBJECT;
+        break;
+      }
+    }
+    if (err != CL_SUCCESS) {
+      break;
+    }
+
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_ACQUIRE_GL_OBJECTS, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+
+    e_status = cl_event_is_ready(e);
+
+    data = &e->exec_data;
+    data->type = EnqueueReturnSuccesss;
+
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_SUBMITTED, CL_TRUE); // Submit to get the address.
+      if (err != CL_SUCCESS) {
+        break;
+      }
+
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while (0);
+
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+
   return err;
 }
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index ca4f3c5..f6cb5f8 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -153,4 +153,6 @@ DECL_INFO_STRING(spir_versions, "1.2")
 .partition_type = {0},
 .image_pitch_alignment = 1,
 .image_base_address_alignment = 4096,
+.sub_group_sizes = {8, 16},
+.sub_group_sizes_sz = sizeof(size_t) * 2,
 .cmrt_device = NULL
diff --git a/src/cl_program.c b/src/cl_program.c
index 363aed5..3cb5906 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -371,6 +371,7 @@ cl_program_create_with_built_in_kernles(cl_context     ctx,
                                   cl_int *             errcode_ret)
 {
   cl_int err = CL_SUCCESS;
+  cl_program built_in_prgs = NULL;
 
   assert(ctx);
   INVALID_DEVICE_IF (num_devices != 1);
@@ -382,54 +383,24 @@ cl_program_create_with_built_in_kernles(cl_context     ctx,
   extern size_t cl_internal_built_in_kernel_str_size;
   char* p_built_in_kernel_str =cl_internal_built_in_kernel_str;
 
-  ctx->built_in_prgs = cl_program_create_from_binary(ctx, 1,
-                                                          &ctx->devices[0],
-                                                          (size_t*)&cl_internal_built_in_kernel_str_size,
-                                                          (const unsigned char **)&p_built_in_kernel_str,
-                                                          &binary_status, &err);
-  if (!ctx->built_in_prgs)
+  built_in_prgs = cl_program_create_from_binary(ctx, 1,
+                                                &ctx->devices[0],
+                                                (size_t*)&cl_internal_built_in_kernel_str_size,
+                                                (const unsigned char **)&p_built_in_kernel_str,
+                                                &binary_status, &err);
+  if (!built_in_prgs)
     return NULL;
 
-  err = cl_program_build(ctx->built_in_prgs, NULL);
+  err = cl_program_build(built_in_prgs, NULL);
   if (err != CL_SUCCESS)
     return NULL;
 
-  ctx->built_in_prgs->is_built = 1;
-
-  char delims[] = ";";
-  char* saveptr = NULL;
-  char* local_kernel_names;
-  char* kernel = NULL;
-  char* matched_kernel;
-  int i = 0;
-
-  //copy the content to local_kernel_names to protect the kernel_names.
-  TRY_ALLOC(local_kernel_names, cl_calloc(strlen(kernel_names)+1, sizeof(char) ) );
-  memcpy(local_kernel_names, kernel_names, strlen(kernel_names)+1);
-
-  kernel = strtok_r( local_kernel_names, delims , &saveptr);
-  while( kernel != NULL ) {
-    matched_kernel = strstr(ctx->devices[0]->built_in_kernels, kernel);
-    if(matched_kernel){
-      for (i = 0; i < ctx->built_in_prgs->ker_n; ++i) {
-        assert(ctx->built_in_prgs->ker[i]);
-        const char *ker_name = cl_kernel_get_name(ctx->built_in_prgs->ker[i]);
-        if (ker_name != NULL && strcmp(ker_name, kernel) == 0) {
-          break;
-        }
-      }
-
-      ctx->built_in_kernels[i] = cl_program_create_kernel(ctx->built_in_prgs, kernel, NULL);
-    }
-    kernel = strtok_r((char*)saveptr , delims, &saveptr );
-  }
-
-  cl_free(local_kernel_names);
+  built_in_prgs->is_built = 1;
 
 exit:
   if (errcode_ret)
     *errcode_ret = err;
-  return ctx->built_in_prgs;
+  return built_in_prgs;
 error:
   goto exit;
 
@@ -458,7 +429,7 @@ cl_program_create_from_llvm(cl_context ctx,
       goto error;
   }
 
-  program->opaque = compiler_program_new_from_llvm(ctx->devices[0]->device_id, file_name, NULL, NULL, NULL, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1, NULL);
+  program->opaque = compiler_program_new_from_llvm_file(ctx->devices[0]->device_id, file_name, program->build_log_max_sz, program->build_log, &program->build_log_sz);
   if (UNLIKELY(program->opaque == NULL)) {
     err = CL_INVALID_PROGRAM;
     goto error;
@@ -675,7 +646,8 @@ cl_program_build(cl_program p, const char *options)
     memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
     copyed += sz;
   }
-  if ((err = get_program_global_data(p)) != CL_SUCCESS)
+  uint32_t ocl_version = interp_kernel_get_ocl_version(interp_program_get_kernel(p->opaque, 0));
+  if (ocl_version >= 200 && (err = get_program_global_data(p)) != CL_SUCCESS)
     goto error;
 
   p->is_built = 1;
@@ -784,7 +756,8 @@ cl_program_link(cl_context            context,
     copyed += sz;
   }
 
-  if ((err = get_program_global_data(p)) != CL_SUCCESS)
+  uint32_t ocl_version = interp_kernel_get_ocl_version(interp_program_get_kernel(p->opaque, 0));
+  if (ocl_version >= 200 && (err = get_program_global_data(p)) != CL_SUCCESS)
     goto error;
 
 done:
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index b8a1b52..3caf737 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -137,19 +137,28 @@ return 1;
 static int
 intel_driver_context_init(intel_driver_t *driver)
 {
-driver->ctx = drm_intel_gem_context_create(driver->bufmgr);
-if (!driver->ctx)
-  return 0;
-driver->null_bo = NULL;
+  driver->ctx = drm_intel_gem_context_create(driver->bufmgr);
+  if (!driver->ctx)
+    return 0;
+  driver->null_bo = NULL;
 #ifdef HAS_BO_SET_SOFTPIN
-drm_intel_bo *bo = dri_bo_alloc(driver->bufmgr, "null_bo", 64*1024, 4096);
-drm_intel_bo_set_softpin_offset(bo, 0);
-// don't reuse it, that would make two bo trying to bind to same address,
-// which is un-reasonable.
-drm_intel_bo_disable_reuse(bo);
-driver->null_bo = bo;
+  drm_intel_bo *bo = dri_bo_alloc(driver->bufmgr, "null_bo", 64*1024, 4096);
+  drm_intel_bo_set_softpin_offset(bo, 0);
+  // don't reuse it, that would make two bo trying to bind to same address,
+  // which is un-reasonable.
+  drm_intel_bo_disable_reuse(bo);
+
+  drm_intel_bo_map(bo, 1);
+  *(uint32_t *)bo->virtual = MI_BATCH_BUFFER_END;
+  drm_intel_bo_unmap(bo);
+
+  if (drm_intel_gem_bo_context_exec(bo, driver->ctx, 0, 0) == 0) {
+    driver->null_bo = bo;
+  } else {
+    drm_intel_bo_unreference(bo);
+  }
 #endif
-return 1;
+    return 1;
 }
 
 static void
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 283b07a..4f6989d 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -2529,6 +2529,8 @@ intel_set_gpgpu_callbacks(int device_id)
     intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8;
     intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; //SKL need not restore SLM, same as gen7
     intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
+    if(IS_GEMINILAKE(device_id))
+      intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
     intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen9;
     intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen9;
     intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen8;
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 43cf7f3..300d87a 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -283,6 +283,7 @@ set (utests_sources
   compiler_sub_group_shuffle_down.cpp
   compiler_sub_group_shuffle_up.cpp
   compiler_sub_group_shuffle_xor.cpp
+  compiler_reqd_sub_group_size.cpp
   builtin_global_linear_id.cpp
   builtin_local_linear_id.cpp
   multi_queue_events.cpp
@@ -297,7 +298,9 @@ set (utests_sources
   compiler_generic_pointer.cpp
   runtime_pipe_query.cpp
   compiler_pipe_builtin.cpp
-  compiler_device_enqueue.cpp)
+  compiler_device_enqueue.cpp
+  compiler_sqrt_div.cpp
+  compiler_remove_negative_add.cpp)
 
 if (LLVM_VERSION_NODOT VERSION_GREATER 34)
   SET(utests_sources
diff --git a/utests/compiler_if_else.cpp b/utests/compiler_if_else.cpp
index e38b23f..d65ecd3 100644
--- a/utests/compiler_if_else.cpp
+++ b/utests/compiler_if_else.cpp
@@ -21,43 +21,38 @@ static void compiler_if_else(void)
   OCL_NDRANGE(1);
 
   // First control flow
-  OCL_MAP_BUFFER(0);
   OCL_MAP_BUFFER(1);
   for (uint32_t i = 0; i < 16; ++i) {
     OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
-    OCL_ASSERT(((int32_t*)buf_data[0])[i] == 1);
   }
+  OCL_UNMAP_BUFFER(1);
 
   // Second control flow
+  OCL_MAP_BUFFER(0);
   for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -1;
   OCL_UNMAP_BUFFER(0);
-  OCL_UNMAP_BUFFER(1);
   OCL_NDRANGE(1);
-  OCL_MAP_BUFFER(0);
   OCL_MAP_BUFFER(1);
   for (uint32_t i = 0; i < 16; ++i) {
     OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
-    OCL_ASSERT(((int32_t*)buf_data[0])[i] == 2);
   }
+  OCL_UNMAP_BUFFER(1);
 
   // Third control flow
+  OCL_MAP_BUFFER(0);
   for (uint32_t i = 0; i < 4; ++i) ((int32_t*)buf_data[0])[i] = 2;
   for (uint32_t i = 4; i < n; ++i) ((int32_t*)buf_data[0])[i] = -1;
   OCL_UNMAP_BUFFER(0);
-  OCL_UNMAP_BUFFER(1);
   OCL_NDRANGE(1);
-  OCL_MAP_BUFFER(0);
   OCL_MAP_BUFFER(1);
   for (uint32_t i = 0; i < 3; ++i) {
     OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
-    OCL_ASSERT(((int32_t*)buf_data[0])[i] == 1);
   }
   OCL_ASSERT(((int32_t*)buf_data[1])[3] == -1);
-  OCL_ASSERT(((int32_t*)buf_data[0])[3] == 1);
   for (uint32_t i = 4; i < 16; ++i) {
     OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
-    OCL_ASSERT(((int32_t*)buf_data[0])[i] == 2);
   }
+  OCL_UNMAP_BUFFER(1);
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_if_else);
diff --git a/utests/compiler_remove_negative_add.cpp b/utests/compiler_remove_negative_add.cpp
new file mode 100644
index 0000000..2b5df73
--- /dev/null
+++ b/utests/compiler_remove_negative_add.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+#include <cmath>
+
+void compiler_remove_negative_add(void) {
+  const int n = 1024;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_remove_negative_add");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int j = 0; j < 1024; j++) {
+    OCL_MAP_BUFFER(0);
+    for (int i = 0; i < n; ++i) {
+      src[i] = ((float *)buf_data[0])[i] = (j * n + i + 1) * 0.001f;
+    }
+    OCL_UNMAP_BUFFER(0);
+
+    OCL_NDRANGE(1);
+
+    OCL_MAP_BUFFER(1);
+    float *dst = (float *)buf_data[1];
+    for (int i = 0; i < n; ++i) {
+      float cpu = exp2(-src[i]);
+      float gpu = dst[i];
+      if (fabsf(cpu - gpu) >= 1e-3) {
+        printf("%f %f %f", src[i], cpu, gpu);
+        OCL_ASSERT(0);
+      }
+    }
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_remove_negative_add);
diff --git a/utests/compiler_reqd_sub_group_size.cpp b/utests/compiler_reqd_sub_group_size.cpp
new file mode 100644
index 0000000..37d96fe
--- /dev/null
+++ b/utests/compiler_reqd_sub_group_size.cpp
@@ -0,0 +1,46 @@
+#include "utest_helper.hpp"
+#include<string>
+#include<sstream>
+#include<iostream>
+
+using namespace std;
+
+void compiler_reqd_sub_group_size(void)
+{
+  if (!cl_check_reqd_subgroup())
+    return;
+
+  size_t param_value_size;
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_SUB_GROUP_SIZES_INTEL,
+           0, NULL, &param_value_size);
+
+  size_t* param_value = new size_t[param_value_size];
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_SUB_GROUP_SIZES_INTEL,
+           param_value_size, param_value, NULL);
+
+  const char* opt = "-D SIMD_SIZE=";
+  for( uint32_t i = 0; i < param_value_size / sizeof(size_t) ; ++i)
+  {
+    ostringstream ss;
+    uint32_t simd_size = param_value[i];
+    ss << opt << simd_size;
+    //cout << "options: " << ss.str() << endl;
+    OCL_CALL(cl_kernel_init, "compiler_reqd_sub_group_size.cl", "compiler_reqd_sub_group_size",
+                             SOURCE, ss.str().c_str());
+    size_t SIMD_SIZE = 0;
+    OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device, CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL,0, NULL,sizeof(size_t),&SIMD_SIZE,NULL);
+    //cout << SIMD_SIZE << " with " << simd_size << endl;
+    OCL_ASSERT(SIMD_SIZE == simd_size);
+
+    cl_ulong SPILL_SIZE = 0xFFFFFFFF;
+    OCL_CALL(clGetKernelWorkGroupInfo, kernel, device, CL_KERNEL_SPILL_MEM_SIZE_INTEL, sizeof(cl_ulong), &SPILL_SIZE, NULL);
+    //cout << "spill size: " << SPILL_SIZE << endl;
+    OCL_ASSERT(SPILL_SIZE == 0);
+
+    clReleaseProgram(program);
+    program = NULL;
+  }
+  delete[] param_value;
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_reqd_sub_group_size);
diff --git a/utests/compiler_sqrt_div.cpp b/utests/compiler_sqrt_div.cpp
new file mode 100644
index 0000000..c22c5a3
--- /dev/null
+++ b/utests/compiler_sqrt_div.cpp
@@ -0,0 +1,61 @@
+#include "utest_helper.hpp"
+#include <cmath>
+
+void compiler_sqrt_div(void) {
+  const int n = 1024;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_sqrt_div");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * 4 * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int j = 0; j < 1024; j++) {
+    OCL_MAP_BUFFER(0);
+    for (int i = 0; i < n; ++i) {
+      src[i] = ((float *)buf_data[0])[i] = (j * n + i + 1) * 0.001f;
+    }
+    OCL_UNMAP_BUFFER(0);
+
+    OCL_NDRANGE(1);
+
+    OCL_MAP_BUFFER(1);
+    float *dst = (float *)buf_data[1];
+    for (int i = 0; i < n; ++i) {
+      float cpu = 1.0f / sqrt(src[i]);
+      float gpu = dst[4 * i];
+      if (fabsf(cpu - gpu) >= 1e-3) {
+        printf("%f %f %f", src[i], cpu, gpu);
+        OCL_ASSERT(0);
+      }
+
+      cpu = i / sqrt(src[i]);
+      gpu = dst[4 * i + 1];
+      if (fabsf(cpu - gpu) >= 1e-3) {
+        printf("%f %f %f", src[i], cpu, gpu);
+        OCL_ASSERT(0);
+      }
+
+      cpu = 2.0f / sqrt(src[i]);
+      gpu = dst[4 * i + 2];
+      if (fabsf(cpu - gpu) >= 1e-3) {
+        printf("%f %f %f", src[i], cpu, gpu);
+        OCL_ASSERT(0);
+      }
+
+      cpu = 1.0f / sqrt(src[i]) + sqrt(src[i]);
+      gpu = dst[4 * i + 3];
+      if (fabsf(cpu - gpu) >= 1e-3) {
+        printf("%f %f %f", src[i], cpu, gpu);
+        OCL_ASSERT(0);
+      }
+    }
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_sqrt_div);
diff --git a/utests/enqueue_built_in_kernels.cpp b/utests/enqueue_built_in_kernels.cpp
index 52b8848..2afbabd 100644
--- a/utests/enqueue_built_in_kernels.cpp
+++ b/utests/enqueue_built_in_kernels.cpp
@@ -14,6 +14,7 @@ void enqueue_built_in_kernels(void)
   OCL_ASSERT(ret_sz == built_in_kernels_size);
   cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
   OCL_ASSERT(built_in_prog != NULL);
+  clReleaseProgram(built_in_prog);
 }
 
 MAKE_UTEST_FROM_FUNCTION(enqueue_built_in_kernels);
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 7052a14..4dae20d 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -1119,3 +1119,23 @@ float as_float(uint32_t i)
   _tmp._uint = i;
   return _tmp._float;
 }
+
+int cl_check_reqd_subgroup(void)
+{
+  if (!cl_check_subgroups())
+    return 0;
+  std::string extStr;
+  size_t param_value_size;
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, &param_value_size);
+  std::vector<char> param_value(param_value_size);
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+           param_value.empty() ? NULL : &param_value.front(), &param_value_size);
+  if (!param_value.empty())
+    extStr = std::string(&param_value.front(), param_value_size-1);
+
+  if (std::strstr(extStr.c_str(), "cl_intel_required_subgroup_size") == NULL) {
+    printf("No cl_intel_required_subgroup_size, Skip!");
+    return 0;
+  }
+  return 1;
+}
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index e2a6a88..19ec69d 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -325,4 +325,7 @@ extern float as_float(uint32_t i);
 extern uint32_t as_uint(float f);
 /* Check is intel subgroups short enabled. */
 extern int cl_check_subgroups_short(void);
+
+/* Check is intel_required_subgroup_size enabled. */
+extern int cl_check_reqd_subgroup(void);
 #endif /* __UTEST_HELPER_HPP__ */

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git



More information about the Pkg-opencl-commits mailing list