[beignet] 01/13: Imported Upstream version 1.1.2

Rebecca Palmer rnpalmer-guest at moszumanska.debian.org
Mon Apr 25 21:55:11 UTC 2016


This is an automated email from the git hooks/post-receive script.

rnpalmer-guest pushed a commit to branch master
in repository beignet.

commit e22f3a9666cc94ec94f4efbefda978997f2a3791
Author: Rebecca N. Palmer <rebecca_palmer at zoho.com>
Date:   Sun Apr 24 13:40:59 2016 +0100

    Imported Upstream version 1.1.2
---
 CMakeLists.txt                                     |   2 +-
 GetGenID.sh                                        |   2 +-
 backend/src/backend/context.cpp                    |   6 +-
 backend/src/backend/gen8_context.cpp               |   2 +-
 backend/src/backend/gen8_context.hpp               |   2 +
 backend/src/backend/gen9_context.cpp               | 110 ++++
 backend/src/backend/gen9_context.hpp               |  22 +
 backend/src/backend/gen_insn_selection.cpp         |  13 +-
 backend/src/backend/gen_insn_selection.hpp         |   7 +
 backend/src/backend/gen_program.cpp                |  17 +-
 backend/src/backend/program.cpp                    | 145 +++-
 backend/src/backend/program.hpp                    |  13 +-
 backend/src/gbe_bin_generater.cpp                  |   4 +
 backend/src/libocl/CMakeLists.txt                  |   5 +-
 backend/src/libocl/include/ocl.h                   |   1 +
 backend/src/libocl/include/ocl_memcpy.h            |  51 ++
 backend/src/libocl/include/{ocl.h => ocl_memset.h} |  39 +-
 backend/src/libocl/script/ocl_convert.sh           |  26 +-
 backend/src/libocl/src/ocl_barrier.ll              |   3 +
 backend/src/libocl/src/ocl_clz.ll                  |   3 +
 backend/src/libocl/src/ocl_memcpy.cl               |  49 ++
 backend/src/libocl/src/ocl_memcpy.ll               | 729 ---------------------
 .../libocl/{include/ocl.h => src/ocl_memset.cl}    |  50 +-
 backend/src/libocl/src/ocl_memset.ll               | 193 ------
 backend/src/llvm/ExpandConstantExpr.cpp            |   7 +-
 backend/src/llvm/ExpandLargeIntegers.cpp           |  21 +-
 backend/src/llvm/ExpandUtils.cpp                   |   8 +-
 backend/src/llvm/PromoteIntegers.cpp               |  10 +-
 backend/src/llvm/StripAttributes.cpp               |   9 +-
 backend/src/llvm/llvm_barrier_nodup.cpp            |  25 +-
 backend/src/llvm/llvm_bitcode_link.cpp             |  20 +-
 backend/src/llvm/llvm_gen_backend.cpp              | 114 +---
 backend/src/llvm/llvm_gen_backend.hpp              |   4 -
 backend/src/llvm/llvm_includes.hpp                 | 125 ++++
 backend/src/llvm/llvm_intrinsic_lowering.cpp       |  24 +-
 backend/src/llvm/llvm_loadstore_optimization.cpp   |  36 +-
 backend/src/llvm/llvm_passes.cpp                   |  70 +-
 backend/src/llvm/llvm_printf_parser.cpp            |  63 +-
 backend/src/llvm/llvm_sampler_fix.cpp              |  21 +-
 backend/src/llvm/llvm_scalarize.cpp                |  35 +-
 backend/src/llvm/llvm_to_gen.cpp                   |  80 +--
 backend/src/llvm/llvm_unroll.cpp                   |  36 +-
 docs/NEWS.mdwn                                     |   3 +
 kernels/compiler_function_qualifiers.cl            |   4 +-
 src/cl_api.c                                       |   3 +-
 src/cl_command_queue.c                             |  22 +-
 src/cl_command_queue_gen7.c                        |   2 +-
 src/cl_device_data.h                               |   9 +-
 src/cl_device_id.c                                 |  46 +-
 src/cl_device_id.h                                 |   4 +-
 src/cl_driver.h                                    |   2 +-
 src/cl_event.c                                     |  58 +-
 src/cl_event.h                                     |   5 +
 src/cl_gt_device.h                                 |   2 +-
 src/intel/intel_driver.c                           |   6 +-
 src/intel/intel_gpgpu.c                            |  44 +-
 utests/compiler_cl_finish.cpp                      |   7 +-
 utests/profiling_exec.cpp                          |   1 +
 58 files changed, 886 insertions(+), 1534 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2ed8429..88985d7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,7 +18,7 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
 PROJECT(OCL)
 set (LIBCL_DRIVER_VERSION_MAJOR 1)
 set (LIBCL_DRIVER_VERSION_MINOR 1)
-set (LIBCL_DRIVER_VERSION_PATCH 1)
+set (LIBCL_DRIVER_VERSION_PATCH 2)
 set (LIBCL_C_VERSION_MAJOR 1)
 set (LIBCL_C_VERSION_MINOR 2)
 if( ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
diff --git a/GetGenID.sh b/GetGenID.sh
index 7acf9bd..30296da 100755
--- a/GetGenID.sh
+++ b/GetGenID.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-genpciid=(0152 0162 0156 0166 015a 016a 0f31 0402 0412 0422 040a 041a 042a 0406 0416 0426 0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0a02 0a12 0a22 0a0a 0a1a 0a2a 0a06 0a16 0a26 0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26)
+genpciid=(0152 0162 0156 0166 015a 016a 0f31 0402 0412 0422 040a 041a 042a 0406 0416 0426 0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0a02 0a12 0a22 0a0a 0a1a 0a2a 0a06 0a16 0a26 0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26 5a84)
 pciid=($(lspci -nn | grep "\[8086:.*\]" -o | awk -F : '{print $2}' | awk -F ] '{print $1}'))
 n=${#pciid[*]}
 i=0
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index b8dfa8c..b230aa8 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -452,7 +452,11 @@ namespace gbe
       const auto &arg = fn.getArg(argID);
 
       kernel->args[argID].align = arg.align;
-      kernel->args[argID].info = arg.info;
+      kernel->args[argID].info.addrSpace = arg.info.addrSpace;
+      kernel->args[argID].info.typeName = arg.info.typeName;
+      kernel->args[argID].info.accessQual = arg.info.accessQual;
+      kernel->args[argID].info.typeQual = arg.info.typeQual;
+      kernel->args[argID].info.argName = arg.info.argName;
       switch (arg.type) {
         case ir::FunctionArgument::VALUE:
         case ir::FunctionArgument::STRUCTURE:
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index b497ee5..a92bdde 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -356,7 +356,7 @@ namespace gbe
     GBE_ASSERT(0);
   }
 
-  static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0)
+  GenRegister Gen8Context::unpacked_ud(GenRegister reg, uint32_t offset)
   {
     if(reg.hstride == GEN_HORIZONTAL_STRIDE_0) {
       if(offset == 0)
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index 84508e9..b33aeeb 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -74,6 +74,8 @@ namespace gbe
     virtual void emitPackLongInstruction(const SelectionInstruction &insn);
     virtual void emitUnpackLongInstruction(const SelectionInstruction &insn);
 
+    static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0);
+
   protected:
     virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
     virtual GenEncoder* generateEncoder(void) {
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index 326f5a1..6b01657 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -54,4 +54,114 @@ namespace gbe
       p->WAIT();
     p->pop();
   }
+
+  void BxtContext::newSelection(void) {
+    this->sel = GBE_NEW(SelectionBxt, *this);
+  }
+
+  void BxtContext::calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                             GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l)
+  {
+    src0.type = src1.type = GEN_TYPE_UD;
+    dst_h.type = dst_l.type = GEN_TYPE_UL;
+    s0l_s1h.type = s0h_s1l.type = GEN_TYPE_UL;
+
+    //GenRegister tmp;
+
+    GenRegister s0l = unpacked_ud(src0);
+    GenRegister s1l = unpacked_ud(src1);
+    GenRegister s0h = unpacked_ud(s0l_s1h); //s0h only used before s0l_s1h, reuse s0l_s1h
+    GenRegister s1h = unpacked_ud(dst_l); //s1h only used before dst_l, reuse dst_l
+
+    p->MOV(s0h, GenRegister::offset(s0l, 0, 4));
+    p->MOV(s1h, GenRegister::offset(s1l, 0, 4));
+
+    /* High 32 bits X High 32 bits. */
+    p->MUL(dst_h, s0h, s1h);
+    /* High 32 bits X low 32 bits. */
+    p->MUL(s0h_s1l, s0h, s1l);
+    /* Low 32 bits X high 32 bits. */
+    p->MUL(s0l_s1h, s0l, s1h);
+    /* Low 32 bits X low 32 bits. */
+    p->MUL(dst_l, s0l, s1l);
+
+    /*  Because the max product of s0l*s1h is (2^N - 1) * (2^N - 1) = 2^2N + 1 - 2^(N+1), here N = 32
+        The max of addding 2 32bits integer to it is
+        2^2N + 1 - 2^(N+1) + 2*(2^N - 1) = 2^2N - 1
+        which means the product s0h_s1l adds dst_l's high 32 bits and then adds s0l_s1h's low 32 bits will not
+        overflow and have no carry.
+        By this manner, we can avoid using acc register, which has a lot of restrictions. */
+
+    GenRegister s0l_s1h_l = unpacked_ud(s0l_s1h);
+    p->ADD(s0h_s1l, s0h_s1l, s0l_s1h_l);
+
+    p->SHR(s0l_s1h, s0l_s1h, GenRegister::immud(32));
+    GenRegister s0l_s1h_h = unpacked_ud(s0l_s1h);
+    p->ADD(dst_h, dst_h, s0l_s1h_h);
+
+    GenRegister dst_l_h = unpacked_ud(s0l_s1h);
+    p->MOV(dst_l_h, unpacked_ud(dst_l, 1));
+    p->ADD(s0h_s1l, s0h_s1l, dst_l_h);
+
+    // No longer need s0l_s1h
+    GenRegister tmp = s0l_s1h;
+
+    p->SHL(tmp, s0h_s1l, GenRegister::immud(32));
+    GenRegister tmp_unpacked = unpacked_ud(tmp, 1);
+    p->MOV(unpacked_ud(dst_l, 1), tmp_unpacked);
+
+    p->SHR(tmp, s0h_s1l, GenRegister::immud(32));
+    p->ADD(dst_h, dst_h, tmp);
+  }
+
+  void BxtContext::emitI64MULInstruction(const SelectionInstruction &insn)
+  {
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister res = ra->genReg(insn.dst(1));
+
+    src0.type = src1.type = GEN_TYPE_UD;
+    dst.type = GEN_TYPE_UL;
+    res.type = GEN_TYPE_UL;
+
+    /* Low 32 bits X low 32 bits. */
+    GenRegister s0l = unpacked_ud(src0);
+    GenRegister s1l = unpacked_ud(src1);
+    p->MUL(dst, s0l, s1l);
+
+    /* Low 32 bits X high 32 bits. */
+    GenRegister s1h = unpacked_ud(res);
+    p->MOV(s1h, unpacked_ud(src1, 1));
+
+    p->MUL(res, s0l, s1h);
+    p->SHL(res, res, GenRegister::immud(32));
+    p->ADD(dst, dst, res);
+
+    /* High 32 bits X low 32 bits. */
+    GenRegister s0h = unpacked_ud(res);
+    p->MOV(s0h, unpacked_ud(src0, 1));
+
+    p->MUL(res, s0h, s1l);
+    p->SHL(res, res, GenRegister::immud(32));
+    p->ADD(dst, dst, res);
+  }
+
+  void BxtContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+    if (sz == 0)
+      sz = 16;
+    GBE_ASSERT(sz%4 == 0);
+    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
+
+    p->push();
+    p->curr.execWidth = 1;
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    for (int i = 0; i < sz/2; i++) {
+      p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
+             GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2]));
+    }
+    p->pop();
+  }
+
 }
diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp
index 8acad8c..a2931cc 100644
--- a/backend/src/backend/gen9_context.hpp
+++ b/backend/src/backend/gen9_context.hpp
@@ -46,5 +46,27 @@ namespace gbe
   private:
     virtual void newSelection(void);
   };
+
+  //most code of BxtContext are copied from ChvContext, it results in two physical copy of the same code.
+  //there are two possible ways to resolve it: 1) virtual inheritance  2) class template
+  //but either way makes BxtContext and ChvContext tied closely, it might impact the flexibility of future changes
+  //so, choose the method of two physical copies.
+  class BxtContext : public Gen9Context
+  {
+  public:
+    virtual ~BxtContext(void) { }
+    BxtContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+            : Gen9Context(unit, name, deviceID, relaxMath) {
+    };
+    virtual void emitI64MULInstruction(const SelectionInstruction &insn);
+
+  protected:
+    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+
+  private:
+    virtual void newSelection(void);
+    virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                           GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+  };
 }
 #endif /* __GBE_GEN9_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 520aede..7eec2b3 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2072,6 +2072,15 @@ namespace gbe
     this->opaque->setHasHalfType(true);
   }
 
+  SelectionBxt::SelectionBxt(GenContext &ctx) : Selection(ctx) {
+    this->opaque->setHas32X32Mul(true);
+    this->opaque->setHasLongType(true);
+    this->opaque->setLongRegRestrict(true);
+    this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
+    this->opaque->setSlowByteGather(true);
+    this->opaque->setHasHalfType(true);
+  }
+
   void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
                                       uint32_t bti, bool is3D) {
     uint32_t elemID = 0;
@@ -3585,9 +3594,9 @@ namespace gbe
               sel.curr.execWidth = 1;
             }
             if (elemSize == GEN_BYTE_SCATTER_WORD)
-              sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst));
+              sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst, isUniform));
             else if (elemSize == GEN_BYTE_SCATTER_BYTE)
-              sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst));
+              sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst, isUniform));
           sel.pop();
         }
       }
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index ffc79e1..3bb00dd 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -298,6 +298,13 @@ namespace gbe
       Selection9(GenContext &ctx);
   };
 
+  class SelectionBxt: public Selection
+  {
+    public:
+      /*! Initialize internal structures used for the selection */
+      SelectionBxt(GenContext &ctx);
+  };
+
 } /* namespace gbe */
 
 #endif /*  __GEN_INSN_SELECTION_HPP__ */
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 04da692..4577990 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -170,6 +170,8 @@ namespace gbe {
       ctx = GBE_NEW(ChvContext, unit, name, deviceID, relaxMath);
     } else if (IS_SKYLAKE(deviceID)) {
       ctx = GBE_NEW(Gen9Context, unit, name, deviceID, relaxMath);
+    } else if (IS_BROXTON(deviceID)) {
+      ctx = GBE_NEW(BxtContext, unit, name, deviceID, relaxMath);
     }
     GBE_ASSERTM(ctx != NULL, "Fail to create the gen context\n");
     ctx->setASMFileName(this->asm_file_name);
@@ -214,7 +216,8 @@ namespace gbe {
                                       (IS_HASWELL(typeA) && !strcmp(src_hw_info, "HSW")) ||  \
                                       (IS_BROADWELL(typeA) && !strcmp(src_hw_info, "BDW")) ||  \
                                       (IS_CHERRYVIEW(typeA) && !strcmp(src_hw_info, "CHV")) ||  \
-                                      (IS_SKYLAKE(typeA) && !strcmp(src_hw_info, "SKL")) )
+                                      (IS_SKYLAKE(typeA) && !strcmp(src_hw_info, "SKL")) || \
+                                      (IS_BROXTON(typeA) && !strcmp(src_hw_info, "BXT")) )
 
   static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char *binary, size_t size) {
     using namespace gbe;
@@ -328,6 +331,14 @@ namespace gbe {
         src_hw_info[0]='S';
         src_hw_info[1]='K';
         src_hw_info[2]='L';
+      }else if(IS_BROXTON(prog->deviceID)){
+        src_hw_info[0]='B';
+        src_hw_info[1]='X';
+        src_hw_info[2]='T';
+      }else {
+        free(*binary);
+        *binary = NULL;
+        return 0;
       }
       FILL_DEVICE_ID(*binary, src_hw_info);
       memcpy(*binary+BINARY_HEADER_LENGTH, oss.str().c_str(), sz*sizeof(char));
@@ -402,7 +413,11 @@ namespace gbe {
       llvm::Module* src = (llvm::Module*)((GenProgram*)src_program)->module;
       llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)->module;
 
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+      if (LLVMLinkModules(wrap(dst), wrap(src), LLVMLinkerPreserveSource_Removed, &errMsg)) {
+#else
       if (LLVMLinkModules(wrap(dst), wrap(src), LLVMLinkerPreserveSource, &errMsg)) {
+#endif
         if (err != NULL && errSize != NULL && stringSize > 0u) {
           strncpy(err, errMsg, stringSize-1);
           err[stringSize-1] = '\0';
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 0ee76fc..145eb0f 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -273,6 +273,24 @@ namespace gbe {
       OUT_UPDATE_SZ(arg.size);
       OUT_UPDATE_SZ(arg.align);
       OUT_UPDATE_SZ(arg.bti);
+
+      OUT_UPDATE_SZ(arg.info.addrSpace);
+
+      OUT_UPDATE_SZ(arg.info.typeName.size());
+      outs.write(arg.info.typeName.c_str(), arg.info.typeName.size());
+      ret_size += sizeof(char)*arg.info.typeName.size();
+
+      OUT_UPDATE_SZ(arg.info.accessQual.size());
+      outs.write(arg.info.accessQual.c_str(), arg.info.accessQual.size());
+      ret_size += sizeof(char)*arg.info.accessQual.size();
+
+      OUT_UPDATE_SZ(arg.info.typeQual.size());
+      outs.write(arg.info.typeQual.c_str(), arg.info.typeQual.size());
+      ret_size += sizeof(char)*arg.info.typeQual.size();
+
+      OUT_UPDATE_SZ(arg.info.argName.size());
+      outs.write(arg.info.argName.c_str(), arg.info.argName.size());
+      ret_size += sizeof(char)*arg.info.argName.size();
     }
 
     OUT_UPDATE_SZ(patches.size());
@@ -363,6 +381,43 @@ namespace gbe {
       IN_UPDATE_SZ(arg.size);
       IN_UPDATE_SZ(arg.align);
       IN_UPDATE_SZ(arg.bti);
+
+      IN_UPDATE_SZ(arg.info.addrSpace);
+
+      size_t len;
+      char* a_name = NULL;
+
+      IN_UPDATE_SZ(len);
+      a_name = new char[len+1];
+      ins.read(a_name, len*sizeof(char));
+      total_size += sizeof(char)*len;
+      a_name[len] = 0;
+      arg.info.typeName = a_name;
+      delete[] a_name;
+
+      IN_UPDATE_SZ(len);
+      a_name = new char[len+1];
+      ins.read(a_name, len*sizeof(char));
+      total_size += sizeof(char)*len;
+      a_name[len] = 0;
+      arg.info.accessQual = a_name;
+      delete[] a_name;
+
+      IN_UPDATE_SZ(len);
+      a_name = new char[len+1];
+      ins.read(a_name, len*sizeof(char));
+      total_size += sizeof(char)*len;
+      a_name[len] = 0;
+      arg.info.typeQual = a_name;
+      delete[] a_name;
+
+      IN_UPDATE_SZ(len);
+      a_name = new char[len+1];
+      ins.read(a_name, len*sizeof(char));
+      total_size += sizeof(char)*len;
+      a_name[len] = 0;
+      arg.info.argName = a_name;
+      delete[] a_name;
     }
 
     IN_UPDATE_SZ(patch_num);
@@ -530,8 +585,6 @@ namespace gbe {
     }
 
     args.push_back("-cl-kernel-arg-info");
-    args.push_back("-mllvm");
-    args.push_back("-inline-threshold=200000");
 #ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
     args.push_back("-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND");
 #endif
@@ -596,18 +649,7 @@ namespace gbe {
     clang::LangOptions & lang_opts = Clang.getLangOpts();
     lang_opts.OpenCL = 1;
     
-    //llvm flags need command line parsing to take effect
-    if (!Clang.getFrontendOpts().LLVMArgs.empty()) {
-      unsigned NumArgs = Clang.getFrontendOpts().LLVMArgs.size();
-      const char **Args = new const char*[NumArgs + 2];
-      Args[0] = "clang (LLVM option parsing)";
-      for (unsigned i = 0; i != NumArgs; ++i){
-        Args[i + 1] = Clang.getFrontendOpts().LLVMArgs[i].c_str();
-      }
-      Args[NumArgs + 1] = 0;
-      llvm::cl::ParseCommandLineOptions(NumArgs + 1, Args);
-      delete [] Args;
-    }
+    GBE_ASSERT(Clang.getFrontendOpts().LLVMArgs.empty() && "We do not have llvm args now");
   
     // Create an action and make the compiler instance carry it out
     std::unique_ptr<clang::CodeGenAction> Act(new clang::EmitLLVMOnlyAction(llvm_ctx));
@@ -715,9 +757,9 @@ namespace gbe {
     bool useDefaultCLCVersion = true;
 
     if (options) {
-      char *str = (char *)malloc(sizeof(char) * (strlen(options) + 1));
-      memcpy(str, options, strlen(options) + 1);
-      std::string optionStr(str);
+      char *c_str = (char *)malloc(sizeof(char) * (strlen(options) + 1));
+      memcpy(c_str, options, strlen(options) + 1);
+      std::string optionStr(c_str);
       const std::string unsupportedOptions("-cl-denorms-are-zero, -cl-strict-aliasing, -cl-opt-disable,"
                        "-cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt");
 
@@ -726,14 +768,75 @@ namespace gbe {
       while (end != std::string::npos) {
         end = optionStr.find(' ', start);
         std::string str = optionStr.substr(start, end - start);
-        start = end + 1;
-        if(str.size() == 0)
+
+        if(str.size() == 0) {
+          start = end + 1;
           continue;
+        }
+
+EXTEND_QUOTE:
+        /* We need to find the ", if the there are odd number of " within this string,
+           we need to extend the string to the matched " of the last one. */
+        int quoteNum = 0;
+        for (size_t i = 0; i < str.size(); i++) {
+          if (str[i] == '"') {
+            quoteNum++;
+          }
+        }
+
+        if (quoteNum % 2) { // Odd number of ", need to extend the string.
+          /* find the second " */
+          while (end < optionStr.size() && optionStr[end] != '"')
+            end++;
+
+          if (end == optionStr.size()) {
+            printf("Warning: Unmatched \" number in build option\n");
+            free(c_str);
+            return false;
+          }
+
+          GBE_ASSERT(optionStr[end] == '"');
+          end++;
+
+          if (end < optionStr.size() && optionStr[end] != ' ') {
+            // "CC AAA"BBDDDD case, need to further extend.
+            end = optionStr.find(' ', end);
+            str = optionStr.substr(start, end - start);
+            goto EXTEND_QUOTE;
+          } else {
+            str = optionStr.substr(start, end - start);
+          }
+        }
+        start = end + 1;
 
         if(unsupportedOptions.find(str) != std::string::npos) {
           continue;
         }
 
+        /* if -I, we need to extract "path" to path, no " */
+        if (clOpt.back() == "-I") {
+          if (str[0] == '"') {
+            GBE_ASSERT(str[str.size() - 1] == '"');
+            if (str.size() > 2) {
+              clOpt.push_back(str.substr(1, str.size() - 2));
+            } else {
+              clOpt.push_back("");
+            }
+            continue;
+          }
+        }
+        // The -I"YYYY" like case.
+        if (str.size() > 4 && str[0] == '-' && str[1] == 'I' && str[2] == '"') {
+          GBE_ASSERT(str[str.size() - 1] == '"');
+          clOpt.push_back("-I");
+          if (str.size() > 4) {
+            clOpt.push_back(str.substr(3, str.size() - 4));
+          } else {
+            clOpt.push_back("");
+          }
+          continue;
+        }
+
         if(str.find("-cl-std=") != std::string::npos) {
           useDefaultCLCVersion = false;
           if (str == "-cl-std=CL1.1")
@@ -767,7 +870,7 @@ namespace gbe {
 
         clOpt.push_back(str);
       }
-      free(str);
+      free(c_str);
     }
 
     if (useDefaultCLCVersion) {
@@ -1023,7 +1126,7 @@ namespace gbe {
   static void *kernelGetArgInfo(gbe_kernel genKernel, uint32_t argID, uint32_t value) {
     if (genKernel == NULL) return NULL;
     const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
-    ir::FunctionArgument::InfoFromLLVM* info = kernel->getArgInfo(argID);
+    KernelArgument::ArgInfo* info = kernel->getArgInfo(argID);
 
     switch (value) {
       case GBE_GET_ARG_INFO_ADDRSPACE:
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index cff2463..4836c51 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -49,7 +49,16 @@ namespace gbe {
     uint32_t size;     //!< Size of the argument
     uint32_t align;    //!< addr alignment of the argument
     uint8_t bti;      //!< binding table index for __global buffer
-    ir::FunctionArgument::InfoFromLLVM info;
+
+    // Strings for arg info.
+    struct ArgInfo {
+      uint32_t addrSpace;
+      std::string typeName;
+      std::string accessQual;
+      std::string typeQual;
+      std::string argName;
+    };
+    ArgInfo info;
   };
 
   /*! Stores the offset where to patch where to patch */
@@ -161,7 +170,7 @@ namespace gbe {
                                 global_wk_sz1, global_wk_sz2, output_sz);
     }
 
-    ir::FunctionArgument::InfoFromLLVM* getArgInfo(uint32_t id) const { return &args[id].info; }
+    KernelArgument::ArgInfo* getArgInfo(uint32_t id) const { return &args[id].info; }
 
     /*! Set compile work group size */
     void setCompileWorkGroupSize(const size_t wg_sz[3]) {
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
index 86197e1..8225d4a 100644
--- a/backend/src/gbe_bin_generater.cpp
+++ b/backend/src/gbe_bin_generater.cpp
@@ -186,6 +186,10 @@ void program_build_instance::serialize_program(void) throw(int)
         src_hw_info[0]='S';
         src_hw_info[1]='K';
         src_hw_info[2]='L';
+    }else if(IS_BROXTON(gen_pci_id)){
+        src_hw_info[0]='B';
+        src_hw_info[1]='X';
+        src_hw_info[2]='T';
     }
 
     if (str_fmt_out) {
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 0cd1eef..0fffd9b 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -52,7 +52,8 @@ FOREACH(M ${OCL_COPY_HEADERS})
     COPY_THE_HEADER(${M})
 ENDFOREACH(M) 
 
-SET (OCL_COPY_MODULES ocl_workitem ocl_atom ocl_async ocl_sync ocl_misc ocl_vload ocl_geometric ocl_image)
+SET (OCL_COPY_MODULES ocl_workitem ocl_atom ocl_async ocl_sync ocl_memcpy
+                      ocl_memset ocl_misc ocl_vload ocl_geometric ocl_image)
 FOREACH(M ${OCL_COPY_MODULES})
     COPY_THE_HEADER(${M})
     COPY_THE_SOURCE(${M})
@@ -181,7 +182,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
 	)
 ENDMACRO(ADD_LL_TO_BC_TARGET)
 
-SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset ocl_clz)
+SET (OCL_LL_MODULES ocl_barrier ocl_clz)
 FOREACH(f ${OCL_LL_MODULES})
     COPY_THE_LL(${f})
     ADD_LL_TO_BC_TARGET(${f})
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index a4af4aa..7897567 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -30,6 +30,7 @@
 #include "ocl_image.h"
 #include "ocl_integer.h"
 #include "ocl_math.h"
+#include "ocl_memcpy.h"
 #include "ocl_misc.h"
 #include "ocl_printf.h"
 #include "ocl_relational.h"
diff --git a/backend/src/libocl/include/ocl_memcpy.h b/backend/src/libocl/include/ocl_memcpy.h
new file mode 100644
index 0000000..2672298
--- /dev/null
+++ b/backend/src/libocl/include/ocl_memcpy.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_MEMCPY_H__
+#define __OCL_MEMCPY_H__
+#include "ocl_types.h"
+
+/////////////////////////////////////////////////////////////////////////////
+// memcopy functions
+/////////////////////////////////////////////////////////////////////////////
+void __gen_memcpy_gg_align(__global uchar* dst, __global uchar* src, size_t size);
+void __gen_memcpy_gp_align(__global uchar* dst, __private uchar* src, size_t size);
+void __gen_memcpy_gl_align(__global uchar* dst, __local uchar* src, size_t size);
+void __gen_memcpy_gc_align(__global uchar* dst, __constant uchar* src, size_t size);
+void __gen_memcpy_pg_align(__private uchar* dst, __global uchar* src, size_t size);
+void __gen_memcpy_pp_align(__private uchar* dst, __private uchar* src, size_t size);
+void __gen_memcpy_pl_align(__private uchar* dst, __local uchar* src, size_t size);
+void __gen_memcpy_pc_align(__private uchar* dst, __constant uchar* src, size_t size);
+void __gen_memcpy_lg_align(__local uchar* dst, __global uchar* src, size_t size);
+void __gen_memcpy_lp_align(__local uchar* dst, __private uchar* src, size_t size);
+void __gen_memcpy_ll_align(__local uchar* dst, __local uchar* src, size_t size);
+void __gen_memcpy_lc_align(__local uchar* dst, __constant uchar* src, size_t size);
+
+void __gen_memcpy_gg(__global uchar* dst, __global uchar* src, size_t size);
+void __gen_memcpy_gp(__global uchar* dst, __private uchar* src, size_t size);
+void __gen_memcpy_gl(__global uchar* dst, __local uchar* src, size_t size);
+void __gen_memcpy_gc(__global uchar* dst, __constant uchar* src, size_t size);
+void __gen_memcpy_pg(__private uchar* dst, __global uchar* src, size_t size);
+void __gen_memcpy_pp(__private uchar* dst, __private uchar* src, size_t size);
+void __gen_memcpy_pl(__private uchar* dst, __local uchar* src, size_t size);
+void __gen_memcpy_pc(__private uchar* dst, __constant uchar* src, size_t size);
+void __gen_memcpy_lg(__local uchar* dst, __global uchar* src, size_t size);
+void __gen_memcpy_lp(__local uchar* dst, __private uchar* src, size_t size);
+void __gen_memcpy_ll(__local uchar* dst, __local uchar* src, size_t size);
+void __gen_memcpy_lc(__local uchar* dst, __constant uchar* src, size_t size);
+
+#endif  /* __OCL_MEMCPY_H__ */
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl_memset.h
similarity index 52%
copy from backend/src/libocl/include/ocl.h
copy to backend/src/libocl/include/ocl_memset.h
index a4af4aa..2d444ad 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl_memset.h
@@ -15,28 +15,19 @@
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
  */
-#ifndef __OCL_H__
-#define __OCL_H__
-
-#include "ocl_defines.h"
+#ifndef __OCL_MEMSET_H__
+#define __OCL_MEMSET_H__
 #include "ocl_types.h"
-#include "ocl_as.h"
-#include "ocl_async.h"
-#include "ocl_atom.h"
-#include "ocl_common.h"
-#include "ocl_convert.h"
-#include "ocl_float.h"
-#include "ocl_geometric.h"
-#include "ocl_image.h"
-#include "ocl_integer.h"
-#include "ocl_math.h"
-#include "ocl_misc.h"
-#include "ocl_printf.h"
-#include "ocl_relational.h"
-#include "ocl_sync.h"
-#include "ocl_vload.h"
-#include "ocl_workitem.h"
-#include "ocl_simd.h"
-#pragma OPENCL EXTENSION cl_khr_fp64 : disable
-#pragma OPENCL EXTENSION cl_khr_fp16 : disable
-#endif
+
+/////////////////////////////////////////////////////////////////////////////
+// memcopy functions
+/////////////////////////////////////////////////////////////////////////////
+void __gen_memset_g_align(__global uchar* dst, uchar val, size_t size);
+void __gen_memset_p_align(__private uchar* dst, uchar val, size_t size);
+void __gen_memset_l_align(__local uchar* dst, uchar val, size_t size);
+
+void __gen_memset_g(__global uchar* dst, uchar val, size_t size);
+void __gen_memset_p(__private uchar* dst, uchar val, size_t size);
+void __gen_memset_l(__local uchar* dst, uchar val, size_t size);
+
+#endif  /* __OCL_MEMSET_H__ */
diff --git a/backend/src/libocl/script/ocl_convert.sh b/backend/src/libocl/script/ocl_convert.sh
index 4f720fe..7599a66 100755
--- a/backend/src/libocl/script/ocl_convert.sh
+++ b/backend/src/libocl/script/ocl_convert.sh
@@ -161,7 +161,8 @@ else
     echo '
 #define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
 OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-  return x >= MAX ? (DSTTYPE)MAX : x <= MIN ? (DSTTYPE)MIN : x; \
+  x = x >= MAX ? MAX : x; \
+  return x <= MIN ? (DSTTYPE)MIN : (DSTTYPE)x; \
 }
 '
 fi
@@ -173,8 +174,27 @@ DEF(short, long, -32768, 32767);
 DEF(ushort, long, 0, 65535);
 DEF(int, long, -0x7fffffff-1, 0x7fffffff);
 DEF(uint, long, 0, 0xffffffffu);
-DEF(long, float, -9.223372036854776e+18f, 9.223372036854776e+18f);
-DEF(ulong, float, 0, 1.8446744073709552e+19f);
+#undef DEF
+'
+
+if [ $1"a" = "-pa" ]; then
+    echo "
+#define DEF(DSTTYPE, SRCTYPE, SRC_MIN, SRC_MAX, DST_MIN, DST_MAX) \
+OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);"
+else
+    echo '
+//convert float to long/ulong must take care of overflow, if overflow the value is undef.
+#define DEF(DSTTYPE, SRCTYPE, SRC_MIN, SRC_MAX, DST_MIN, DST_MAX) \
+OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+  DSTTYPE y = x >= SRC_MAX ? DST_MAX : (DSTTYPE)x; \
+  return x <= SRC_MIN ? DST_MIN : y; \
+}
+'
+fi
+
+echo '
+DEF(long, float, -0x1.0p63, 0x1.0p63, 0x8000000000000000, 0x7fffffffffffffff);
+DEF(ulong, float, 0, 0x1.0p64, 0, 0xffffffffffffffff);
 #undef DEF
 '
 
diff --git a/backend/src/libocl/src/ocl_barrier.ll b/backend/src/libocl/src/ocl_barrier.ll
index dc3579c..2765a71 100644
--- a/backend/src/libocl/src/ocl_barrier.ll
+++ b/backend/src/libocl/src/ocl_barrier.ll
@@ -4,6 +4,9 @@
 ;#define CLK_LOCAL_MEM_FENCE  (1 << 0)
 ;#define CLK_GLOBAL_MEM_FENCE (1 << 1)
 
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir"
+
 declare i32 @_get_local_mem_fence() nounwind alwaysinline
 declare i32 @_get_global_mem_fence() nounwind alwaysinline
 declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
diff --git a/backend/src/libocl/src/ocl_clz.ll b/backend/src/libocl/src/ocl_clz.ll
index a274cde..9522881 100644
--- a/backend/src/libocl/src/ocl_clz.ll
+++ b/backend/src/libocl/src/ocl_clz.ll
@@ -1,3 +1,6 @@
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir"
+
 declare i8 @llvm.ctlz.i8(i8, i1)
 declare i16 @llvm.ctlz.i16(i16, i1)
 declare i32 @llvm.ctlz.i32(i32, i1)
diff --git a/backend/src/libocl/src/ocl_memcpy.cl b/backend/src/libocl/src/ocl_memcpy.cl
new file mode 100644
index 0000000..85f490f
--- /dev/null
+++ b/backend/src/libocl/src/ocl_memcpy.cl
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_memcpy.h"
+
+#define DECL_TWO_SPACE_MEMCOPY_FN(NAME, DST_SPACE, SRC_SPACE) \
+void __gen_memcpy_ ##NAME## _align (DST_SPACE uchar* dst, SRC_SPACE uchar* src, size_t size) { \
+  size_t index = 0; \
+  while((index + 4) <= size) { \
+    *((DST_SPACE uint *)(dst + index)) = *((SRC_SPACE uint *)(src + index)); \
+    index += 4; \
+  } \
+  while(index < size) { \
+    dst[index] = src[index]; \
+    index++; \
+  } \
+} \
+void __gen_memcpy_ ##NAME (DST_SPACE uchar* dst, SRC_SPACE uchar* src, size_t size) { \
+  size_t index = 0; \
+  while(index < size) { \
+    dst[index] = src[index]; \
+    index++; \
+  } \
+}
+
+#define DECL_ONE_SPACE_MEMCOPY_FN(NAME, DST_SPACE) \
+  DECL_TWO_SPACE_MEMCOPY_FN( NAME## g, DST_SPACE, __global) \
+  DECL_TWO_SPACE_MEMCOPY_FN( NAME## l, DST_SPACE, __local) \
+  DECL_TWO_SPACE_MEMCOPY_FN( NAME## p, DST_SPACE, __private) \
+  DECL_TWO_SPACE_MEMCOPY_FN( NAME## c, DST_SPACE, __constant)
+
+DECL_ONE_SPACE_MEMCOPY_FN(g, __global)
+DECL_ONE_SPACE_MEMCOPY_FN(l, __local)
+DECL_ONE_SPACE_MEMCOPY_FN(p, __private)
+
diff --git a/backend/src/libocl/src/ocl_memcpy.ll b/backend/src/libocl/src/ocl_memcpy.ll
deleted file mode 100644
index b3fadb2..0000000
--- a/backend/src/libocl/src/ocl_memcpy.ll
+++ /dev/null
@@ -1,729 +0,0 @@
-;The memcpy's source code.
-; INLINE_OVERLOADABLE void __gen_memcpy_align(uchar* dst, uchar* src, size_t size) {
-;   size_t index = 0;
-;   while((index + 4) <= size) {
-;     *((uint *)(dst + index)) = *((uint *)(src + index));
-;     index += 4;
-;   }
-;   while(index < size) {
-;     dst[index] = src[index];
-;     index++;
-;   }
-; }
-
-define void @__gen_memcpy_gg_align(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  %1 = load i32 addrspace(1)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  store i32 %1, i32 addrspace(1)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
-  %3 = load i8 addrspace(1)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
-  ret void
-}
-
-define void @__gen_memcpy_gp_align(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
-  %1 = load i32 addrspace(0)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  store i32 %1, i32 addrspace(1)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
-  %3 = load i8 addrspace(0)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
-  ret void
-}
-
-define void @__gen_memcpy_gl_align(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
-  %1 = load i32 addrspace(3)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  store i32 %1, i32 addrspace(1)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
-  %3 = load i8 addrspace(3)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
-  ret void
-}
-
-define void @__gen_memcpy_pg_align(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  %1 = load i32 addrspace(1)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
-  store i32 %1, i32 addrspace(0)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
-  %3 = load i8 addrspace(1)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
-  ret void
-}
-
-define void @__gen_memcpy_pp_align(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
-  %1 = load i32 addrspace(0)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
-  store i32 %1, i32 addrspace(0)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
-  %3 = load i8 addrspace(0)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
-  ret void
-}
-
-define void @__gen_memcpy_pl_align(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
-  %1 = load i32 addrspace(3)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
-  store i32 %1, i32 addrspace(0)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
-  %3 = load i8 addrspace(3)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
-  ret void
-}
-
-define void @__gen_memcpy_lg_align(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  %1 = load i32 addrspace(1)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
-  store i32 %1, i32 addrspace(3)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
-  %3 = load i8 addrspace(1)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
-  ret void
-}
-
-define void @__gen_memcpy_lp_align(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
-  %1 = load i32 addrspace(0)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
-  store i32 %1, i32 addrspace(3)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
-  %3 = load i8 addrspace(0)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
-  ret void
-}
-
-define void @__gen_memcpy_ll_align(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
-  %1 = load i32 addrspace(3)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
-  store i32 %1, i32 addrspace(3)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
-  %3 = load i8 addrspace(3)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
-  ret void
-}
-
-;The memcpy's source code.
-; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
-;   size_t index = 0;
-;   while(index < size) {
-;     dst[index] = src[index];
-;     index++;
-;   }
-; }
-
-define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp4 = icmp eq i32 %size, 0
-  br i1 %cmp4, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(1)* %src to i32
-  %1 = add i32 %0, %index.05
-  %2 = inttoptr i32 %1 to i8 addrspace(1)*
-  %3 = load i8 addrspace(1)* %2, align 1
-  %4 = ptrtoint i8 addrspace(1)* %dst to i32
-  %5 = add i32 %4, %index.05
-  %6 = inttoptr i32 %5 to i8 addrspace(1)*
-  store i8 %3, i8 addrspace(1)* %6, align 1
-  %inc = add i32 %index.05, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp4 = icmp eq i32 %size, 0
-  br i1 %cmp4, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(0)* %src to i32
-  %1 = add i32 %0, %index.05
-  %2 = inttoptr i32 %1 to i8 addrspace(0)*
-  %3 = load i8 addrspace(0)* %2, align 1
-  %4 = ptrtoint i8 addrspace(1)* %dst to i32
-  %5 = add i32 %4, %index.05
-  %6 = inttoptr i32 %5 to i8 addrspace(1)*
-  store i8 %3, i8 addrspace(1)* %6, align 1
-  %inc = add i32 %index.05, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp4 = icmp eq i32 %size, 0
-  br i1 %cmp4, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(3)* %src to i32
-  %1 = add i32 %0, %index.05
-  %2 = inttoptr i32 %1 to i8 addrspace(3)*
-  %3 = load i8 addrspace(3)* %2, align 1
-  %4 = ptrtoint i8 addrspace(1)* %dst to i32
-  %5 = add i32 %4, %index.05
-  %6 = inttoptr i32 %5 to i8 addrspace(1)*
-  store i8 %3, i8 addrspace(1)* %6, align 1
-  %inc = add i32 %index.05, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp4 = icmp eq i32 %size, 0
-  br i1 %cmp4, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(1)* %src to i32
-  %1 = add i32 %0, %index.05
-  %2 = inttoptr i32 %1 to i8 addrspace(1)*
-  %3 = load i8 addrspace(1)* %2, align 1
-  %4 = ptrtoint i8 addrspace(0)* %dst to i32
-  %5 = add i32 %4, %index.05
-  %6 = inttoptr i32 %5 to i8 addrspace(0)*
-  store i8 %3, i8 addrspace(0)* %6, align 1
-  %inc = add i32 %index.05, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp4 = icmp eq i32 %size, 0
-  br i1 %cmp4, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(0)* %src to i32
-  %1 = add i32 %0, %index.05
-  %2 = inttoptr i32 %1 to i8 addrspace(0)*
-  %3 = load i8 addrspace(0)* %2, align 1
-  %4 = ptrtoint i8 addrspace(0)* %dst to i32
-  %5 = add i32 %4, %index.05
-  %6 = inttoptr i32 %5 to i8 addrspace(0)*
-  store i8 %3, i8 addrspace(0)* %6, align 1
-  %inc = add i32 %index.05, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp4 = icmp eq i32 %size, 0
-  br i1 %cmp4, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(3)* %src to i32
-  %1 = add i32 %0, %index.05
-  %2 = inttoptr i32 %1 to i8 addrspace(3)*
-  %3 = load i8 addrspace(3)* %2, align 1
-  %4 = ptrtoint i8 addrspace(0)* %dst to i32
-  %5 = add i32 %4, %index.05
-  %6 = inttoptr i32 %5 to i8 addrspace(0)*
-  store i8 %3, i8 addrspace(0)* %6, align 1
-  %inc = add i32 %index.05, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp4 = icmp eq i32 %size, 0
-  br i1 %cmp4, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(1)* %src to i32
-  %1 = add i32 %0, %index.05
-  %2 = inttoptr i32 %1 to i8 addrspace(1)*
-  %3 = load i8 addrspace(1)* %2, align 1
-  %4 = ptrtoint i8 addrspace(3)* %dst to i32
-  %5 = add i32 %4, %index.05
-  %6 = inttoptr i32 %5 to i8 addrspace(3)*
-  store i8 %3, i8 addrspace(3)* %6, align 1
-  %inc = add i32 %index.05, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp4 = icmp eq i32 %size, 0
-  br i1 %cmp4, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(0)* %src to i32
-  %1 = add i32 %0, %index.05
-  %2 = inttoptr i32 %1 to i8 addrspace(0)*
-  %3 = load i8 addrspace(0)* %2, align 1
-  %4 = ptrtoint i8 addrspace(3)* %dst to i32
-  %5 = add i32 %4, %index.05
-  %6 = inttoptr i32 %5 to i8 addrspace(3)*
-  store i8 %3, i8 addrspace(3)* %6, align 1
-  %inc = add i32 %index.05, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp4 = icmp eq i32 %size, 0
-  br i1 %cmp4, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(3)* %src to i32
-  %1 = add i32 %0, %index.05
-  %2 = inttoptr i32 %1 to i8 addrspace(3)*
-  %3 = load i8 addrspace(3)* %2, align 1
-  %4 = ptrtoint i8 addrspace(3)* %dst to i32
-  %5 = add i32 %4, %index.05
-  %6 = inttoptr i32 %5 to i8 addrspace(3)*
-  store i8 %3, i8 addrspace(3)* %6, align 1
-  %inc = add i32 %index.05, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memcpy_gc_align(i8 addrspace(1)* %dst, i8 addrspace(2)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(2)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(2)* %add.ptr to i32 addrspace(2)*
-  %1 = load i32 addrspace(2)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  store i32 %1, i32 addrspace(1)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(2)* %src, i32 %index.1
-  %3 = load i8 addrspace(2)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
-  ret void
-}
-
-define void @__gen_memcpy_pc_align(i8 addrspace(0)* %dst, i8 addrspace(2)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(2)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(2)* %add.ptr to i32 addrspace(2)*
-  %1 = load i32 addrspace(2)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
-  store i32 %1, i32 addrspace(0)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(2)* %src, i32 %index.1
-  %3 = load i8 addrspace(2)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
-  ret void
-}
-
-define void @__gen_memcpy_lc_align(i8 addrspace(3)* %dst, i8 addrspace(2)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(2)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(2)* %add.ptr to i32 addrspace(2)*
-  %1 = load i32 addrspace(2)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
-  store i32 %1, i32 addrspace(3)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(2)* %src, i32 %index.1
-  %3 = load i8 addrspace(2)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
-  ret void
-}
-
-define void @__gen_memcpy_pc(i8 addrspace(0)* %dst, i8 addrspace(2)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp4 = icmp eq i32 %size, 0
-  br i1 %cmp4, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(2)* %src to i32
-  %1 = add i32 %0, %index.05
-  %2 = inttoptr i32 %1 to i8 addrspace(2)*
-  %3 = load i8 addrspace(2)* %2, align 1
-  %4 = ptrtoint i8 addrspace(0)* %dst to i32
-  %5 = add i32 %4, %index.05
-  %6 = inttoptr i32 %5 to i8 addrspace(0)*
-  store i8 %3, i8 addrspace(0)* %6, align 1
-  %inc = add i32 %index.05, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memcpy_gc(i8 addrspace(1)* %dst, i8 addrspace(2)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp4 = icmp eq i32 %size, 0
-  br i1 %cmp4, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(2)* %src to i32
-  %1 = add i32 %0, %index.05
-  %2 = inttoptr i32 %1 to i8 addrspace(2)*
-  %3 = load i8 addrspace(2)* %2, align 1
-  %4 = ptrtoint i8 addrspace(1)* %dst to i32
-  %5 = add i32 %4, %index.05
-  %6 = inttoptr i32 %5 to i8 addrspace(1)*
-  store i8 %3, i8 addrspace(1)* %6, align 1
-  %inc = add i32 %index.05, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memcpy_lc(i8 addrspace(3)* %dst, i8 addrspace(2)* %src, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp4 = icmp eq i32 %size, 0
-  br i1 %cmp4, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(2)* %src to i32
-  %1 = add i32 %0, %index.05
-  %2 = inttoptr i32 %1 to i8 addrspace(2)*
-  %3 = load i8 addrspace(2)* %2, align 1
-  %4 = ptrtoint i8 addrspace(3)* %dst to i32
-  %5 = add i32 %4, %index.05
-  %6 = inttoptr i32 %5 to i8 addrspace(3)*
-  store i8 %3, i8 addrspace(3)* %6, align 1
-  %inc = add i32 %index.05, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/src/ocl_memset.cl
similarity index 52%
copy from backend/src/libocl/include/ocl.h
copy to backend/src/libocl/src/ocl_memset.cl
index a4af4aa..b41851a 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/src/ocl_memset.cl
@@ -15,28 +15,30 @@
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
  */
-#ifndef __OCL_H__
-#define __OCL_H__
+#include "ocl_memset.h"
+
+#define DECL_MEMSET_FN(NAME, DST_SPACE) \
+void __gen_memset_ ##NAME## _align (DST_SPACE uchar* dst, uchar val, size_t size) { \
+  size_t index = 0; \
+  uint v = (val << 24) | (val << 16) | (val << 8) | val; \
+  while((index + 4) >= size) { \
+    *((DST_SPACE uint *)(dst + index)) = v; \
+    index += 4; \
+  } \
+  while(index < size) { \
+    dst[index] = val; \
+    index++; \
+ } \
+} \
+void __gen_memset_ ##NAME (DST_SPACE uchar* dst, uchar val, size_t size) { \
+  size_t index = 0; \
+  while(index < size) { \
+    dst[index] = val; \
+    index++; \
+ } \
+}
+
+DECL_MEMSET_FN(g, __global)
+DECL_MEMSET_FN(l, __local)
+DECL_MEMSET_FN(p, __private)
 
-#include "ocl_defines.h"
-#include "ocl_types.h"
-#include "ocl_as.h"
-#include "ocl_async.h"
-#include "ocl_atom.h"
-#include "ocl_common.h"
-#include "ocl_convert.h"
-#include "ocl_float.h"
-#include "ocl_geometric.h"
-#include "ocl_image.h"
-#include "ocl_integer.h"
-#include "ocl_math.h"
-#include "ocl_misc.h"
-#include "ocl_printf.h"
-#include "ocl_relational.h"
-#include "ocl_sync.h"
-#include "ocl_vload.h"
-#include "ocl_workitem.h"
-#include "ocl_simd.h"
-#pragma OPENCL EXTENSION cl_khr_fp64 : disable
-#pragma OPENCL EXTENSION cl_khr_fp16 : disable
-#endif
diff --git a/backend/src/libocl/src/ocl_memset.ll b/backend/src/libocl/src/ocl_memset.ll
deleted file mode 100644
index 665eac4..0000000
--- a/backend/src/libocl/src/ocl_memset.ll
+++ /dev/null
@@ -1,193 +0,0 @@
-;The memset's source code.
-; INLINE_OVERLOADABLE void __gen_memset_align(uchar* dst, uchar val, size_t size) {
-;   size_t index = 0;
-;   uint v = (val << 24) | (val << 16) | (val << 8) | val;
-;   while((index + 4) >= size) {
-;     *((uint *)(dst + index)) = v;
-;     index += 4;
-;   }
-;   while(index < size) {
-;     dst[index] = val;
-;     index++;
-;  }
-; }
-
-define void @__gen_memset_p_align(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
-entry:
-  %conv = zext i8 %val to i32
-  %shl = shl nuw i32 %conv, 24
-  %shl2 = shl nuw nsw i32 %conv, 16
-  %or = or i32 %shl, %shl2
-  %shl4 = shl nuw nsw i32 %conv, 8
-  %or5 = or i32 %or, %shl4
-  %or7 = or i32 %or5, %conv
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond10, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8* %dst, i32 %index.0
-  %0 = bitcast i8* %add.ptr to i32*
-  store i32 %or7, i32* %0, align 4
-  br label %while.cond
-
-while.cond10:                                     ; preds = %while.cond, %while.body13
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
-  %cmp11 = icmp ult i32 %index.1, %size
-  br i1 %cmp11, label %while.body13, label %while.end14
-
-while.body13:                                     ; preds = %while.cond10
-  %arrayidx = getelementptr inbounds i8* %dst, i32 %index.1
-  store i8 %val, i8* %arrayidx, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond10
-
-while.end14:                                      ; preds = %while.cond10
-  ret void
-}
-
-define void @__gen_memset_g_align(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
-entry:
-  %conv = zext i8 %val to i32
-  %shl = shl nuw i32 %conv, 24
-  %shl2 = shl nuw nsw i32 %conv, 16
-  %or = or i32 %shl, %shl2
-  %shl4 = shl nuw nsw i32 %conv, 8
-  %or5 = or i32 %or, %shl4
-  %or7 = or i32 %or5, %conv
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond10, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
-  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  store i32 %or7, i32 addrspace(1)* %0, align 4
-  br label %while.cond
-
-while.cond10:                                     ; preds = %while.cond, %while.body13
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
-  %cmp11 = icmp ult i32 %index.1, %size
-  br i1 %cmp11, label %while.body13, label %while.end14
-
-while.body13:                                     ; preds = %while.cond10
-  %arrayidx = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
-  store i8 %val, i8 addrspace(1)* %arrayidx, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond10
-
-while.end14:                                      ; preds = %while.cond10
-  ret void
-}
-
-define void @__gen_memset_l_align(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
-entry:
-  %conv = zext i8 %val to i32
-  %shl = shl nuw i32 %conv, 24
-  %shl2 = shl nuw nsw i32 %conv, 16
-  %or = or i32 %shl, %shl2
-  %shl4 = shl nuw nsw i32 %conv, 8
-  %or5 = or i32 %or, %shl4
-  %or7 = or i32 %or5, %conv
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ugt i32 %add, %size
-  br i1 %cmp, label %while.cond10, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
-  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
-  store i32 %or7, i32 addrspace(3)* %0, align 4
-  br label %while.cond
-
-while.cond10:                                     ; preds = %while.cond, %while.body13
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
-  %cmp11 = icmp ult i32 %index.1, %size
-  br i1 %cmp11, label %while.body13, label %while.end14
-
-while.body13:                                     ; preds = %while.cond10
-  %arrayidx = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
-  store i8 %val, i8 addrspace(3)* %arrayidx, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond10
-
-while.end14:                                      ; preds = %while.cond10
-  ret void
-}
-
-;The memset's source code.
-; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
-;   size_t index = 0;
-;   while(index < size) {
-;     dst[index] = val;
-;     index++;
-;  }
-; }
-
-define void @__gen_memset_p(i8 addrspace(0)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp3 = icmp eq i32 %size, 0
-  br i1 %cmp3, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(0)* %dst to i32
-  %1 = add i32 %0, %index.04
-  %2 = inttoptr i32 %1 to i8 addrspace(0)*
-  store i8 %val, i8 addrspace(0)* %2, align 1
-  %inc = add i32 %index.04, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp3 = icmp eq i32 %size, 0
-  br i1 %cmp3, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(1)* %dst to i32
-  %1 = add i32 %0, %index.04
-  %2 = inttoptr i32 %1 to i8 addrspace(1)*
-  store i8 %val, i8 addrspace(1)* %2, align 1
-  %inc = add i32 %index.04, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
-
-define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
-entry:
-  %cmp3 = icmp eq i32 %size, 0
-  br i1 %cmp3, label %while.end, label %while.body
-
-while.body:                                       ; preds = %entry, %while.body
-  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
-  %0 = ptrtoint i8 addrspace(3)* %dst to i32
-  %1 = add i32 %0, %index.04
-  %2 = inttoptr i32 %1 to i8 addrspace(3)*
-  store i8 %val, i8 addrspace(3)* %2, align 1
-  %inc = add i32 %index.04, 1
-  %cmp = icmp ult i32 %inc, %size
-  br i1 %cmp, label %while.body, label %while.end
-
-while.end:                                        ; preds = %while.body, %entry
-  ret void
-}
diff --git a/backend/src/llvm/ExpandConstantExpr.cpp b/backend/src/llvm/ExpandConstantExpr.cpp
index 5c5934a..c6f57b8 100644
--- a/backend/src/llvm/ExpandConstantExpr.cpp
+++ b/backend/src/llvm/ExpandConstantExpr.cpp
@@ -77,12 +77,7 @@
 //===----------------------------------------------------------------------===//
 
 #include <map>
-
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Pass.h"
+#include "llvm_includes.hpp"
 #include "llvm_gen_backend.hpp"
 
 using namespace llvm;
diff --git a/backend/src/llvm/ExpandLargeIntegers.cpp b/backend/src/llvm/ExpandLargeIntegers.cpp
index f7e59a5..20fdda9 100644
--- a/backend/src/llvm/ExpandLargeIntegers.cpp
+++ b/backend/src/llvm/ExpandLargeIntegers.cpp
@@ -86,24 +86,9 @@
 //       2. OR x, 0 can be optimized as x. And x, 0 can be optimized as 0.
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#if LLVM_VERSION_MINOR >= 5
-#include "llvm/IR/CFG.h"
-#else
-#include "llvm/Support/CFG.h"
-#endif
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
+
+#include "llvm_includes.hpp"
+
 #include "llvm_gen_backend.hpp"
 
 using namespace llvm;
diff --git a/backend/src/llvm/ExpandUtils.cpp b/backend/src/llvm/ExpandUtils.cpp
index e6dfb52..801f969 100644
--- a/backend/src/llvm/ExpandUtils.cpp
+++ b/backend/src/llvm/ExpandUtils.cpp
@@ -64,12 +64,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm_includes.hpp"
+
 #include "llvm_gen_backend.hpp"
 
 using namespace llvm;
diff --git a/backend/src/llvm/PromoteIntegers.cpp b/backend/src/llvm/PromoteIntegers.cpp
index aba42b9..b65440f 100644
--- a/backend/src/llvm/PromoteIntegers.cpp
+++ b/backend/src/llvm/PromoteIntegers.cpp
@@ -84,14 +84,8 @@
 //===----------------------------------------------------------------------===//
 
 
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm_includes.hpp"
+
 #include "llvm_gen_backend.hpp"
 
 using namespace llvm;
diff --git a/backend/src/llvm/StripAttributes.cpp b/backend/src/llvm/StripAttributes.cpp
index 05cac17..e6df312 100644
--- a/backend/src/llvm/StripAttributes.cpp
+++ b/backend/src/llvm/StripAttributes.cpp
@@ -69,14 +69,7 @@
 //  * Calling conventions from functions and function calls.
 //
 
-#include "llvm/IR/Function.h"
-#include "llvm/Pass.h"
-
-#if LLVM_VERSION_MINOR >= 5
-#include "llvm/IR/CallSite.h"
-#else
-#include "llvm/Support/CallSite.h"
-#endif
+#include "llvm_includes.hpp"
 
 #include "llvm_gen_backend.hpp"
 
diff --git a/backend/src/llvm/llvm_barrier_nodup.cpp b/backend/src/llvm/llvm_barrier_nodup.cpp
index 19deafc..727e6bd 100644
--- a/backend/src/llvm/llvm_barrier_nodup.cpp
+++ b/backend/src/llvm/llvm_barrier_nodup.cpp
@@ -28,30 +28,7 @@
  *  
  */
 
-#include "llvm/Config/llvm-config.h"
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/Function.h"
-#include "llvm/InstrTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#else
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-#include "llvm/Pass.h"
-#if LLVM_VERSION_MINOR <= 1
-#include "llvm/Support/IRBuilder.h"
-#elif LLVM_VERSION_MINOR == 2
-#include "llvm/IRBuilder.h"
-#else
-#include "llvm/IR/IRBuilder.h"
-#endif /* LLVM_VERSION_MINOR <= 1 */
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/IR/Attributes.h"
+#include "llvm_includes.hpp"
 
 #include "llvm/llvm_gen_backend.hpp"
 #include "sys/map.hpp"
diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index ebf4386..56205bb 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -21,24 +21,11 @@
 #include <iostream>
 #include <sstream>
 #include <set>
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/PassManager.h"
-#include "llvm/Pass.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Support/SourceMgr.h"
 
 #include "sys/cvar.hpp"
 #include "src/GBEConfig.h"
+#include "llvm_includes.hpp"
 #include "llvm/llvm_gen_backend.hpp"
-#include "llvm-c/Linker.h"
 
 using namespace llvm;
 
@@ -248,8 +235,11 @@ namespace gbe
       printf("Fatal Error: link the bitcode error:\n%s\n", errorMsg);
       return NULL;
     }
-
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+    llvm::legacy::PassManager passes;
+#else
     llvm::PassManager passes;
+#endif
 
     passes.add(createInternalizePass(kernels));
     passes.add(createGlobalDCEPass());
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 4905415..17b65a1 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -71,86 +71,7 @@
  *   is intercepted, we just abort
  */
 
-#include "llvm/Config/llvm-config.h"
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/CallingConv.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Module.h"
-#include "llvm/Instructions.h"
-#else
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Instructions.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-#include "llvm/Pass.h"
-#include "llvm/PassManager.h"
-#include "llvm/IR/IRBuilder.h"
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/Intrinsics.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/InlineAsm.h"
-#else
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/InlineAsm.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/ConstantsScanner.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
-
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
-#include "llvm/IR/Mangler.h"
-#else
-#include "llvm/Target/Mangler.h"
-#endif
-
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#if !defined(LLVM_VERSION_MAJOR) || (LLVM_VERSION_MINOR == 1)
-#include "llvm/Target/TargetData.h"
-#elif LLVM_VERSION_MINOR == 2
-#include "llvm/DataLayout.h"
-#else
-#include "llvm/IR/DataLayout.h"
-#endif
-
-#if LLVM_VERSION_MINOR >= 5
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/CFG.h"
-#else
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/CFG.h"
-#endif
-
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR <= 2)
-#include "llvm/Support/InstVisitor.h"
-#elif LLVM_VERSION_MINOR >= 5
-#include "llvm/IR/InstVisitor.h"
-#else
-#include "llvm/InstVisitor.h"
-#endif
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm_includes.hpp"
 
 #include "llvm/llvm_gen_backend.hpp"
 #include "ir/context.hpp"
@@ -527,14 +448,22 @@ namespace gbe
         TheModule(0),
         btiBase(BTI_RESERVED_NUM)
     {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+      initializeLoopInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+#else
       initializeLoopInfoPass(*PassRegistry::getPassRegistry());
+#endif
       pass = PASS_EMIT_REGISTERS;
     }
 
     virtual const char *getPassName() const { return "Gen Back-End"; }
 
     void getAnalysisUsage(AnalysisUsage &AU) const {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+      AU.addRequired<LoopInfoWrapperPass>();
+#else
       AU.addRequired<LoopInfo>();
+#endif
       AU.setPreservesAll();
     }
 
@@ -564,7 +493,11 @@ namespace gbe
       assignBti(F);
       analyzePointerOrigin(F);
 
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+#else
       LI = &getAnalysis<LoopInfo>();
+#endif
       emitFunction(F);
       phiMap.clear();
       globalPointer.clear();
@@ -3094,11 +3027,21 @@ namespace gbe
         // We use a select (0,1) not a convert when the destination is a boolean
         if (srcType == ir::TYPE_BOOL) {
           const ir::RegisterFamily family = getFamily(dstType);
-          const ir::ImmediateIndex zero = ctx.newIntegerImmediate(0, dstType);
+          ir::ImmediateIndex zero;
+          if(dstType == ir::TYPE_FLOAT)
+            zero = ctx.newFloatImmediate(0);
+          else if(dstType == ir::TYPE_DOUBLE)
+            zero = ctx.newDoubleImmediate(0);
+	  else
+            zero = ctx.newIntegerImmediate(0, dstType);
           ir::ImmediateIndex one;
           if (I.getOpcode() == Instruction::SExt
               && (dstType == ir::TYPE_S8 || dstType == ir::TYPE_S16 || dstType == ir::TYPE_S32 || dstType == ir::TYPE_S64))
             one = ctx.newIntegerImmediate(-1, dstType);
+          else if(dstType == ir::TYPE_FLOAT)
+            one = ctx.newFloatImmediate(1);
+          else if(dstType == ir::TYPE_DOUBLE)
+            one = ctx.newDoubleImmediate(1);
           else
             one = ctx.newIntegerImmediate(1, dstType);
           const ir::Register zeroReg = ctx.reg(family);
@@ -4334,7 +4277,6 @@ namespace gbe
   void GenWriter::emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI)
   {
     Type *llvmType = llvmValues->getType();
-    const ir::Type type = getType(ctx, llvmType);
     unsigned byteSize = getTypeByteSize(unit, llvmType);
 
     Type *elemType = llvmType;
@@ -4344,6 +4286,7 @@ namespace gbe
       elemType = vectorType->getElementType();
       elemNum = vectorType->getNumElements();
     }
+    const ir::Type type = getType(ctx, elemType);
 
     vector<ir::Register> tupleData;
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
@@ -4386,7 +4329,7 @@ namespace gbe
           ctx.LOADI(ir::TYPE_S32, offset, immIndex);
           ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
         }
-       ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, bti, reg);
+       ctx.STORE(ir::TYPE_U8, addr, addrSpace, dwAligned, fixedBTI, bti, reg);
       }
     }
   }
@@ -4440,9 +4383,10 @@ namespace gbe
     else
       ptr = pointer;
 
+    unsigned primitiveBits = scalarType->getPrimitiveSizeInBits();
     if (!dwAligned
-       && (scalarType == IntegerType::get(I.getContext(), 64)
-          || scalarType == IntegerType::get(I.getContext(), 32))
+       && (primitiveBits == 64
+          || primitiveBits == 32)
        ) {
       emitUnalignedDQLoadStore(ptr, llvmValues, addrSpace, btiReg, isLoad, dwAligned, fixedBTI);
       return;
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 1f16557..94a377b 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -30,11 +30,7 @@
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Pass.h"
 #include "llvm/Analysis/LoopPass.h"
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/Instructions.h"
-#else
 #include "llvm/IR/Instructions.h"
-#endif
 #include "sys/platform.hpp"
 #include "sys/map.hpp"
 #include <algorithm>
diff --git a/backend/src/llvm/llvm_includes.hpp b/backend/src/llvm/llvm_includes.hpp
new file mode 100644
index 0000000..fed3a18
--- /dev/null
+++ b/backend/src/llvm/llvm_includes.hpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Yang Rong <rong.r.yang at intel.com>
+ */
+
+/**
+ * \file llvm_includes.hpp
+ * \author Yang Rong <rong.r.yang at intel.com>
+ */
+#ifndef __GBE_IR_LLVM_INCLUDES_HPP__
+#define __GBE_IR_LLVM_INCLUDES_HPP__
+
+#include "llvm/Config/llvm-config.h"
+
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+
+#include "llvm_includes.hpp"
+
+#include "llvm/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallString.h"
+
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/Passes.h"
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/ToolOutputFile.h"
+
+#include "llvm-c/Linker.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Dominators.h"
+#else
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/Target/Mangler.h"
+#endif
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/LegacyPassManager.h"
+#else
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/PassManager.h"
+#endif
+#include "llvm/ADT/Triple.h"
+
+#include <clang/CodeGen/CodeGenAction.h>
+
+#endif /* __GBE_IR_LLVM_INCLUDES_HPP__ */
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
index 7d1f8f0..b35d1e6 100644
--- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -20,29 +20,7 @@
  * \author Yang Rong <rong.r.yang at intel.com>
  */
 
-#include "llvm/Config/llvm-config.h"
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/Function.h"
-#include "llvm/InstrTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#else
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-#include "llvm/Pass.h"
-#if LLVM_VERSION_MINOR <= 1
-#include "llvm/Support/IRBuilder.h"
-#elif LLVM_VERSION_MINOR == 2
-#include "llvm/IRBuilder.h"
-#else
-#include "llvm/IR/IRBuilder.h"
-#endif /* LLVM_VERSION_MINOR <= 1 */
-#include "llvm/Support/raw_ostream.h"
+#include "llvm_includes.hpp"
 
 #include "llvm/llvm_gen_backend.hpp"
 #include "sys/map.hpp"
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
index c6349fa..698fdc2 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -22,37 +22,7 @@
  * from Vectorize passes in llvm.
  */
 
-#include "llvm/IR/Instructions.h"
-#include "llvm/Pass.h"
-#include "llvm/PassManager.h"
-
-#include "llvm/Config/llvm-config.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
-#include "llvm/Function.h"
-#include "llvm/InstrTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#else
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-#include "llvm/Pass.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
-#include "llvm/Support/IRBuilder.h"
-#elif LLVM_VERSION_MINOR == 2
-#include "llvm/IRBuilder.h"
-#else
-#include "llvm/IR/IRBuilder.h"
-#endif /* LLVM_VERSION_MINOR <= 1 */
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm_includes.hpp"
 
 using namespace llvm;
 namespace gbe {
@@ -72,7 +42,9 @@ namespace gbe {
 
     virtual bool runOnBasicBlock(BasicBlock &BB) {
       SE = &getAnalysis<ScalarEvolution>();
-      #if LLVM_VERSION_MINOR >= 5
+      #if LLVM_VERSION_MINOR >= 7
+        TD = &BB.getModule()->getDataLayout();
+      #elif LLVM_VERSION_MINOR >= 5
         DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
         TD = DLP ? &DLP->getDataLayout() : nullptr;
       #else
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index 223f61b..d5d965b 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -30,75 +30,7 @@
  * Segovia) the right to use another license for it (MIT here)
  */
 
-#include "llvm/Config/llvm-config.h"
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/CallingConv.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Module.h"
-#include "llvm/Instructions.h"
-#else
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Instructions.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-#include "llvm/Pass.h"
-#include "llvm/PassManager.h"
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/Intrinsics.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/InlineAsm.h"
-#else
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/InlineAsm.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/ConstantsScanner.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
-
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
-#include "llvm/IR/Mangler.h"
-#else
-#include "llvm/Target/Mangler.h"
-#endif
-
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#if !defined(LLVM_VERSION_MAJOR) || (LLVM_VERSION_MINOR == 1)
-#include "llvm/Target/TargetData.h"
-#elif LLVM_VERSION_MINOR == 2
-#include "llvm/DataLayout.h"
-#else
-#include "llvm/IR/DataLayout.h"
-#endif
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR <= 2)
-#include "llvm/Support/InstVisitor.h"
-#elif LLVM_VERSION_MINOR >= 5
-#include "llvm/IR/InstVisitor.h"
-#else
-#include "llvm/InstVisitor.h"
-#endif
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm_includes.hpp"
 
 #include "llvm/llvm_gen_backend.hpp"
 #include "ir/unit.hpp"
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index 3d84457..47688f7 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -33,39 +33,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "llvm/Config/llvm-config.h"
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/Function.h"
-#include "llvm/InstrTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#else
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-#include "llvm/Pass.h"
-#if LLVM_VERSION_MINOR <= 1
-#include "llvm/Support/IRBuilder.h"
-#elif LLVM_VERSION_MINOR == 2
-#include "llvm/IRBuilder.h"
-#else
-#include "llvm/IR/IRBuilder.h"
-#endif /* LLVM_VERSION_MINOR <= 1 */
-
-#if LLVM_VERSION_MINOR >= 5
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/CFG.h"
-#else
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/CFG.h"
-#endif
-
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/IR/Attributes.h"
+#include "llvm_includes.hpp"
 
 #include "llvm/llvm_gen_backend.hpp"
 #include "sys/map.hpp"
@@ -261,7 +229,7 @@ again:
         printf("string end with %%\n");
         goto error;
       }
-      if (*(p + 1) == '%') { // %%
+      if (p + 1 < end && *(p + 1) == '%') { // %%
         p += 2;
         goto again;
       }
@@ -328,6 +296,8 @@ error:
     Module* module;
     IRBuilder<>* builder;
     Type* intTy;
+    llvm::Constant * pbuf_global;
+    llvm::Constant * index_buf_global;
     Value* pbuf_ptr;
     Value* index_buf_ptr;
     Value* g1Xg2Xg3;
@@ -342,13 +312,11 @@ error:
       PrintfSet::PrintfFmt* printf_fmt;
     };
 
-    PrintfParser(void) : FunctionPass(ID)
-    {
+    void stateInit(void) {
       module = NULL;
       builder = NULL;
       intTy = NULL;
       out_buf_sizeof_offset = 0;
-      printfs.clear();
       pbuf_ptr = NULL;
       index_buf_ptr = NULL;
       g1Xg2Xg3 = NULL;
@@ -357,6 +325,13 @@ error:
       totalSizeofSize = 0;
     }
 
+    PrintfParser(void) : FunctionPass(ID)
+    {
+      stateInit();
+      pbuf_global = NULL;
+      index_buf_global = NULL;
+    }
+
     ~PrintfParser(void)
     {
       for (auto &s : printfs) {
@@ -554,6 +529,7 @@ error:
 
   bool PrintfParser::runOnFunction(llvm::Function &F)
   {
+    stateInit();
     bool changed = false;
     bool hasPrintf = false;
     switch (F.getCallingConv()) {
@@ -630,29 +606,30 @@ error:
     if (!hasPrintf)
       return changed;
 
-    if (!pbuf_ptr) {
+    if (!pbuf_global) {
       /* alloc a new buffer ptr to collect the print output. */
       Type *ptrTy = Type::getInt32PtrTy(module->getContext(), 1);
-      llvm::Constant *pBuf = new GlobalVariable(*module, ptrTy, false,
+      pbuf_global= new GlobalVariable(*module, ptrTy, false,
                                 GlobalVariable::ExternalLinkage,
                                 nullptr,
                                 StringRef("__gen_ocl_printf_buf"),
                                 nullptr,
                                 GlobalVariable::NotThreadLocal,
                                 1);
-      pbuf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
     }
-    if (!index_buf_ptr) {
+    pbuf_ptr = builder->CreatePtrToInt(pbuf_global, Type::getInt32Ty(module->getContext()));
+
+    if (!index_buf_global) {
       Type *ptrTy = Type::getInt32PtrTy(module->getContext(), 1);
-      llvm::Constant *pBuf = new GlobalVariable(*module, ptrTy, false,
+      index_buf_global = new GlobalVariable(*module, ptrTy, false,
                                 GlobalVariable::ExternalLinkage,
                                 nullptr,
                                 StringRef("__gen_ocl_printf_index_buf"),
                                 nullptr,
                                 GlobalVariable::NotThreadLocal,
                                 1);
-      index_buf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
     }
+    index_buf_ptr = builder->CreatePtrToInt(index_buf_global, Type::getInt32Ty(module->getContext()));
 
     if (!wg_offset || !g1Xg2Xg3) {
       Value* op0 = NULL;
diff --git a/backend/src/llvm/llvm_sampler_fix.cpp b/backend/src/llvm/llvm_sampler_fix.cpp
index 8c76324..01db8fe 100644
--- a/backend/src/llvm/llvm_sampler_fix.cpp
+++ b/backend/src/llvm/llvm_sampler_fix.cpp
@@ -20,27 +20,8 @@
  * make sure to get correct pixel value. But for some other
  * sampler, we don't need those work around code.
  */
-#include "llvm/IR/Instructions.h"
-#include "llvm/Pass.h"
-#include "llvm/PassManager.h"
 
-#include "llvm/Config/llvm-config.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/IR/IRBuilder.h"
-#if LLVM_VERSION_MINOR >= 5
-#include "llvm/IR/CFG.h"
-#else
-#include "llvm/Support/CFG.h"
-#endif
-
-#include "llvm/Analysis/ConstantsScanner.h"
+#include "llvm_includes.hpp"
 
 #include "llvm_gen_backend.hpp"
 #include "ocl_common_defines.h"
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index bc985c6..7ee5259 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -59,39 +59,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Config/llvm-config.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
-#include "llvm/Function.h"
-#include "llvm/InstrTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#else
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-#include "llvm/Pass.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
-#include "llvm/Support/IRBuilder.h"
-#elif LLVM_VERSION_MINOR == 2
-#include "llvm/IRBuilder.h"
-#else
-#include "llvm/IR/IRBuilder.h"
-#endif /* LLVM_VERSION_MINOR <= 1 */
-
-#if LLVM_VERSION_MINOR >= 5
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/CFG.h"
-#else
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/CFG.h"
-#endif
-#include "llvm/Support/raw_ostream.h"
+#include "llvm_includes.hpp"
 
 #include "llvm/llvm_gen_backend.hpp"
 #include "sys/map.hpp"
@@ -128,7 +96,6 @@ namespace gbe {
 
     Scalarize() : FunctionPass(ID)
     {
-      initializeLoopInfoPass(*PassRegistry::getPassRegistry());
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
       initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
 #else
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 891f2a1..24d4be7 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -22,40 +22,8 @@
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
 
-#include "llvm/Config/llvm-config.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/DataLayout.h"
-#else
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/DataLayout.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-#include "llvm/PassManager.h"
-#include "llvm/Pass.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/ADT/Triple.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
-#include "llvm/Support/IRReader.h"
-#else
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/Support/SourceMgr.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
-#include "llvm/IR/IRPrintingPasses.h"
-#include "llvm/IR/Verifier.h"
-#else
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Assembly/PrintModulePass.h"
-#endif
+#include "llvm_includes.hpp"
 
-#include "llvm/Analysis/CFGPrinter.h"
 #include "llvm/llvm_gen_backend.hpp"
 #include "llvm/llvm_to_gen.hpp"
 #include "sys/cvar.hpp"
@@ -64,8 +32,6 @@
 #include "ir/function.hpp"
 #include "ir/structurizer.hpp"
 
-#include <clang/CodeGen/CodeGenAction.h>
-
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
@@ -78,11 +44,19 @@ namespace gbe
   BVAR(OCL_OUTPUT_CFG_GEN_IR, false);
   using namespace llvm;
 
-  void runFuntionPass(Module &mod, TargetLibraryInfo *libraryInfo, const DataLayout &DL)
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+  using namespace llvm::legacy;
+  #define TARGETLIBRARY  TargetLibraryInfoImpl
+#else
+  #define TARGETLIBRARY  TargetLibraryInfo
+#endif
+
+  void runFuntionPass(Module &mod, TARGETLIBRARY *libraryInfo, const DataLayout &DL)
   {
     FunctionPassManager FPM(&mod);
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
     FPM.add(new DataLayoutPass());
 #elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
     FPM.add(new DataLayoutPass(DL));
@@ -95,7 +69,11 @@ namespace gbe
 #else
     FPM.add(createVerifierPass());
 #endif
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+    FPM.add(new TargetLibraryInfoWrapperPass(*libraryInfo));
+#else
     FPM.add(new TargetLibraryInfo(*libraryInfo));
+#endif
     FPM.add(createTypeBasedAliasAnalysisPass());
     FPM.add(createBasicAliasAnalysisPass());
     FPM.add(createCFGSimplificationPass());
@@ -111,21 +89,28 @@ namespace gbe
     FPM.doFinalization();
   }
 
-  void runModulePass(Module &mod, TargetLibraryInfo *libraryInfo, const DataLayout &DL, int optLevel, bool strictMath)
+  void runModulePass(Module &mod, TARGETLIBRARY *libraryInfo, const DataLayout &DL, int optLevel, bool strictMath)
   {
-    llvm::PassManager MPM;
+    PassManager MPM;
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
     MPM.add(new DataLayoutPass());
 #elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
     MPM.add(new DataLayoutPass(DL));
 #else
     MPM.add(new DataLayout(DL));
 #endif
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+    MPM.add(new TargetLibraryInfoWrapperPass(*libraryInfo));
+#else
     MPM.add(new TargetLibraryInfo(*libraryInfo));
+#endif
     MPM.add(createTypeBasedAliasAnalysisPass());
     MPM.add(createBasicAliasAnalysisPass());
     MPM.add(createIntrinsicLoweringPass());
+    MPM.add(createStripAttributesPass());     // Strip unsupported attributes and calling conventions.
     MPM.add(createSamplerFixPass());
     MPM.add(createGlobalOptimizerPass());     // Optimize out global vars
 
@@ -135,7 +120,6 @@ namespace gbe
     MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
     MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
     MPM.add(createPruneEHPass());             // Remove dead EH info
-    MPM.add(createStripAttributesPass());     // Strip unsupported attributes and calling conventions.
     MPM.add(createBarrierNodupPass(false));   // remove noduplicate fnAttr before inlining.
     MPM.add(createFunctionInliningPass(20000));
     MPM.add(createBarrierNodupPass(true));    // restore noduplicate fnAttr after inlining.
@@ -202,7 +186,7 @@ namespace gbe
 
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
 #define OUTPUT_BITCODE(STAGE, MOD)  do {         \
-   llvm::PassManager passes__;                   \
+   PassManager passes__;           \
    if (OCL_OUTPUT_LLVM_##STAGE) {                \
      passes__.add(createPrintModulePass(*o));    \
      passes__.run(MOD);                          \
@@ -210,7 +194,7 @@ namespace gbe
  }while(0)
 #else
 #define OUTPUT_BITCODE(STAGE, MOD)  do {         \
-   llvm::PassManager passes__;                   \
+   PassManager passes__;           \
    if (OCL_OUTPUT_LLVM_##STAGE) {                \
      passes__.add(createPrintModulePass(&*o));   \
      passes__.run(MOD);                          \
@@ -260,16 +244,20 @@ namespace gbe
     Module &mod = *M.get();
     DataLayout DL(&mod);
 
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+    mod.setDataLayout(DL);
+#endif
     Triple TargetTriple(mod.getTargetTriple());
-    TargetLibraryInfo *libraryInfo = new TargetLibraryInfo(TargetTriple);
+    TARGETLIBRARY *libraryInfo = new TARGETLIBRARY(TargetTriple);
     libraryInfo->disableAllFunctions();
 
     OUTPUT_BITCODE(AFTER_LINK, mod);
 
     runFuntionPass(mod, libraryInfo, DL);
     runModulePass(mod, libraryInfo, DL, optLevel, strictMath);
-    llvm::PassManager passes;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+    PassManager passes;
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
     passes.add(new DataLayoutPass());
 #elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
     passes.add(new DataLayoutPass(DL));
diff --git a/backend/src/llvm/llvm_unroll.cpp b/backend/src/llvm/llvm_unroll.cpp
index 5d3fad8..6990e39 100644
--- a/backend/src/llvm/llvm_unroll.cpp
+++ b/backend/src/llvm/llvm_unroll.cpp
@@ -18,34 +18,9 @@
 #include "llvm/Config/llvm-config.h"
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
 #include <set>
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/Function.h"
-#include "llvm/InstrTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#else
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-#include "llvm/Pass.h"
-#if LLVM_VERSION_MINOR <= 1
-#include "llvm/Support/IRBuilder.h"
-#elif LLVM_VERSION_MINOR == 2
-#include "llvm/IRBuilder.h"
-#else
-#include "llvm/IR/IRBuilder.h"
-#endif /* LLVM_VERSION_MINOR <= 1 */
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/PassManager.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Dominators.h"
+
+#include "llvm_includes.hpp"
+
 #include "llvm/llvm_gen_backend.hpp"
 #include "sys/map.hpp"
 
@@ -61,8 +36,13 @@ namespace gbe {
        LoopPass(ID) {}
 
       void getAnalysisUsage(AnalysisUsage &AU) const {
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 7)
+        AU.addRequired<LoopInfoWrapperPass>();
+        AU.addPreserved<LoopInfoWrapperPass>();
+#else
         AU.addRequired<LoopInfo>();
         AU.addPreserved<LoopInfo>();
+#endif
         AU.addRequiredID(LoopSimplifyID);
         AU.addPreservedID(LoopSimplifyID);
         AU.addRequiredID(LCSSAID);
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
index c9aef28..eddbe95 100644
--- a/docs/NEWS.mdwn
+++ b/docs/NEWS.mdwn
@@ -1,5 +1,8 @@
 # News
 
+## Apr 19, 2016
+[Beignet 1.1.2](https://01.org/beignet/downloads/beignet-1.1.2-2016-04-19) is released. This is a bug-fix release.
+
 ## Oct 08, 2015
 [Beignet 1.1.1](https://01.org/beignet/downloads/beignet-1.1.1-2015-10-08) is released. This is a bug-fix release.
 
diff --git a/kernels/compiler_function_qualifiers.cl b/kernels/compiler_function_qualifiers.cl
index c904c84..c9f7e5d 100644
--- a/kernels/compiler_function_qualifiers.cl
+++ b/kernels/compiler_function_qualifiers.cl
@@ -1,9 +1,9 @@
 /* test OpenCL 1.1 Function Qualifiers (section 6.7) */
-kernel void compiler_function_qualifiers()
+kernel void compiler_function_qualifiers(void)
 __attribute__((vec_type_hint(float)))
 __attribute__((work_group_size_hint(4,1,1)))
 __attribute__((reqd_work_group_size(4,1,1)));
 
-kernel void compiler_function_qualifiers()
+kernel void compiler_function_qualifiers(void)
 {
 }
diff --git a/src/cl_api.c b/src/cl_api.c
index cef5bbb..5c47f81 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -3000,6 +3000,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+error:
   if(b_output_kernel_perf)
   {
     if(kernel->program->build_opts != NULL)
@@ -3007,7 +3008,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     else
       time_end(command_queue->ctx, cl_kernel_get_name(kernel), "", command_queue);
   }
-error:
+
   return err;
 }
 
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 4e4ebfb..50436fc 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -76,11 +76,9 @@ cl_command_queue_delete(cl_command_queue queue)
   assert(queue);
   if (atomic_dec(&queue->ref_n) != 1) return;
 
-  // If there is a valid last event, we need to give it a chance to
-  // call the call-back function.
-  cl_event last_event = get_last_event(queue);
-  if (last_event && last_event->user_cb)
-    cl_event_update_status(last_event, 1);
+  // If there is a list of valid events, we need to give them
+  // a chance to call the call-back function.
+  cl_event_update_last_events(queue,1);
   /* Remove it from the list */
   assert(queue->ctx);
   pthread_mutex_lock(&queue->ctx->queue_lock);
@@ -255,14 +253,11 @@ cl_command_queue_flush(cl_command_queue queue)
   int err;
   GET_QUEUE_THREAD_GPGPU(queue);
   err = cl_command_queue_flush_gpgpu(queue, gpgpu);
-  // As we don't have a deadicate timer thread to take care the possible
-  // event which has a call back function registerred and the event will
-  // be released at the call back function, no other function will access
-  // the event any more. If we don't do this here, we will leak that event
-  // and all the corresponding buffers which is really bad.
-  cl_event last_event = get_last_event(queue);
-  if (last_event && last_event->user_cb)
-    cl_event_update_status(last_event, 1);
+  // We now keep a list of uncompleted events and check if they compelte
+  // every flush. This can make sure all events created have chance to be
+  // update status, so the callback functions or reference can be handled.
+  cl_event_update_last_events(queue,0);
+
   cl_event current_event = get_current_event(queue);
   if (current_event && err == CL_SUCCESS) {
     err = cl_event_flush(current_event);
@@ -276,6 +271,7 @@ LOCAL cl_int
 cl_command_queue_finish(cl_command_queue queue)
 {
   cl_gpgpu_sync(cl_get_thread_batch_buf(queue));
+  cl_event_update_last_events(queue,1);
   return CL_SUCCESS;
 }
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 89f39b3..bbb04ab 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -338,7 +338,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
 
   /* Compute the number of HW threads we need */
   if(UNLIKELY(err = cl_kernel_work_group_sz(ker, local_wk_sz, 3, &local_sz) != CL_SUCCESS)) {
-    fprintf(stderr, "Beignet: Work group size exceed Kerne's work group size.\n");
+    fprintf(stderr, "Beignet: Work group size exceed Kernel's work group size.\n");
     return err;
   }
   kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index 3552a16..63e078f 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -287,7 +287,14 @@
    devid == PCI_CHIP_SKYLAKE_SRV_GT4)
 
 #define IS_SKYLAKE(devid) (IS_SKL_GT1(devid) || IS_SKL_GT2(devid) || IS_SKL_GT3(devid) || IS_SKL_GT4(devid))
-#define IS_GEN9(devid)      IS_SKYLAKE(devid)
+
+/* BXT */
+#define PCI_CHIP_BROXTON_P	0x5A84   /* Intel(R) BXT-P for mobile desktop */
+
+#define IS_BROXTON(devid)               \
+  (devid == PCI_CHIP_BROXTON_P)
+
+#define IS_GEN9(devid)      (IS_SKYLAKE(devid) || IS_BROXTON(devid))
 
 #endif /* __CL_DEVICE_DATA_H__ */
 
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 1778292..7b47c21 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -198,6 +198,17 @@ static struct _cl_device_id intel_skl_gt4_device = {
 #include "cl_gen75_device.h"
 };
 
+static struct _cl_device_id intel_bxt_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 18,
+  .max_thread_per_unit = 6,
+  .sub_slice_count = 3,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
 LOCAL cl_device_id
 cl_get_gt_device(void)
 {
@@ -518,6 +529,15 @@ skl_gt4_break:
       cl_intel_platform_enable_fp16_extension(ret);
       break;
 
+    case PCI_CHIP_BROXTON_P:
+      DECL_INFO_STRING(bxt_break, intel_bxt_device, name, "Intel(R) HD Graphics Broxton-P");
+bxt_break:
+      intel_bxt_device.device_id = device_id;
+      intel_bxt_device.platform = cl_get_platform_default();
+      ret = &intel_bxt_device;
+      cl_intel_platform_enable_fp16_extension(ret);
+      break;
+
     case PCI_CHIP_SANDYBRIDGE_BRIDGE:
     case PCI_CHIP_SANDYBRIDGE_GT1:
     case PCI_CHIP_SANDYBRIDGE_GT2:
@@ -622,6 +642,7 @@ cl_self_test(cl_device_id device, cl_self_test_res atomic_in_l3_flag)
                   // Atomic fail need to test SLM again with atomic in L3 feature disabled.
                   tested = 0;
                 }
+                clReleaseEvent(kernel_finished);
               }
             }
             clReleaseMemObject(buffer);
@@ -731,7 +752,8 @@ cl_get_device_info(cl_device_id     device,
                device != &intel_skl_gt1_device &&
                device != &intel_skl_gt2_device &&
                device != &intel_skl_gt3_device &&
-               device != &intel_skl_gt4_device
+               device != &intel_skl_gt4_device &&
+               device != &intel_bxt_device
                ))
     return CL_INVALID_DEVICE;
 
@@ -802,6 +824,7 @@ cl_get_device_info(cl_device_id     device,
     DECL_STRING_FIELD(VERSION, version)
     DECL_STRING_FIELD(PROFILE, profile)
     DECL_STRING_FIELD(OPENCL_C_VERSION, opencl_c_version)
+    DECL_STRING_FIELD(SPIR_VERSIONS, spir_versions)
     DECL_STRING_FIELD(EXTENSIONS, extensions);
     DECL_STRING_FIELD(BUILT_IN_KERNELS, built_in_kernels)
     DECL_FIELD(PARENT_DEVICE, parent_device)
@@ -842,7 +865,9 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
                device != &intel_skl_gt1_device &&
                device != &intel_skl_gt2_device &&
                device != &intel_skl_gt3_device &&
-               device != &intel_skl_gt4_device))
+               device != &intel_skl_gt4_device &&
+               device != &intel_bxt_device
+               ))
     return CL_INVALID_DEVICE;
   if (ver == NULL)
     return CL_SUCCESS;
@@ -857,7 +882,8 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
         || device == &intel_brw_gt3_device || device == &intel_chv_device) {
     *ver = 8;
   } else if (device == &intel_skl_gt1_device || device == &intel_skl_gt2_device
-        || device == &intel_skl_gt3_device || device == &intel_skl_gt4_device) {
+        || device == &intel_skl_gt3_device || device == &intel_skl_gt4_device
+        || device == &intel_bxt_device) {
     *ver = 9;
   } else
     return CL_INVALID_VALUE;
@@ -945,7 +971,8 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
                device != &intel_skl_gt1_device &&
                device != &intel_skl_gt2_device &&
                device != &intel_skl_gt3_device &&
-               device != &intel_skl_gt4_device))
+               device != &intel_skl_gt4_device &&
+               device != &intel_bxt_device))
     return CL_INVALID_DEVICE;
 
   CHECK_KERNEL(kernel);
@@ -962,7 +989,16 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
         return CL_SUCCESS;
       }
     }
-    DECL_FIELD(PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device->preferred_wg_sz_mul)
+    case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
+    {
+      if (param_value && param_value_size < sizeof(size_t))
+        return CL_INVALID_VALUE;
+      if (param_value_size_ret != NULL)
+        *param_value_size_ret = sizeof(size_t);
+      if (param_value)
+        *(size_t*)param_value = interp_kernel_get_simd_width(kernel->opaque);
+      return CL_SUCCESS;
+    }
     case CL_KERNEL_LOCAL_MEM_SIZE:
     {
       size_t local_mem_sz =  interp_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz;
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index b5db91c..46f9810 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -97,6 +97,7 @@ struct _cl_device_id {
   const char *opencl_c_version;
   const char extensions[256];
   const char *driver_version;
+  const char *spir_versions;
   const char *built_in_kernels;
   size_t name_sz;
   size_t vendor_sz;
@@ -105,9 +106,8 @@ struct _cl_device_id {
   size_t opencl_c_version_sz;
   size_t extensions_sz;
   size_t driver_version_sz;
+  size_t spir_versions_sz;
   size_t built_in_kernels_sz;
-  /* Kernel specific info that we're assigning statically */
-  size_t preferred_wg_sz_mul;
   /* SubDevice specific info */
   cl_device_id parent_device;
   cl_uint      partition_max_sub_device;
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 1ab4dff..4ffca09 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -138,7 +138,7 @@ typedef void (cl_gpgpu_sync_cb)(void*);
 extern cl_gpgpu_sync_cb *cl_gpgpu_sync;
 
 /* Bind a regular unformatted buffer */
-typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, uint32_t size, uint8_t bti);
+typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, size_t size, uint8_t bti);
 extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
 
 /* bind samplers defined in both kernel and kernel args. */
diff --git a/src/cl_event.c b/src/cl_event.c
index bf44197..3391669 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -28,6 +28,33 @@
 #include <assert.h>
 #include <stdio.h>
 
+void cl_event_update_last_events(cl_command_queue queue, int wait)
+{
+  cl_event last_event = get_last_event(queue);
+  if(!last_event) return;
+  cl_event next, now;
+  now = last_event;
+  while(now){
+    next = now->last_next;//get next first in case set status maintain it
+    cl_event_update_status(now,wait);//update event status
+    now = next;
+  }
+}
+
+void cl_event_insert_last_events(cl_command_queue queue,cl_event event)
+{
+  if(!event) return;
+  cl_event last_event = get_last_event(queue);
+  if(last_event){
+    cl_event now = last_event;
+    while(now->last_next)
+      now = now->last_next;
+    now->last_next = event;
+    event->last_prev = now;
+  }
+  else set_last_event(queue,event);
+}
+
 inline cl_bool
 cl_event_is_gpu_command_type(cl_command_type type)
 {
@@ -56,7 +83,7 @@ int cl_event_flush(cl_event event)
     event->gpgpu = NULL;
   }
   cl_gpgpu_event_flush(event->gpgpu_event);
-  set_last_event(event->queue, event);
+  cl_event_insert_last_events(event->queue,event);
   return err;
 }
 
@@ -117,9 +144,6 @@ void cl_event_delete(cl_event event)
   if (atomic_dec(&event->ref_n) > 1)
     return;
 
-  if(event->queue && get_last_event(event->queue) == event)
-    set_last_event(event->queue, NULL);
-
   /* Call all user's callback if haven't execute */
   cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE status will force all callbacks that are not executed to run
 
@@ -223,8 +247,10 @@ cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list,
     }
     if(event && event == &event_wait_list[i])
       goto error;
-    if(event_wait_list[i]->ctx != ctx)
-      goto error;
+    if(event_wait_list[i]->ctx != ctx) {
+      err = CL_INVALID_CONTEXT;
+      goto exit;
+    }
   }
 
 exit:
@@ -525,8 +551,18 @@ void cl_event_set_status(cl_event event, cl_int status)
     event->waits_head = NULL;
   }
 
-  if(event->status <= CL_COMPLETE)
+  if(event->status <= CL_COMPLETE){
+    /* Maintain the last_list when event completed*/
+    if (event->last_prev)
+      event->last_prev->last_next = event->last_next;
+    if (event->last_next)
+      event->last_next->last_prev = event->last_prev;
+    if(event->queue && get_last_event(event->queue) == event)
+      set_last_event(event->queue, event->last_next);
+    event->last_prev = NULL;
+    event->last_next = NULL;
     cl_event_delete(event);
+  }
 }
 
 void cl_event_update_status(cl_event event, int wait)
@@ -568,9 +604,7 @@ cl_int cl_event_marker_with_wait_list(cl_command_queue queue,
     return CL_SUCCESS;
   }
 
-  cl_event last_event = get_last_event(queue);
-  if(last_event && last_event->gpgpu_event)
-    cl_gpgpu_event_update_status(last_event->gpgpu_event, 1);
+  cl_event_update_last_events(queue,1);
 
   cl_event_set_status(e, CL_COMPLETE);
   return CL_SUCCESS;
@@ -605,9 +639,7 @@ cl_int cl_event_barrier_with_wait_list(cl_command_queue queue,
     return CL_SUCCESS;
   }
 
-  cl_event last_event = get_last_event(queue);
-  if(last_event && last_event->gpgpu_event)
-    cl_gpgpu_event_update_status(last_event->gpgpu_event, 1);
+  cl_event_update_last_events(queue,1);
 
   cl_event_set_status(e, CL_COMPLETE);
   return CL_SUCCESS;
diff --git a/src/cl_event.h b/src/cl_event.h
index f7bf09f..67fab19 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -71,6 +71,7 @@ struct _cl_event {
   cl_bool            emplict;     /* Identify this event whether created by api emplict*/
   cl_ulong           timestamp[4];/* The time stamps for profiling. */
   cl_ulong	     queued_timestamp;
+  cl_event   last_next, last_prev;/* We need a list to monitor untouchable api event*/
 };
 
 /* Create a new event object */
@@ -115,5 +116,9 @@ cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event);
 cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event);
 /* flush the event's pending gpgpu batch buffer and notify driver this gpgpu event has been flushed. */
 cl_int cl_event_flush(cl_event event);
+/* monitor or block wait all events in the last_event list */
+void cl_event_update_last_events(cl_command_queue queuet, int wait);
+/* insert the event into the last_event list in queue */
+void cl_event_insert_last_events(cl_command_queue queue, cl_event event);
 #endif /* __CL_EVENT_H__ */
 
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index bd87cc4..f523228 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -39,7 +39,6 @@
 .native_vector_width_float = 4,
 .native_vector_width_double = 2,
 .native_vector_width_half = 8,
-.preferred_wg_sz_mul = 16,
 .address_bits = 32,
 .max_mem_alloc_size = 512 * 1024 * 1024,
 .image_support = CL_TRUE,
@@ -119,6 +118,7 @@ DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;"
                                    "__cl_fill_image_3d;")
 
 DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)
+DECL_INFO_STRING(spir_versions, "1.2")
 #undef DECL_INFO_STRING
 .parent_device = NULL,
 .partition_max_sub_device = 1,
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 507c910..035a103 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -85,8 +85,6 @@ intel_driver_delete(intel_driver_t *driver)
   if (driver == NULL)
     return;
 
-  if (driver->bufmgr)
-    drm_intel_bufmgr_destroy(driver->bufmgr);
   cl_free(driver);
 }
 
@@ -257,6 +255,10 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
 static void
 intel_driver_close(intel_driver_t *intel)
 {
+  //Due to the drm change about the test usrptr, we need to destroy the bufmgr
+  //befor the driver was closed, otherwise the test usrptr will not be freed.
+  if (intel->bufmgr)
+    drm_intel_bufmgr_destroy(intel->bufmgr);
 #ifdef HAS_X11
   if(intel->dri_ctx) dri_state_release(intel->dri_ctx);
   if(intel->x11_display) XCloseDisplay(intel->x11_display);
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 901bd98..1a76c99 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -26,6 +26,7 @@
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/utsname.h>
 #include <fcntl.h>
 #include <stddef.h>
 #include <errno.h>
@@ -86,7 +87,7 @@ typedef void (intel_gpgpu_set_base_address_t)(intel_gpgpu_t *gpgpu);
 intel_gpgpu_set_base_address_t *intel_gpgpu_set_base_address = NULL;
 
 typedef void (intel_gpgpu_setup_bti_t)(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
-                                       uint32_t size, unsigned char index, uint32_t format);
+                                       size_t size, unsigned char index, uint32_t format);
 intel_gpgpu_setup_bti_t *intel_gpgpu_setup_bti = NULL;
 
 
@@ -210,7 +211,7 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
     return;
 
   if(gpgpu->batch && gpgpu->batch->buffer &&
-     !drm_intel_bo_busy(gpgpu->batch->buffer)) {
+     drm_intel_bo_busy(gpgpu->batch->buffer)) {
     TRY_ALLOC_NO_ERR (node, CALLOC(struct intel_gpgpu_node));
     node->gpgpu = gpgpu;
     node->next = NULL;
@@ -282,9 +283,22 @@ intel_gpgpu_get_cache_ctrl_gen8()
 static uint32_t
 intel_gpgpu_get_cache_ctrl_gen9()
 {
-  //Pre-defined cache control registers 9:
+  //Kernel-defined cache control registers 2:
   //L3CC: WB; LeCC: WB; TC: LLC/eLLC;
-  return (0x9 << 1);
+  int major = 0, minor = 0;
+  int mocs_index = 0x2;
+
+  struct utsname buf;
+  uname(&buf);
+  sscanf(buf.release, "%d.%d", &major, &minor);
+  //From linux 4.3, kernel redefined the mocs table's value,
+  //But before 4.3, still used the hw defautl value.
+  if(strcmp(buf.sysname, "Linux") == 0 &&
+     major == 4 && minor < 3) { /* linux kernel support skl from  4.x, so check from 4 */
+    mocs_index = 0x9;
+  }
+
+  return (mocs_index << 1);
 }
 
 static void
@@ -1000,9 +1014,10 @@ intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t b
 
 static void
 intel_gpgpu_setup_bti_gen7(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
-                                   uint32_t size, unsigned char index, uint32_t format)
+                                   size_t size, unsigned char index, uint32_t format)
 {
-  uint32_t s = size - 1;
+  assert(size <= (2ul<<30));
+  size_t s = size - 1;
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
   gen7_surface_state_t *ss0 = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
   memset(ss0, 0, sizeof(gen7_surface_state_t));
@@ -1030,9 +1045,10 @@ intel_gpgpu_setup_bti_gen7(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t int
 
 static void
 intel_gpgpu_setup_bti_gen75(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
-                                   uint32_t size, unsigned char index, uint32_t format)
+                                   size_t size, unsigned char index, uint32_t format)
 {
-  uint32_t s = size - 1;
+  assert(size <= (2ul<<30));
+  size_t s = size - 1;
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
   gen7_surface_state_t *ss0 = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
   memset(ss0, 0, sizeof(gen7_surface_state_t));
@@ -1066,9 +1082,10 @@ intel_gpgpu_setup_bti_gen75(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t in
 
 static void
 intel_gpgpu_setup_bti_gen8(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
-                                   uint32_t size, unsigned char index, uint32_t format)
+                                   size_t size, unsigned char index, uint32_t format)
 {
-  uint32_t s = size - 1;
+  assert(size <= (2ul<<30));
+  size_t s = size - 1;
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
   gen8_surface_state_t *ss0 = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)];
   memset(ss0, 0, sizeof(gen8_surface_state_t));
@@ -1143,7 +1160,8 @@ static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_
         IS_HASWELL(gpgpu->drv->device_id) ||
         IS_BROADWELL(gpgpu->drv->device_id) ||
         IS_CHERRYVIEW(gpgpu->drv->device_id) ||
-        IS_SKYLAKE(gpgpu->drv->device_id))) &&
+        IS_SKYLAKE(gpgpu->drv->device_id) ||
+        IS_BROXTON(gpgpu->drv->device_id))) &&
       index >= BTI_WORKAROUND_IMAGE_OFFSET + BTI_RESERVED_NUM &&
       type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
     surface_type = I965_SURFACE_2D;
@@ -1395,7 +1413,7 @@ intel_gpgpu_bind_image_gen9(intel_gpgpu_t *gpgpu,
 
 static void
 intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
-                     uint32_t internal_offset, uint32_t size, uint8_t bti)
+                     uint32_t internal_offset, size_t size, uint8_t bti)
 {
   assert(gpgpu->binded_n < max_buf_n);
   gpgpu->binded_buf[gpgpu->binded_n] = buf;
@@ -2178,7 +2196,7 @@ intel_set_gpgpu_callbacks(int device_id)
 	intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7;
     return;
   }
-  if (IS_SKYLAKE(device_id)) {
+  if (IS_SKYLAKE(device_id) || IS_BROXTON(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen9;
     intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8;
     cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen9;
diff --git a/utests/compiler_cl_finish.cpp b/utests/compiler_cl_finish.cpp
index 7c7dee3..c637ecc 100644
--- a/utests/compiler_cl_finish.cpp
+++ b/utests/compiler_cl_finish.cpp
@@ -9,7 +9,7 @@ static void compiler_cl_finish(void)
 {
   const size_t n = 16*1024*1024;
   struct timeval t1, t2;
-  float t_fin, t_map_w_fin,t_map_wo_fin;
+  float t_map_w_fin,t_map_wo_fin;
 
   // Setup kernel and buffers
   OCL_CREATE_KERNEL("test_cl_finish");
@@ -26,10 +26,7 @@ static void compiler_cl_finish(void)
 
   // 1st time map after clFinish
   OCL_NDRANGE(1);
-  T_GET(t1);
   OCL_FINISH();
-  T_GET(t2);
-  t_fin = T_LAPSE(t1, t2);
 
   T_GET(t1);
   OCL_MAP_BUFFER(0);
@@ -43,7 +40,7 @@ static void compiler_cl_finish(void)
   T_GET(t2);
   t_map_wo_fin = T_LAPSE(t1, t2);
 
-  OCL_ASSERT(t_fin > t_map_w_fin && t_map_wo_fin > t_map_w_fin);
+  OCL_ASSERT(t_map_wo_fin > t_map_w_fin);
   OCL_UNMAP_BUFFER(0);
 }
 
diff --git a/utests/profiling_exec.cpp b/utests/profiling_exec.cpp
index afa55ba..4232772 100644
--- a/utests/profiling_exec.cpp
+++ b/utests/profiling_exec.cpp
@@ -85,6 +85,7 @@ static void profiling_exec(void)
     OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &time_start, NULL);
     OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time_end, NULL);
 
+    clReleaseEvent(exec_event);
     check_profiling_time(time_queue, time_submit, time_start, time_end);
 
     // Compare

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git



More information about the Pkg-opencl-commits mailing list