[beignet] 01/09: New upstream version 1.3.2
Rebecca Palmer
rnpalmer-guest at moszumanska.debian.org
Sat Oct 28 07:34:13 UTC 2017
This is an automated email from the git hooks/post-receive script.
rnpalmer-guest pushed a commit to branch master
in repository beignet.
commit e55975147457d5a666c2744da5caef738b44842d
Author: Rebecca N. Palmer <rebecca_palmer at zoho.com>
Date: Thu Oct 26 22:48:43 2017 +0100
New upstream version 1.3.2
---
CMakeLists.txt | 2 +-
backend/src/backend/context.cpp | 6 +-
backend/src/backend/gen75_encoder.cpp | 10 +-
backend/src/backend/gen7_encoder.cpp | 10 +-
backend/src/backend/gen7_instruction.hpp | 5 +-
backend/src/backend/gen9_context.cpp | 4 +
backend/src/backend/gen9_context.hpp | 14 ++
backend/src/backend/gen_context.cpp | 38 +++--
backend/src/backend/gen_insn_selection.cpp | 80 +++++++++
backend/src/backend/gen_insn_selection.hpp | 7 +
.../src/backend/gen_insn_selection_optimize.cpp | 67 +++++++-
backend/src/backend/gen_insn_selection_output.cpp | 42 +++++
backend/src/backend/gen_program.cpp | 116 ++++++++-----
backend/src/backend/program.cpp | 163 +++++++++---------
backend/src/backend/program.h | 10 +-
backend/src/backend/program.hpp | 4 +-
backend/src/ir/function.hpp | 2 +-
backend/src/ir/half.cpp | 20 +++
backend/src/ir/lowering.cpp | 17 +-
backend/src/ir/profiling.cpp | 1 +
backend/src/ir/unit.hpp | 4 +-
backend/src/libocl/CMakeLists.txt | 4 +-
backend/src/libocl/include/ocl.h | 4 +
backend/src/libocl/include/ocl_enqueue.h | 41 +++--
backend/src/libocl/include/ocl_image.h | 2 +
backend/src/libocl/include/ocl_misc.h | 2 +
backend/src/libocl/src/ocl_enqueue.cl | 54 +++---
backend/src/libocl/src/ocl_memcpy.cl | 3 +-
backend/src/libocl/src/ocl_misc.cl | 62 +++----
backend/src/libocl/src/ocl_sampler.ll | 10 ++
backend/src/libocl/src/ocl_sampler_20.ll | 10 ++
backend/src/llvm/ExpandLargeIntegers.cpp | 6 +-
backend/src/llvm/ExpandUtils.cpp | 4 +
backend/src/llvm/PromoteIntegers.cpp | 5 +
backend/src/llvm/StripAttributes.cpp | 15 +-
backend/src/llvm/llvm_barrier_nodup.cpp | 11 +-
backend/src/llvm/llvm_bitcode_link.cpp | 67 ++++++--
backend/src/llvm/llvm_device_enqueue.cpp | 52 +++---
backend/src/llvm/llvm_gen_backend.cpp | 188 ++++++++++++++-------
backend/src/llvm/llvm_gen_backend.hpp | 13 +-
backend/src/llvm/llvm_gen_ocl_function.hxx | 4 +
backend/src/llvm/llvm_includes.hpp | 20 ++-
backend/src/llvm/llvm_intrinsic_lowering.cpp | 7 +-
backend/src/llvm/llvm_loadstore_optimization.cpp | 14 +-
backend/src/llvm/llvm_passes.cpp | 45 +++--
backend/src/llvm/llvm_printf_parser.cpp | 16 +-
backend/src/llvm/llvm_profiling.cpp | 46 +++--
backend/src/llvm/llvm_sampler_fix.cpp | 51 +++++-
backend/src/llvm/llvm_scalarize.cpp | 8 +-
backend/src/llvm/llvm_to_gen.cpp | 96 +++++------
backend/src/llvm/llvm_to_gen.hpp | 8 +-
backend/src/llvm/llvm_unroll.cpp | 21 ++-
docs/Beignet.mdwn | 2 +-
docs/NEWS.mdwn | 3 +
docs/howto/gl-buffer-sharing-howto.mdwn | 14 +-
include/CL/cl_intel.h | 6 +
kernels/compiler_if_else.cl | 2 -
kernels/compiler_remove_negative_add.cl | 4 +
kernels/compiler_reqd_sub_group_size.cl | 5 +
kernels/compiler_sqrt_div.cl | 8 +
kernels/test_fill_gl_image.cl | 2 +-
src/CMakeLists.txt | 4 +-
src/cl_context.c | 58 +++----
src/cl_context.h | 2 -
src/cl_device_data.h | 15 +-
src/cl_device_id.c | 78 ++++++++-
src/cl_device_id.h | 2 +
src/cl_extensions.c | 10 +-
src/cl_extensions.h | 3 +-
src/cl_gbe_loader.cpp | 5 +
src/cl_gbe_loader.h | 1 +
src/cl_gl_api.c | 150 ++++++++++++++++
src/cl_gt_device.h | 2 +
src/cl_program.c | 57 ++-----
src/intel/intel_driver.c | 31 ++--
src/intel/intel_gpgpu.c | 2 +
utests/CMakeLists.txt | 5 +-
utests/compiler_if_else.cpp | 15 +-
utests/compiler_remove_negative_add.cpp | 40 +++++
utests/compiler_reqd_sub_group_size.cpp | 46 +++++
utests/compiler_sqrt_div.cpp | 61 +++++++
utests/enqueue_built_in_kernels.cpp | 1 +
utests/utest_helper.cpp | 20 +++
utests/utest_helper.hpp | 3 +
84 files changed, 1569 insertions(+), 599 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index face3ce..c11acbb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -276,7 +276,7 @@ ENDIF(CAN_OPENCL_20)
set (LIBCL_DRIVER_VERSION_MAJOR 1)
set (LIBCL_DRIVER_VERSION_MINOR 3)
-set (LIBCL_DRIVER_VERSION_PATCH 1)
+set (LIBCL_DRIVER_VERSION_PATCH 2)
if (ENABLE_OPENCL_20)
set (LIBCL_C_VERSION_MAJOR 2)
set (LIBCL_C_VERSION_MINOR 0)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index e9ddd17..c9500c8 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -340,7 +340,6 @@ namespace gbe
///////////////////////////////////////////////////////////////////////////
// Generic Context (shared by the simulator and the HW context)
///////////////////////////////////////////////////////////////////////////
- IVAR(OCL_SIMD_WIDTH, 8, 15, 16);
Context::Context(const ir::Unit &unit, const std::string &name) :
unit(unit), fn(*unit.getFunction(name)), name(name), liveness(NULL), dag(NULL), useDWLabel(false)
@@ -361,10 +360,7 @@ namespace gbe
}
void Context::startNewCG(uint32_t simdWidth) {
- if (simdWidth == 0 || OCL_SIMD_WIDTH != 15)
- this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
- else
- this->simdWidth = simdWidth;
+ this->simdWidth = simdWidth;
GBE_SAFE_DELETE(this->registerAllocator);
GBE_SAFE_DELETE(this->scratchAllocator);
GBE_ASSERT(dag != NULL && liveness != NULL);
diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
index b82cc43..06cca3c 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -53,8 +53,14 @@ namespace gbe
gen7_insn->header.quarter_control = this->curr.quarterControl;
gen7_insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
gen7_insn->header.mask_control = this->curr.noMask;
- gen7_insn->bits2.ia1.flag_reg_nr = this->curr.flag;
- gen7_insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+ if (insn->header.opcode == GEN_OPCODE_MAD || insn->header.opcode == GEN_OPCODE_LRP)
+ {
+ gen7_insn->bits1.da3src.flag_reg_nr = this->curr.flag;
+ gen7_insn->bits1.da3src.flag_sub_reg_nr = this->curr.subFlag;
+ } else {
+ gen7_insn->bits2.ia1.flag_reg_nr = this->curr.flag;
+ gen7_insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+ }
if (this->curr.predicate != GEN_PREDICATE_NONE) {
gen7_insn->header.predicate_control = this->curr.predicate;
gen7_insn->header.predicate_inverse = this->curr.inversePredicate;
diff --git a/backend/src/backend/gen7_encoder.cpp b/backend/src/backend/gen7_encoder.cpp
index 4b2cd9a..d526f5d 100644
--- a/backend/src/backend/gen7_encoder.cpp
+++ b/backend/src/backend/gen7_encoder.cpp
@@ -46,8 +46,14 @@ namespace gbe
gen7_insn->header.quarter_control = this->curr.quarterControl;
gen7_insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
gen7_insn->header.mask_control = this->curr.noMask;
- gen7_insn->bits2.ia1.flag_reg_nr = this->curr.flag;
- gen7_insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+ if (insn->header.opcode == GEN_OPCODE_MAD || insn->header.opcode == GEN_OPCODE_LRP)
+ {
+ gen7_insn->bits1.da3src.flag_reg_nr = this->curr.flag;
+ gen7_insn->bits1.da3src.flag_sub_reg_nr = this->curr.subFlag;
+ } else {
+ gen7_insn->bits2.ia1.flag_reg_nr = this->curr.flag;
+ gen7_insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+ }
if (this->curr.predicate != GEN_PREDICATE_NONE) {
gen7_insn->header.predicate_control = this->curr.predicate;
gen7_insn->header.predicate_inverse = this->curr.inversePredicate;
diff --git a/backend/src/backend/gen7_instruction.hpp b/backend/src/backend/gen7_instruction.hpp
index 7d7eada..c985fb8 100644
--- a/backend/src/backend/gen7_instruction.hpp
+++ b/backend/src/backend/gen7_instruction.hpp
@@ -142,8 +142,9 @@ union Gen7NativeInstruction
struct {
uint32_t dest_reg_file:1;
- uint32_t flag_subreg_num:1;
- uint32_t pad0:2;
+ uint32_t flag_sub_reg_nr:1;
+ uint32_t flag_reg_nr:1;
+ uint32_t pad0:1;
uint32_t src0_abs:1;
uint32_t src0_negate:1;
uint32_t src1_abs:1;
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index 483b2c3..2ce53b6 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -236,4 +236,8 @@ namespace gbe
this->sel = GBE_NEW(SelectionKbl, *this);
}
+ void GlkContext::newSelection(void) {
+ this->sel = GBE_NEW(SelectionGlk, *this);
+ }
+
}
diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp
index 9977e9a..0476661 100644
--- a/backend/src/backend/gen9_context.hpp
+++ b/backend/src/backend/gen9_context.hpp
@@ -82,5 +82,19 @@ namespace gbe
private:
virtual void newSelection(void);
};
+
+ /* This class is used to implement the geminilake
+ specific logic for context. */
+ class GlkContext : public BxtContext
+ {
+ public:
+ virtual ~GlkContext(void) { };
+ GlkContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+ : BxtContext(unit, name, deviceID, relaxMath) {
+ };
+
+ private:
+ virtual void newSelection(void);
+ };
}
#endif /* __GBE_GEN9_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index bb104cf..874e79f 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1949,23 +1949,33 @@ namespace gbe
indirect_src = GenRegister::indirect(dst.type, 0, GEN_WIDTH_1,
GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL, GEN_HORIZONTAL_STRIDE_0);
- p->push();
- p->curr.execWidth = 8;
- p->curr.quarterControl = GEN_COMPRESSION_Q1;
- p->MOV(a0, tmp);
- p->MOV(dst, indirect_src);
- p->pop();
-
- if (simdWidth == 16) {
+ if (sel->isScalarReg(dst.reg())) {
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(a0, tmp);
+ p->MOV(dst, indirect_src);
+ p->pop();
+ } else {
p->push();
p->curr.execWidth = 8;
- p->curr.quarterControl = GEN_COMPRESSION_Q2;
-
- const GenRegister nextDst = GenRegister::Qn(dst, 1);
- const GenRegister nextOffset = GenRegister::Qn(tmp, 1);
- p->MOV(a0, nextOffset);
- p->MOV(nextDst, indirect_src);
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ p->MOV(a0, tmp);
+ p->MOV(dst, indirect_src);
p->pop();
+
+ if (simdWidth == 16) {
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+
+ const GenRegister nextDst = GenRegister::Qn(dst, 1);
+ const GenRegister nextOffset = GenRegister::Qn(tmp, 1);
+ p->MOV(a0, nextOffset);
+ p->MOV(nextDst, indirect_src);
+ p->pop();
+ }
}
}
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 22b0ddc..e91a14b 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2859,6 +2859,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
opt_features = SIOF_LOGICAL_SRCMOD;
}
+ SelectionGlk::SelectionGlk(GenContext &ctx) : Selection(ctx) {
+ this->opaque->setHas32X32Mul(true);
+ this->opaque->setHasLongType(true);
+ this->opaque->setLongRegRestrict(true);
+ this->opaque->setHasDoubleType(true);
+ this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
+ this->opaque->setSlowByteGather(false);
+ this->opaque->setHasHalfType(true);
+ opt_features = SIOF_LOGICAL_SRCMOD | SIOF_OP_MOV_LONG_REG_RESTRICT;
+ }
+
void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
uint32_t bti, bool is3D) {
uint32_t elemID = 0;
@@ -3741,6 +3752,74 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
}
};
+ /*! there some patterns like:
+ sqrt r1, r2;
+ load r4, 1.0; ===> rqrt r3, r2
+ div r3, r4, r1; */
+ class SqrtDivInstructionPattern : public SelectionPattern {
+ public:
+ /*! Register the pattern for all opcodes of the family */
+ SqrtDivInstructionPattern(void) : SelectionPattern(1, 1) { this->opcodes.push_back(ir::OP_DIV); }
+
+ /*! Implements base class */
+ virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+ using namespace ir;
+
+ // We are good to try. We need a MUL for one of the two sources
+ const ir::BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
+ if (insn.getType() != TYPE_FLOAT)
+ return false;
+ SelectionDAG *child0 = dag.child[0];
+ SelectionDAG *child1 = dag.child[1];
+ const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_FLOAT);
+
+ if (child1 && child1->insn.getOpcode() == OP_SQR) {
+ GBE_ASSERT(cast<ir::UnaryInstruction>(child1->insn).getType() == TYPE_FLOAT);
+ GenRegister srcSQR = sel.selReg(child1->insn.getSrc(0), TYPE_FLOAT);
+ const GenRegister tmp = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_FLOAT);
+ const GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_FLOAT);
+ float immVal = 0.0f;
+
+ if (child0 && child0->insn.getOpcode() == OP_LOADI) {
+ const auto &loadimm = cast<LoadImmInstruction>(child0->insn);
+ const Immediate imm = loadimm.getImmediate();
+ const Type type = imm.getType();
+ if (type == TYPE_FLOAT)
+ immVal = imm.getFloatValue();
+ else if (type == TYPE_S32 || type == TYPE_U32)
+ immVal = imm.getIntegerValue();
+ }
+
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)))
+ sel.curr.execWidth = 1;
+
+ if (immVal == 1.0f) {
+ sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, srcSQR);
+ if (child1->child[0])
+ child1->child[0]->isRoot = 1;
+ } else {
+ sel.MATH(tmp, GEN_MATH_FUNCTION_RSQ, srcSQR);
+ if (immVal != 0.0f) {
+ GenRegister isrc = GenRegister::immf(immVal);
+ sel.MUL(dst, tmp, isrc);
+ } else {
+ sel.MUL(dst, src0, tmp);
+ if (child0)
+ child0->isRoot = 1;
+ }
+
+ if (child1->child[0])
+ child1->child[0]->isRoot = 1;
+ }
+ sel.pop();
+
+ return true;
+ }
+ return false;
+ }
+ };
+
/*! sel.{le,l,ge...} like patterns */
class SelectModifierInstructionPattern : public SelectionPattern
{
@@ -8078,6 +8157,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
SelectionLibrary::SelectionLibrary(void) {
this->insert<UnaryInstructionPattern>();
+ this->insert<SqrtDivInstructionPattern>();
this->insert<BinaryInstructionPattern>();
this->insert<TypedWriteInstructionPattern>();
this->insert<SyncInstructionPattern>();
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index a99b8a9..8f34678 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -374,6 +374,13 @@ namespace gbe
SelectionKbl(GenContext &ctx);
};
+ class SelectionGlk: public Selection
+ {
+ public:
+ /*! Initialize internal structures used for the selection */
+ SelectionGlk(GenContext &ctx);
+ };
+
} /* namespace gbe */
#endif /* __GEN_INSN_SELECTION_HPP__ */
diff --git a/backend/src/backend/gen_insn_selection_optimize.cpp b/backend/src/backend/gen_insn_selection_optimize.cpp
index d2e0fb9..2b2f8bc 100644
--- a/backend/src/backend/gen_insn_selection_optimize.cpp
+++ b/backend/src/backend/gen_insn_selection_optimize.cpp
@@ -9,6 +9,7 @@
#include <algorithm>
#include <climits>
#include <map>
+#include <math.h>
namespace gbe
{
@@ -74,8 +75,7 @@ namespace gbe
const GenRegister& replacement) :
insn(insn), intermedia(intermedia), replacement(replacement)
{
- assert(insn.opcode == SEL_OP_MOV);
- assert(&(insn.src(0)) == &replacement);
+ assert(insn.opcode == SEL_OP_MOV || insn.opcode == SEL_OP_ADD);
assert(&(insn.dst(0)) == &intermedia);
this->elements = CalculateElements(intermedia, insn.state.execWidth);
replacementOverwritten = false;
@@ -102,6 +102,7 @@ namespace gbe
void doReplacement(ReplaceInfo* info);
bool CanBeReplaced(const ReplaceInfo* info, const SelectionInstruction& insn, const GenRegister& var);
void cleanReplaceInfoMap();
+ void doNegAddOptimization(SelectionInstruction &insn);
SelectionBlock &bb;
const ir::Liveness::LiveOut& liveout;
@@ -159,8 +160,13 @@ namespace gbe
void SelBasicBlockOptimizer::addToReplaceInfoMap(SelectionInstruction& insn)
{
- assert(insn.opcode == SEL_OP_MOV);
- const GenRegister& src = insn.src(0);
+ assert(insn.opcode == SEL_OP_MOV || insn.opcode == SEL_OP_ADD);
+ GenRegister &src = insn.src(0);
+ if (insn.opcode == SEL_OP_ADD) {
+ if (src.file == GEN_IMMEDIATE_VALUE)
+ src = insn.src(1);
+ }
+
const GenRegister& dst = insn.dst(0);
if (src.type != dst.type || src.file != dst.file)
return;
@@ -184,6 +190,40 @@ namespace gbe
if (insn.opcode == SEL_OP_BSWAP) //should remove once bswap issue is fixed
return false;
+ //the src modifier is not supported by the following instructions
+ if(info->replacement.negation || info->replacement.absolute)
+ {
+ switch(insn.opcode)
+ {
+ case SEL_OP_MATH:
+ {
+ switch(insn.extra.function)
+ {
+ case GEN_MATH_FUNCTION_INT_DIV_QUOTIENT:
+ case GEN_MATH_FUNCTION_INT_DIV_REMAINDER:
+ case GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
+ return false;
+ default:
+ break;
+ }
+
+ break;
+ }
+ case SEL_OP_CBIT:
+ case SEL_OP_FBH:
+ case SEL_OP_FBL:
+ case SEL_OP_BRC:
+ case SEL_OP_BRD:
+ case SEL_OP_BFREV:
+ case SEL_OP_LZD:
+ case SEL_OP_HADD:
+ case SEL_OP_RHADD:
+ return false;
+ default:
+ break;
+ }
+ }
+
if (insn.isWrite() || insn.isRead()) //register in selection vector
return false;
@@ -249,10 +289,29 @@ namespace gbe
if (insn.opcode == SEL_OP_MOV)
addToReplaceInfoMap(insn);
+
+ doNegAddOptimization(insn);
}
cleanReplaceInfoMap();
}
+ /* LLVM transform Mad(a, -b, c) to
+ Add b, -b, 0
+ Mad val, a, b, c
+ for Gen support negtive modifier, mad(a, -b, c) is native suppoted.
+ Also it can be used for the same like instruction sequence.
+ Do it just like a: mov b, -b, so it is a Mov operation like LocalCopyPropagation
+ */
+ void SelBasicBlockOptimizer::doNegAddOptimization(SelectionInstruction &insn) {
+ if (insn.opcode == SEL_OP_ADD) {
+ GenRegister src0 = insn.src(0);
+ GenRegister src1 = insn.src(1);
+ if ((src0.negation && src1.file == GEN_IMMEDIATE_VALUE && src1.value.f == 0.0f) ||
+ (src1.negation && src0.file == GEN_IMMEDIATE_VALUE && src0.value.f == 0.0f))
+ addToReplaceInfoMap(insn);
+ }
+ }
+
void SelBasicBlockOptimizer::run()
{
for (size_t i = 0; i < MaxTries; ++i) {
diff --git a/backend/src/backend/gen_insn_selection_output.cpp b/backend/src/backend/gen_insn_selection_output.cpp
index f23e8c8..33f0d15 100644
--- a/backend/src/backend/gen_insn_selection_output.cpp
+++ b/backend/src/backend/gen_insn_selection_output.cpp
@@ -142,6 +142,48 @@ namespace gbe
}
}
+ if (insn.opcode == SEL_OP_MATH) {
+ switch (insn.extra.function) {
+ case GEN_MATH_FUNCTION_INV:
+ strcat(opname, ".inv");
+ break;
+ case GEN_MATH_FUNCTION_LOG:
+ strcat(opname, ".log");
+ break;
+ case GEN_MATH_FUNCTION_EXP:
+ strcat(opname, ".exp");
+ break;
+ case GEN_MATH_FUNCTION_SQRT:
+ strcat(opname, ".sqrt");
+ break;
+ case GEN_MATH_FUNCTION_RSQ:
+ strcat(opname, ".rsq");
+ break;
+ case GEN_MATH_FUNCTION_SIN:
+ strcat(opname, ".sin");
+ break;
+ case GEN_MATH_FUNCTION_COS:
+ strcat(opname, ".cos");
+ break;
+ case GEN_MATH_FUNCTION_FDIV:
+ strcat(opname, ".fdiv");
+ break;
+ case GEN_MATH_FUNCTION_POW:
+ strcat(opname, ".pow");
+ break;
+ case GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
+ strcat(opname, ".intdivmod");
+ break;
+ case GEN_MATH_FUNCTION_INT_DIV_QUOTIENT:
+ strcat(opname, ".intdiv");
+ break;
+ case GEN_MATH_FUNCTION_INT_DIV_REMAINDER:
+ strcat(opname, ".intmod");
+ break;
+ }
+ }
+
+
int n = strlen(opname);
if(n >= OP_NAME_LENGTH - 20) {
cout << "opname too long: " << opname << endl;
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 073ede6..e06ed40 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -24,18 +24,18 @@
#ifdef GBE_COMPILER_AVAILABLE
#include "llvm/Config/llvm-config.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/DataLayout.h"
-#else
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/DataLayout.h"
-#endif /* LLVM_VERSION_MINOR <= 2 */
#include "llvm-c/Linker.h"
+#include "llvm-c/BitReader.h"
+#include "llvm-c/BitWriter.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+#include "llvm/Bitcode/BitcodeWriter.h"
+#else
#include "llvm/Bitcode/ReaderWriter.h"
+#endif /* LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40 */
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MemoryBuffer.h"
@@ -61,6 +61,7 @@
#include <clang/CodeGen/CodeGenAction.h>
#endif
+#include "sys/cvar.hpp"
#include <cstring>
#include <sstream>
#include <memory>
@@ -127,30 +128,46 @@ namespace gbe {
void GenProgram::CleanLlvmResource(void){
#ifdef GBE_COMPILER_AVAILABLE
+ llvm::LLVMContext* ctx = NULL;
if(module){
+ ctx = &((llvm::Module*)module)->getContext();
+ (void)ctx;
delete (llvm::Module*)module;
module = NULL;
}
-
+//llvm's version < 3.9, ctx is global ctx, can't be deleted.
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+ //each module's context is individual, just delete it, ignaor llvm_ctx.
+ if (ctx != NULL)
+ delete ctx;
+#else
if(llvm_ctx){
delete (llvm::LLVMContext*)llvm_ctx;
llvm_ctx = NULL;
}
#endif
+#endif
}
/*! We must avoid spilling at all cost with Gen */
- static const struct CodeGenStrategy {
+ struct CodeGenStrategy {
uint32_t simdWidth;
uint32_t reservedSpillRegs;
bool limitRegisterPressure;
- } codeGenStrategy[] = {
+ };
+ static const struct CodeGenStrategy codeGenStrategyDefault[] = {
{16, 0, false},
{8, 0, false},
{8, 8, false},
{8, 16, false},
};
+ static const struct CodeGenStrategy codeGenStrategySimd16[] = {
+ {16, 0, false},
+ {16, 8, false},
+ {16, 16, false},
+ };
+ IVAR(OCL_SIMD_WIDTH, 8, 15, 16);
Kernel *GenProgram::compileKernel(const ir::Unit &unit, const std::string &name,
bool relaxMath, int profiling) {
#ifdef GBE_COMPILER_AVAILABLE
@@ -158,19 +175,23 @@ namespace gbe {
// when the function already provides the simd width we need to use (i.e.
// non zero)
const ir::Function *fn = unit.getFunction(name);
+ const struct CodeGenStrategy* codeGenStrategy = codeGenStrategyDefault;
if(fn == NULL)
GBE_ASSERT(0);
- uint32_t codeGenNum = sizeof(codeGenStrategy) / sizeof(codeGenStrategy[0]);
+ uint32_t codeGenNum = sizeof(codeGenStrategyDefault) / sizeof(codeGenStrategyDefault[0]);
uint32_t codeGen = 0;
GenContext *ctx = NULL;
- if (fn->getSimdWidth() == 8) {
+ if ( fn->getSimdWidth() != 0 && OCL_SIMD_WIDTH != 15) {
+ GBE_ASSERTM(0, "unsupported SIMD width!");
+ }else if (fn->getSimdWidth() == 8 || OCL_SIMD_WIDTH == 8) {
codeGen = 1;
- } else if (fn->getSimdWidth() == 16) {
- codeGenNum = 1;
- } else if (fn->getSimdWidth() == 0) {
+ } else if (fn->getSimdWidth() == 16 || OCL_SIMD_WIDTH == 16){
+ codeGenStrategy = codeGenStrategySimd16;
+ codeGenNum = sizeof(codeGenStrategySimd16) / sizeof(codeGenStrategySimd16[0]);
+ } else if (fn->getSimdWidth() == 0 && OCL_SIMD_WIDTH == 15) {
codeGen = 0;
} else
- GBE_ASSERT(0);
+ GBE_ASSERTM(0, "unsupported SIMD width!");
Kernel *kernel = NULL;
// Stop when compilation is successful
@@ -188,6 +209,8 @@ namespace gbe {
ctx = GBE_NEW(BxtContext, unit, name, deviceID, relaxMath);
} else if (IS_KABYLAKE(deviceID)) {
ctx = GBE_NEW(KblContext, unit, name, deviceID, relaxMath);
+ } else if (IS_GEMINILAKE(deviceID)) {
+ ctx = GBE_NEW(GlkContext, unit, name, deviceID, relaxMath);
}
GBE_ASSERTM(ctx != NULL, "Fail to create the gen context\n");
@@ -243,6 +266,7 @@ namespace gbe {
GBHI_SKL = 5,
GBHI_BXT = 6,
GBHI_KBL = 7,
+ GBHI_GLK = 8,
GBHI_MAX,
};
#define GEN_BINARY_VERSION 1
@@ -254,7 +278,8 @@ namespace gbe {
{GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'B', 'D', 'W'},
{GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'S', 'K', 'L'},
{GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'B', 'X', 'T'},
- {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'K', 'B', 'T'}
+ {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'K', 'B', 'T'},
+ {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'G', 'L', 'K'}
};
#define FILL_GEN_HEADER(binary, index) do {int i = 0; do {*(binary+i) = gen_binary_header[index][i]; i++; }while(i < GEN_BINARY_HEADER_LENGTH);}while(0)
@@ -266,6 +291,7 @@ namespace gbe {
#define FILL_SKL_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_SKL)
#define FILL_BXT_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_BXT)
#define FILL_KBL_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_KBL)
+#define FILL_GLK_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_GLK)
static bool genHeaderCompare(const unsigned char *BufPtr, GEN_BINARY_HEADER_INDEX index)
{
@@ -291,6 +317,7 @@ namespace gbe {
#define MATCH_SKL_HEADER(binary) genHeaderCompare(binary, GBHI_SKL)
#define MATCH_BXT_HEADER(binary) genHeaderCompare(binary, GBHI_BXT)
#define MATCH_KBL_HEADER(binary) genHeaderCompare(binary, GBHI_KBL)
+#define MATCH_GLK_HEADER(binary) genHeaderCompare(binary, GBHI_GLK)
#define MATCH_DEVICE(deviceID, binary) ((IS_IVYBRIDGE(deviceID) && MATCH_IVB_HEADER(binary)) || \
(IS_IVYBRIDGE(deviceID) && MATCH_IVB_HEADER(binary)) || \
@@ -300,7 +327,8 @@ namespace gbe {
(IS_CHERRYVIEW(deviceID) && MATCH_CHV_HEADER(binary)) || \
(IS_SKYLAKE(deviceID) && MATCH_SKL_HEADER(binary)) || \
(IS_BROXTON(deviceID) && MATCH_BXT_HEADER(binary)) || \
- (IS_KABYLAKE(deviceID) && MATCH_KBL_HEADER(binary)) \
+ (IS_KABYLAKE(deviceID) && MATCH_KBL_HEADER(binary)) || \
+ (IS_GEMINILAKE(deviceID) && MATCH_GLK_HEADER(binary)) \
)
static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char *binary, size_t size) {
@@ -335,20 +363,20 @@ namespace gbe {
//the first byte stands for binary_type.
binary_content.assign(binary+1, size-1);
llvm::StringRef llvm_bin_str(binary_content);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
- llvm::LLVMContext& c = GBEGetLLVMContext();
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+ llvm::LLVMContext *c = new llvm::LLVMContext;
#else
- llvm::LLVMContext& c = llvm::getGlobalContext();
+ llvm::LLVMContext *c = &llvm::getGlobalContext();
#endif
llvm::SMDiagnostic Err;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
std::unique_ptr<llvm::MemoryBuffer> memory_buffer = llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
acquireLLVMContextLock();
- llvm::Module* module = llvm::parseIR(memory_buffer->getMemBufferRef(), Err, c).release();
+ llvm::Module* module = llvm::parseIR(memory_buffer->getMemBufferRef(), Err, *c).release();
#else
llvm::MemoryBuffer* memory_buffer = llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
acquireLLVMContextLock();
- llvm::Module* module = llvm::ParseIR(memory_buffer, Err, c);
+ llvm::Module* module = llvm::ParseIR(memory_buffer, Err, *c);
#endif
// if load 32 bit spir binary, the triple should be spir-unknown-unknown.
llvm::Triple triple(module->getTargetTriple());
@@ -408,6 +436,8 @@ namespace gbe {
FILL_BXT_HEADER(*binary);
}else if(IS_KABYLAKE(prog->deviceID)){
FILL_KBL_HEADER(*binary);
+ }else if(IS_GEMINILAKE(prog->deviceID)){
+ FILL_GLK_HEADER(*binary);
}else {
free(*binary);
*binary = NULL;
@@ -436,7 +466,6 @@ namespace gbe {
}
static gbe_program genProgramNewFromLLVM(uint32_t deviceID,
- const char *fileName,
const void* module,
const void* llvm_ctx,
const char* asm_file_name,
@@ -456,7 +485,7 @@ namespace gbe {
#ifdef GBE_COMPILER_AVAILABLE
std::string error;
// Try to compile the program
- if (program->buildFromLLVMFile(fileName, module, error, optLevel) == false) {
+ if (program->buildFromLLVMModule(module, error, optLevel) == false) {
if (err != NULL && errSize != NULL && stringSize > 0u) {
const size_t msgSize = std::min(error.size(), stringSize-1u);
std::memcpy(err, error.c_str(), msgSize);
@@ -486,31 +515,39 @@ namespace gbe {
{
#ifdef GBE_COMPILER_AVAILABLE
using namespace gbe;
- char* errMsg;
+ char* errMsg = NULL;
if(((GenProgram*)dst_program)->module == NULL){
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+ LLVMModuleRef modRef;
+ LLVMParseBitcodeInContext2(wrap(new llvm::LLVMContext()),
+ LLVMWriteBitcodeToMemoryBuffer(wrap((llvm::Module*)((GenProgram*)src_program)->module)),
+ &modRef);
+ ((GenProgram*)dst_program)->module = llvm::unwrap(modRef);
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
((GenProgram*)dst_program)->module = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module).release();
#else
((GenProgram*)dst_program)->module = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module);
#endif
errSize = 0;
- }else{
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
- // Src now will be removed automatically. So clone it.
- llvm::Module* src = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module).release();
-#else
+ } else {
llvm::Module* src = (llvm::Module*)((GenProgram*)src_program)->module;
-#endif
llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)->module;
-
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
- if (LLVMLinkModules2(wrap(dst), wrap(src))) {
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+ if (&src->getContext() != &dst->getContext()) {
+ LLVMModuleRef modRef;
+ LLVMParseBitcodeInContext2(wrap(&dst->getContext()),
+ LLVMWriteBitcodeToMemoryBuffer(wrap(src)),
+ &modRef);
+ src = llvm::unwrap(modRef);
+ }
+ llvm::Module* clone = llvm::CloneModule(src).release();
+ if (LLVMLinkModules2(wrap(dst), wrap(clone))) {
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
if (LLVMLinkModules(wrap(dst), wrap(src), LLVMLinkerPreserveSource_Removed, &errMsg)) {
#else
if (LLVMLinkModules(wrap(dst), wrap(src), LLVMLinkerPreserveSource, &errMsg)) {
#endif
- if (err != NULL && errSize != NULL && stringSize > 0u) {
+ if (err != NULL && errSize != NULL && stringSize > 0u && errMsg) {
strncpy(err, errMsg, stringSize-1);
err[stringSize-1] = '\0';
*errSize = strlen(err);
@@ -518,7 +555,6 @@ namespace gbe {
return true;
}
}
- // Everything run fine
#endif
return false;
}
@@ -579,7 +615,7 @@ namespace gbe {
acquireLLVMContextLock();
llvm::Module* module = (llvm::Module*)p->module;
- if (p->buildFromLLVMFile(NULL, module, error, optLevel) == false) {
+ if (p->buildFromLLVMModule(module, error, optLevel) == false) {
if (err != NULL && errSize != NULL && stringSize > 0u) {
const size_t msgSize = std::min(error.size(), stringSize-1u);
std::memcpy(err, error.c_str(), msgSize);
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 09c79d8..c37c595 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -40,6 +40,7 @@
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IRReader/IRReader.h"
#endif
#include <cstring>
@@ -52,34 +53,23 @@
#include <mutex>
#ifdef GBE_COMPILER_AVAILABLE
-/* Not defined for LLVM 3.0 */
-#if !defined(LLVM_VERSION_MAJOR)
-#define LLVM_VERSION_MAJOR 3
-#endif /* !defined(LLVM_VERSION_MAJOR) */
-
-/* Not defined for LLVM 3.0 */
-#if !defined(LLVM_VERSION_MINOR)
-#define LLVM_VERSION_MINOR 0
-#endif /* !defined(LLVM_VERSION_MINOR) */
#include <clang/CodeGen/CodeGenAction.h>
#include <clang/Frontend/CompilerInstance.h>
#include <clang/Frontend/CompilerInvocation.h>
-#if LLVM_VERSION_MINOR <= 1
-#include <clang/Frontend/DiagnosticOptions.h>
-#else
#include <clang/Basic/DiagnosticOptions.h>
-#endif /* LLVM_VERSION_MINOR <= 1 */
#include <clang/Frontend/TextDiagnosticPrinter.h>
#include <clang/Basic/TargetInfo.h>
#include <clang/Basic/TargetOptions.h>
#include <llvm/ADT/IntrusiveRefCntPtr.h>
-#if LLVM_VERSION_MINOR <= 2
-#include <llvm/Module.h>
-#else
#include <llvm/IR/Module.h>
-#endif /* LLVM_VERSION_MINOR <= 2 */
+
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+#include <llvm/Bitcode/BitcodeWriter.h>
+#include <clang/Lex/PreprocessorOptions.h>
+#else
#include <llvm/Bitcode/ReaderWriter.h>
+#endif
#include <llvm/Support/raw_ostream.h>
#endif
@@ -124,32 +114,17 @@ namespace gbe {
IVAR(OCL_PROFILING_LOG, 0, 0, 1); // Int for different profiling types.
BVAR(OCL_OUTPUT_BUILD_LOG, false);
- bool Program::buildFromLLVMFile(const char *fileName,
- const void* module,
- std::string &error,
- int optLevel) {
+ bool Program::buildFromLLVMModule(const void* module,
+ std::string &error,
+ int optLevel) {
ir::Unit *unit = new ir::Unit();
- llvm::Module * cloned_module = NULL;
bool ret = false;
- if(module){
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
- cloned_module = llvm::CloneModule((llvm::Module*)module).release();
-#else
- cloned_module = llvm::CloneModule((llvm::Module*)module);
-#endif
- }
+
bool strictMath = true;
if (fast_relaxed_math || !OCL_STRICT_CONFORMANCE)
strictMath = false;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
- llvm::Module * linked_module = module ? llvm::CloneModule((llvm::Module*)module).release() : NULL;
- // Src now will be removed automatically. So clone it.
- if (llvmToGen(*unit, fileName, linked_module, optLevel, strictMath, OCL_PROFILING_LOG, error) == false) {
-#else
- if (llvmToGen(*unit, fileName, module, optLevel, strictMath, OCL_PROFILING_LOG, error) == false) {
-#endif
- if (fileName)
- error = std::string(fileName) + " not found";
+
+ if (llvmToGen(*unit, module, optLevel, strictMath, OCL_PROFILING_LOG, error) == false) {
delete unit;
return false;
}
@@ -158,13 +133,8 @@ namespace gbe {
if(!unit->getValid()) {
delete unit; //clear unit
unit = new ir::Unit();
- if(cloned_module){
- //suppose file exists and llvmToGen will not return false.
- llvmToGen(*unit, fileName, cloned_module, 0, strictMath, OCL_PROFILING_LOG, error);
- }else{
- //suppose file exists and llvmToGen will not return false.
- llvmToGen(*unit, fileName, module, 0, strictMath, OCL_PROFILING_LOG, error);
- }
+ //suppose file exists and llvmToGen will not return false.
+ llvmToGen(*unit, module, 0, strictMath, OCL_PROFILING_LOG, error);
}
if(unit->getValid()){
std::string error2;
@@ -174,9 +144,6 @@ namespace gbe {
error = error + error2;
}
delete unit;
- if(cloned_module){
- delete (llvm::Module*) cloned_module;
- }
return ret;
}
@@ -668,7 +635,7 @@ namespace gbe {
// The ParseCommandLineOptions used for mllvm args can not be used with multithread
// and GVN now have a 100 inst limit on block scan. Now only pass a bigger limit
// for each context only once, this can also fix multithread bug.
-#if LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
static bool ifsetllvm = false;
if(!ifsetllvm) {
args.push_back("-mllvm");
@@ -686,10 +653,6 @@ namespace gbe {
args.push_back("-disable-llvm-optzns");
if(bFastMath)
args.push_back("-D __FAST_RELAXED_MATH__=1");
-#if LLVM_VERSION_MINOR <= 2
- args.push_back("-triple");
- args.push_back("nvptx");
-#else
args.push_back("-x");
args.push_back("cl");
args.push_back("-triple");
@@ -698,7 +661,6 @@ namespace gbe {
args.push_back("-fblocks");
} else
args.push_back("spir");
-#endif /* LLVM_VERSION_MINOR <= 2 */
args.push_back("stringInput.cl");
args.push_back("-ffp-contract=on");
if(OCL_DEBUGINFO) args.push_back("-g");
@@ -716,24 +678,33 @@ namespace gbe {
llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> DiagID(new clang::DiagnosticIDs());
clang::DiagnosticsEngine Diags(DiagID, &*DiagOpts, DiagClient);
+ llvm::StringRef srcString(source);
// Create the compiler invocation
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ auto CI = std::make_shared<clang::CompilerInvocation>();
+ CI->getPreprocessorOpts().addRemappedFile("stringInput.cl",
+#else
std::unique_ptr<clang::CompilerInvocation> CI(new clang::CompilerInvocation);
- clang::CompilerInvocation::CreateFromArgs(*CI,
- &args[0],
- &args[0] + args.size(),
- Diags);
- llvm::StringRef srcString(source);
(*CI).getPreprocessorOpts().addRemappedFile("stringInput.cl",
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#endif
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
llvm::MemoryBuffer::getMemBuffer(srcString)
#else
llvm::MemoryBuffer::getMemBuffer(srcString).release()
#endif
);
+ clang::CompilerInvocation::CreateFromArgs(*CI,
+ &args[0],
+ &args[0] + args.size(),
+ Diags);
// Create the compiler instance
clang::CompilerInstance Clang;
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ Clang.setInvocation(std::move(CI));
+#else
Clang.setInvocation(CI.release());
+#endif
// Get ready to report problems
Clang.createDiagnostics(DiagClient, false);
@@ -777,7 +748,7 @@ namespace gbe {
if (!retVal)
return false;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
llvm::Module *module = Act->takeModule();
#else
llvm::Module *module = Act->takeModule().release();
@@ -786,16 +757,12 @@ namespace gbe {
*out_module = module;
// Dump the LLVM if requested.
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 6)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 36
if (!dumpLLVMFileName.empty()) {
std::string err;
llvm::raw_fd_ostream ostream (dumpLLVMFileName.c_str(),
err,
- #if LLVM_VERSION_MINOR == 3
- 0
- #else
llvm::sys::fs::F_None
- #endif
);
if (err.empty()) {
@@ -807,11 +774,7 @@ namespace gbe {
std::string err;
llvm::raw_fd_ostream ostream (dumpSPIRBinaryName.c_str(),
err,
- #if LLVM_VERSION_MINOR == 3
- 0
- #else
llvm::sys::fs::F_None
- #endif
);
if (err.empty())
llvm::WriteBitcodeToFile(*out_module, ostream);
@@ -1109,13 +1072,15 @@ EXTEND_QUOTE:
fclose(asmDumpStream);
}
- p = gbe_program_new_from_llvm(deviceID, NULL, out_module, llvm_ctx,
+ p = gbe_program_new_from_llvm(deviceID, out_module, llvm_ctx,
dumpASMFileName.empty() ? NULL : dumpASMFileName.c_str(),
stringSize, err, errSize, optLevel, options);
if (err != NULL)
*errSize += clangErrSize;
if (OCL_OUTPUT_BUILD_LOG && options)
- llvm::errs() << options;
+ llvm::errs() << "options:" << options << "\n";
+ if (OCL_OUTPUT_BUILD_LOG && err && *errSize)
+ llvm::errs() << err << "\n";
} else
p = NULL;
@@ -1128,6 +1093,45 @@ EXTEND_QUOTE:
#ifdef GBE_COMPILER_AVAILABLE
+ static gbe_program programNewFromLLVMFile(uint32_t deviceID,
+ const char *fileName,
+ size_t string_size,
+ char *err,
+ size_t *err_size)
+ {
+ gbe_program p = NULL;
+ if (fileName == NULL)
+ return NULL;
+
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+ llvm::LLVMContext *c = new llvm::LLVMContext;
+#else
+ llvm::LLVMContext *c = &llvm::getGlobalContext();
+#endif
+ // Get the module from its file
+ llvm::SMDiagnostic errDiag;
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
+ llvm::Module *module = parseIRFile(fileName, errDiag, *c).release();
+#else
+ llvm::Module *module = ParseIRFile(fileName, errDiag, *c);
+#endif
+
+ int optLevel = 1;
+
+ //module will be delete in programCleanLlvmResource
+ p = gbe_program_new_from_llvm(deviceID, module, c, NULL,
+ string_size, err, err_size, optLevel, NULL);
+ if (OCL_OUTPUT_BUILD_LOG && err && *err_size)
+ llvm::errs() << err << "\n";
+
+ return p;
+ }
+#endif
+
+
+
+#ifdef GBE_COMPILER_AVAILABLE
+
static gbe_program programCompileFromSource(uint32_t deviceID,
const char *source,
const char *temp_header_path,
@@ -1148,11 +1152,10 @@ EXTEND_QUOTE:
gbe_program p;
acquireLLVMContextLock();
- //FIXME: if use new allocated context to link two modules there would be context mismatch
- //for some functions, so we use global context now, need switch to new context later.
+
llvm::Module * out_module;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
- llvm::LLVMContext* llvm_ctx = &GBEGetLLVMContext();
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+ llvm::LLVMContext* llvm_ctx = new llvm::LLVMContext;
#else
llvm::LLVMContext* llvm_ctx = &llvm::getGlobalContext();
#endif
@@ -1169,7 +1172,9 @@ EXTEND_QUOTE:
p = gbe_program_new_gen_program(deviceID, out_module, NULL, NULL);
if (OCL_OUTPUT_BUILD_LOG && options)
- llvm::errs() << options;
+ llvm::errs() << "options:" << options << "\n";
+ if (OCL_OUTPUT_BUILD_LOG && err && *errSize)
+ llvm::errs() << err << "\n";
} else
p = NULL;
releaseLLVMContextLock();
@@ -1513,6 +1518,7 @@ void releaseLLVMContextLock()
}
GBE_EXPORT_SYMBOL gbe_program_new_from_source_cb *gbe_program_new_from_source = NULL;
+GBE_EXPORT_SYMBOL gbe_program_new_from_llvm_file_cb *gbe_program_new_from_llvm_file = NULL;
GBE_EXPORT_SYMBOL gbe_program_compile_from_source_cb *gbe_program_compile_from_source = NULL;
GBE_EXPORT_SYMBOL gbe_program_link_program_cb *gbe_program_link_program = NULL;
GBE_EXPORT_SYMBOL gbe_program_check_opt_cb *gbe_program_check_opt = NULL;
@@ -1575,6 +1581,7 @@ namespace gbe
{
CallBackInitializer(void) {
gbe_program_new_from_source = gbe::programNewFromSource;
+ gbe_program_new_from_llvm_file = gbe::programNewFromLLVMFile;
gbe_program_compile_from_source = gbe::programCompileFromSource;
gbe_program_link_program = gbe::programLinkProgram;
gbe_program_check_opt = gbe::programCheckOption;
@@ -1625,7 +1632,7 @@ namespace gbe
}
~CallBackInitializer() {
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR > 3)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 34
llvm::llvm_shutdown();
#endif
}
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index e601c97..2017845 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -180,6 +180,15 @@ extern gbe_dup_printfset_cb *gbe_dup_printfset;
typedef void (gbe_output_printf_cb) (void* printf_info, void* buf_addr);
extern gbe_output_printf_cb* gbe_output_printf;
+
+/*! Create a new program from the llvm file (zero terminated string) */
+typedef gbe_program (gbe_program_new_from_llvm_file_cb)(uint32_t deviceID,
+ const char *fileName,
+ size_t stringSize,
+ char *err,
+ size_t *err_size);
+extern gbe_program_new_from_llvm_file_cb *gbe_program_new_from_llvm_file;
+
/*! Create a new program from the given source code (zero terminated string) */
typedef gbe_program (gbe_program_new_from_source_cb)(uint32_t deviceID,
const char *source,
@@ -231,7 +240,6 @@ extern gbe_program_serialize_to_binary_cb *gbe_program_serialize_to_binary;
/*! Create a new program from the given LLVM file */
typedef gbe_program (gbe_program_new_from_llvm_cb)(uint32_t deviceID,
- const char *fileName,
const void *module,
const void *llvm_ctx,
const char *asm_file_name,
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 1aff8b9..4a68e33 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -305,8 +305,8 @@ namespace gbe {
}
/*! Build a program from a ir::Unit */
bool buildFromUnit(const ir::Unit &unit, std::string &error);
- /*! Buils a program from a LLVM source code */
- bool buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel);
+ /*! Buils a program from a LLVM Module */
+ bool buildFromLLVMModule(const void* module, std::string &error, int optLevel);
/*! Buils a program from a OCL string */
bool buildFromSource(const char *source, std::string &error);
/*! Get size of the global constant arrays */
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 5fcb14a..64d9727 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -186,7 +186,7 @@ namespace ir {
// only llvm-3.6 or later has kernel_arg_base_type in metadata.
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR <= 5)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
bool isImage1dT() const {
return typeName.compare("image1d_t") == 0;
}
diff --git a/backend/src/ir/half.cpp b/backend/src/ir/half.cpp
index 1c0d7eb..0abc6cb 100644
--- a/backend/src/ir/half.cpp
+++ b/backend/src/ir/half.cpp
@@ -29,7 +29,11 @@ namespace ir {
{
uint64_t v64 = static_cast<uint64_t>(v);
llvm::APInt apInt(16, v64, false);
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ return llvm::APFloat(llvm::APFloat::IEEEhalf(), apInt);
+#else
return llvm::APFloat(llvm::APFloat::IEEEhalf, apInt);
+#endif
}
static uint16_t convAPFloatToU16(const llvm::APFloat& apf)
@@ -42,14 +46,22 @@ namespace ir {
half::operator float(void) const {
bool loseInfo;
llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ apf_self.convert(llvm::APFloat::IEEEsingle(), llvm::APFloat::rmNearestTiesToEven, &loseInfo);
+#else
apf_self.convert(llvm::APFloat::IEEEsingle, llvm::APFloat::rmNearestTiesToEven, &loseInfo);
+#endif
return apf_self.convertToFloat();
}
half::operator double(void) const {
bool loseInfo;
llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ apf_self.convert(llvm::APFloat::IEEEdouble(), llvm::APFloat::rmNearestTiesToEven, &loseInfo);
+#else
apf_self.convert(llvm::APFloat::IEEEdouble, llvm::APFloat::rmNearestTiesToEven, &loseInfo);
+#endif
return apf_self.convertToDouble();
}
@@ -70,7 +82,11 @@ namespace ir {
}
half half::convToHalf(uint16_t u16) {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ llvm::APFloat res(llvm::APFloat::IEEEhalf(), llvm::APInt(16, 0, false));
+#else
llvm::APFloat res(llvm::APFloat::IEEEhalf, llvm::APInt(16, 0, false));
+#endif
uint64_t u64 = static_cast<uint64_t>(u16);
llvm::APInt apInt(16, u64, false);
res.convertFromAPInt(apInt, false, llvm::APFloat::rmNearestTiesToEven);
@@ -78,7 +94,11 @@ namespace ir {
}
half half::convToHalf(int16_t v16) {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ llvm::APFloat res(llvm::APFloat::IEEEhalf(), llvm::APInt(16, 0, true));
+#else
llvm::APFloat res(llvm::APFloat::IEEEhalf, llvm::APInt(16, 0, true));
+#endif
uint64_t u64 = static_cast<uint64_t>(v16);
llvm::APInt apInt(16, u64, true);
res.convertFromAPInt(apInt, true, llvm::APFloat::rmNearestTiesToEven);
diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index 93bd96a..bcf5940 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -199,6 +199,7 @@ namespace ir {
GBE_SAFE_DELETE(liveness);
this->liveness = GBE_NEW(ir::Liveness, *fn);
this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
+ bool needRefreshDag = false;
// Process all structure arguments and find all the direct loads we can
// replace
@@ -207,13 +208,27 @@ namespace ir {
for (uint32_t argID = 0; argID < argNum; ++argID) {
FunctionArgument &arg = fn->getArg(argID);
if (arg.type != FunctionArgument::STRUCTURE) continue;
- if(this->lower(argID) == ARG_INDIRECT_READ)
+ if(this->lower(argID) == ARG_INDIRECT_READ) {
indirctReadArgs.push_back(argID);
+ //when the return value is ARG_INDIRECT_READ, there is still possible
+ //that some IRs read it directly, and will be handled in buildConstantPush()
+ //so we need to refresh the dag afer function buildConstantPush
+ for (const auto &loadAddImm : seq) {
+ if (loadAddImm.argID == argID)
+ needRefreshDag = true;
+ }
+ }
}
// Build the constant push description and remove the instruction that
// therefore become useless
this->buildConstantPush();
+ if (needRefreshDag) {
+ GBE_SAFE_DELETE(dag);
+ GBE_SAFE_DELETE(liveness);
+ this->liveness = GBE_NEW(ir::Liveness, *fn);
+ this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
+ }
for (uint32_t i = 0; i < indirctReadArgs.size(); ++i){
lowerIndirectRead(indirctReadArgs[i]);
}
diff --git a/backend/src/ir/profiling.cpp b/backend/src/ir/profiling.cpp
index ac61e9b..3289e76 100644
--- a/backend/src/ir/profiling.cpp
+++ b/backend/src/ir/profiling.cpp
@@ -24,6 +24,7 @@
#include <stdlib.h>
#include "ir/profiling.hpp"
#include "src/cl_device_data.h"
+#include <inttypes.h>
namespace gbe
{
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index 46d7be7..d7a2a67 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -32,8 +32,6 @@
#include "sys/map.hpp"
#include <string.h>
-#include "llvm/IR/Instructions.h"
-
namespace gbe {
namespace ir {
@@ -46,7 +44,7 @@ namespace ir {
public:
typedef map<std::string, Function*> FunctionSet;
/*! Moved from printf pass */
- map<llvm::CallInst*, PrintfSet::PrintfFmt*> printfs;
+ map<void *, PrintfSet::PrintfFmt*> printfs;
vector<std::string> blockFuncs;
/*! Create an empty unit */
Unit(PointerSize pointerSize = POINTER_32_BITS);
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index c68ecb0..2917e6d 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -211,7 +211,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
)
ENDMACRO(ADD_LL_TO_BC_TARGET)
-SET (OCL_LL_MODULES_12 ocl_barrier ocl_clz ocl_ctz)
+SET (OCL_LL_MODULES_12 ocl_barrier ocl_clz ocl_ctz ocl_sampler)
FOREACH(f ${OCL_LL_MODULES_12})
COPY_THE_LL(${f})
ADD_LL_TO_BC_TARGET(${f})
@@ -255,7 +255,7 @@ if (ENABLE_OPENCL_20)
ADD_CL_TO_BC_TARGET(${f} ${bc_name} "${CLANG_OCL_FLAGS_20}")
ENDFOREACH(f)
- SET (OCL_LL_MODULES_20 ocl_barrier_20 ocl_clz_20 ocl_ctz_20 ocl_atomic_20)
+ SET (OCL_LL_MODULES_20 ocl_barrier_20 ocl_clz_20 ocl_ctz_20 ocl_atomic_20 ocl_sampler_20)
FOREACH(f ${OCL_LL_MODULES_20})
COPY_THE_LL(${f})
ADD_LL_TO_BC_TARGET(${f})
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index 2548cb7..5408048 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -124,6 +124,10 @@
#define cl_intel_subgroups
#define cl_intel_subgroups_short
+#if __clang_major__*10 + __clang_minor__ > 40
+#define cl_intel_required_subgroup_size
+#endif
+
#pragma OPENCL EXTENSION cl_khr_fp64 : disable
#pragma OPENCL EXTENSION cl_khr_fp16 : disable
#endif
diff --git a/backend/src/libocl/include/ocl_enqueue.h b/backend/src/libocl/include/ocl_enqueue.h
index 6479df7..8e9fd32 100644
--- a/backend/src/libocl/include/ocl_enqueue.h
+++ b/backend/src/libocl/include/ocl_enqueue.h
@@ -38,19 +38,35 @@ struct Block_literal {
void *isa; // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
int flags;
int reserved;
- __global void (*invoke)(void *, ...);
+ __global void* invoke;
struct Block_descriptor_1 {
unsigned long int reserved; // NULL
unsigned long int size; // sizeof(struct Block_literal_1)
// optional helper functions
- void (*copy_helper)(void *dst, void *src); // IFF (1<<25)
- void (*dispose_helper)(void *src); // IFF (1<<25)
+ void *copy_helper; // IFF (1<<25)
+ void *dispose_helper; // IFF (1<<25)
// required ABI.2010.3.16
const char *signature; // IFF (1<<30)
} *descriptor;
// imported variables
};
+#if __clang_major__*10 + __clang_minor__ >= 50
+typedef struct ndrange_info_t ndrange_t;
+#endif
+
+#if __clang_major__*10 + __clang_minor__ >= 50
+#define BLOCK_TYPE void*
+#else
+#define BLOCK_TYPE __private void*
+#endif
+
+#if __clang_major__*10 + __clang_minor__ >= 40
+#define EVENT_TYPE clk_event_t*
+#else
+#define EVENT_TYPE __private clk_event_t*
+#endif
+
clk_event_t create_user_event(void);
void retain_event(clk_event_t event);
void release_event(clk_event_t event);
@@ -58,21 +74,16 @@ void set_user_event_status(clk_event_t event, int status);
bool is_valid_event(clk_event_t event);
void capture_event_profiling_info(clk_event_t event, int name, global void *value);
-uint __get_kernel_work_group_size_impl(__private void *block);
-uint __get_kernel_preferred_work_group_multiple_impl(__private void *block);
+uint __get_kernel_work_group_size_impl(BLOCK_TYPE block);
+uint __get_kernel_preferred_work_group_multiple_impl(BLOCK_TYPE block);
-OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void));
-OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
- uint num_events_in_wait_list, const clk_event_t *event_wait_list,
- clk_event_t *event_ret, void (^block)(void));
-OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, __private void *block, uint size0, ...);
-OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
- uint num_events_in_wait_list, const clk_event_t *event_wait_list,
- clk_event_t *event_ret, __private void *block, uint size0, ...);
+int __enqueue_kernel_basic(queue_t q, int flag, ndrange_t ndrange, BLOCK_TYPE block);
+int __enqueue_kernel_basic_events(queue_t q, int flag, ndrange_t ndrange,
+ uint num_events_in_wait_list, const EVENT_TYPE event_wait_list,
+ EVENT_TYPE event_ret, BLOCK_TYPE block);
queue_t get_default_queue(void);
-int __gen_enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void), int size);
-int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private void * block, int count, __private int* slm_sizes);
+int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, BLOCK_TYPE block, int count, __private int* slm_sizes);
OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size);
OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_size);
diff --git a/backend/src/libocl/include/ocl_image.h b/backend/src/libocl/include/ocl_image.h
index 5a679aa..f816e1a 100644
--- a/backend/src/libocl/include/ocl_image.h
+++ b/backend/src/libocl/include/ocl_image.h
@@ -23,6 +23,8 @@
#define int1 int
#define float1 float
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+
#define DECL_IMAGE_READ_SAMPLE_RETTYPE(IMG_TYPE, DATA_YPE, SUFFIX, N) \
OVERLOADABLE DATA_YPE read_image ## SUFFIX(IMG_TYPE cl_image, const sampler_t sampler, int##N coord); \
OVERLOADABLE DATA_YPE read_image ## SUFFIX(IMG_TYPE cl_image, const sampler_t sampler, float##N coord);
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index 2c0d700..a15cfe4 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -48,6 +48,7 @@ DEF(char)
DEF(uchar)
DEF(short)
DEF(ushort)
+DEF(half)
DEF(int)
DEF(uint)
DEF(float)
@@ -112,6 +113,7 @@ DEF(char)
DEF(uchar)
DEF(short)
DEF(ushort)
+DEF(half)
DEF(int)
DEF(uint)
DEF(float)
diff --git a/backend/src/libocl/src/ocl_enqueue.cl b/backend/src/libocl/src/ocl_enqueue.cl
index dc8fa3b..1ae43aa 100644
--- a/backend/src/libocl/src/ocl_enqueue.cl
+++ b/backend/src/libocl/src/ocl_enqueue.cl
@@ -30,7 +30,7 @@ ndrange_t __gen_ocl_set_ndrange_info(__private struct ndrange_info_t *info);
__private struct ndrange_info_t* __gen_ocl_get_ndrange_info(ndrange_t info);
__global int* __gen_ocl_get_enqueue_info_addr(void);
-OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void))
+int __enqueue_kernel_basic(queue_t q, int flag, ndrange_t ndrange, BLOCK_TYPE block)
{
int i;
__private struct Block_literal *literal = (__private struct Block_literal *)block;
@@ -40,8 +40,11 @@ OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^b
__global int* start_addr = __gen_ocl_get_enqueue_info_addr();
int offset = atomic_add(start_addr, size + sizeof(struct ndrange_info_t));
__global uchar* addr = (__global uchar*)start_addr + offset + sizeof(int);
+#if __clang_major__*10 + __clang_minor__ >= 50
+ __private struct ndrange_info_t *info = to_private(&ndrange);
+#else
__private struct ndrange_info_t *info = __gen_ocl_get_ndrange_info(ndrange);
-
+#endif
*((__global struct ndrange_info_t *)addr) = *info;
addr += sizeof(*info);
@@ -51,14 +54,14 @@ OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^b
return 0;
}
-OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
- uint num_events_in_wait_list, const clk_event_t *event_wait_list,
- clk_event_t *event_ret, void (^block)(void))
+int __enqueue_kernel_basic_events(queue_t q, int flag, ndrange_t ndrange,
+ uint num_events_in_wait_list, const EVENT_TYPE event_wait_list,
+ EVENT_TYPE event_ret, BLOCK_TYPE block)
{
- return enqueue_kernel(q, flag, ndrange, block);
+ return __enqueue_kernel_basic(q, flag, ndrange, block);
}
-int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private void * block, int count, __private int* slm_sizes)
+int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, BLOCK_TYPE block, int count, __private int* slm_sizes)
{
int i;
__private struct Block_literal* literal = (__private struct Block_literal *)block;
@@ -69,7 +72,11 @@ int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private v
__global int* start_addr = __gen_ocl_get_enqueue_info_addr();
int offset = atomic_add(start_addr, size + sizeof(struct ndrange_info_t) + slm_size);
__global uchar* addr = (__global uchar*)start_addr + offset + sizeof(int);
+#if __clang_major__*10 + __clang_minor__ >= 50
+ __private struct ndrange_info_t *info = to_private(&ndrange);
+#else
__private struct ndrange_info_t *info = __gen_ocl_get_ndrange_info(ndrange);
+#endif
*((__global struct ndrange_info_t *)addr) = *info;
addr += sizeof(*info);
@@ -111,12 +118,12 @@ bool is_valid_event(clk_event_t event)
return 1;
}
-uint __get_kernel_work_group_size_impl(__private void *block)
+uint __get_kernel_work_group_size_impl(BLOCK_TYPE block)
{
return 256;
}
-uint __get_kernel_preferred_work_group_multiple_impl(__private void *block)
+uint __get_kernel_preferred_work_group_multiple_impl(BLOCK_TYPE block)
{
return 16;
}
@@ -127,13 +134,19 @@ void capture_event_profiling_info(clk_event_t event, int name, global void *valu
((__global ulong *)value)[0] = 0x3000;
((__global ulong *)value)[1] = 0x6000;
}
+
+#if __clang_major__*10 + __clang_minor__ >= 50
+#define RET_INFO return info;
+#else
+#define RET_INFO return __gen_ocl_set_ndrange_info(&info);
+#endif
+
OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size)
{
struct ndrange_info_t info;
info.type = 0x1;
info.global_work_size[0] = global_work_size;
- return __gen_ocl_set_ndrange_info(&info);
- //return ndrange;
+ RET_INFO;
}
OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_size)
@@ -142,8 +155,7 @@ OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_siz
info.type = 0x2;
info.global_work_size[0] = global_work_size;
info.local_work_size[0] = local_work_size;
- return __gen_ocl_set_ndrange_info(&info);
- // return ndrange;
+ RET_INFO;
}
@@ -154,8 +166,7 @@ OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_offset, size_t global_work_
info.global_work_size[0] = global_work_size;
info.local_work_size[0] = local_work_size;
info.global_work_offset[0] = global_work_offset;
- return __gen_ocl_set_ndrange_info(&info);
- //return ndrange;
+ RET_INFO;
}
OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2])
@@ -164,8 +175,7 @@ OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2])
info.type = 0x11;
info.global_work_size[0] = global_work_size[0];
info.global_work_size[1] = global_work_size[1];
- return __gen_ocl_set_ndrange_info(&info);
- //return ndrange;
+ RET_INFO;
}
OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2], const size_t local_work_size[2])
@@ -176,7 +186,7 @@ OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2], const size_t
info.global_work_size[1] = global_work_size[1];
info.local_work_size[0] = local_work_size[0];
info.local_work_size[1] = local_work_size[1];
- return __gen_ocl_set_ndrange_info(&info);
+ RET_INFO;
}
@@ -190,7 +200,7 @@ OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_offset[2], const size
info.local_work_size[1] = local_work_size[1];
info.global_work_offset[0] = global_work_offset[0];
info.global_work_offset[1] = global_work_offset[1];
- return __gen_ocl_set_ndrange_info(&info);
+ RET_INFO;
}
OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3])
@@ -200,7 +210,7 @@ OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3])
info.global_work_size[0] = global_work_size[0];
info.global_work_size[1] = global_work_size[1];
info.global_work_size[2] = global_work_size[2];
- return __gen_ocl_set_ndrange_info(&info);
+ RET_INFO;
}
OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3], const size_t local_work_size[3])
@@ -213,7 +223,7 @@ OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3], const size_t
info.local_work_size[0] = local_work_size[0];
info.local_work_size[1] = local_work_size[1];
info.local_work_size[2] = local_work_size[2];
- return __gen_ocl_set_ndrange_info(&info);
+ RET_INFO;
}
OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_offset[3], const size_t global_work_size[3], const size_t local_work_size[3])
@@ -229,7 +239,7 @@ OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_offset[3], const size
info.global_work_offset[0] = global_work_offset[0];
info.global_work_offset[1] = global_work_offset[1];
info.global_work_offset[2] = global_work_offset[2];
- return __gen_ocl_set_ndrange_info(&info);
+ RET_INFO;
}
int enqueue_marker (queue_t queue, uint num_events_in_wait_list, const clk_event_t *event_wait_list, clk_event_t *event_ret)
diff --git a/backend/src/libocl/src/ocl_memcpy.cl b/backend/src/libocl/src/ocl_memcpy.cl
index 131574d..8c0409a 100644
--- a/backend/src/libocl/src/ocl_memcpy.cl
+++ b/backend/src/libocl/src/ocl_memcpy.cl
@@ -16,12 +16,13 @@
*
*/
#include "ocl_memcpy.h"
+typedef int __attribute__((may_alias)) AI;
#define DECL_TWO_SPACE_MEMCOPY_FN(NAME, DST_SPACE, SRC_SPACE) \
void __gen_memcpy_ ##NAME## _align (DST_SPACE uchar* dst, SRC_SPACE uchar* src, size_t size) { \
size_t index = 0; \
while((index + 4) <= size) { \
- *((DST_SPACE uint *)(dst + index)) = *((SRC_SPACE uint *)(src + index)); \
+ *((DST_SPACE AI *)(dst + index)) = *((SRC_SPACE AI *)(src + index)); \
index += 4; \
} \
while(index < size) { \
diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl
index 3b2eb92..9a99df9 100644
--- a/backend/src/libocl/src/ocl_misc.cl
+++ b/backend/src/libocl/src/ocl_misc.cl
@@ -87,6 +87,7 @@ DEF(char)
DEF(uchar)
DEF(short)
DEF(ushort)
+DEF(half)
DEF(int)
DEF(uint)
DEF(float)
@@ -107,8 +108,8 @@ DEF(ulong)
#define DEC2X(TYPE, MASKTYPE) \
OVERLOADABLE TYPE##2 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##2 mask) { \
TYPE##2 z; \
- z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
- z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+ z.s0 = (mask.s0 & 31) < 16 ? ((TYPE *)&x)[mask.s0 & 31] : ((TYPE *)&y)[mask.s0 & 15]; \
+ z.s1 = (mask.s1 & 31) < 16 ? ((TYPE *)&x)[mask.s1 & 31] : ((TYPE *)&y)[mask.s1 & 15]; \
return z; \
}
@@ -120,10 +121,10 @@ DEF(ulong)
#define DEC4X(TYPE, MASKTYPE) \
OVERLOADABLE TYPE##4 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##4 mask) { \
TYPE##4 z; \
- z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
- z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
- z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
- z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+ z.s0 = (mask.s0 & 31) < 16 ? ((TYPE *)&x)[mask.s0 & 31] : ((TYPE *)&y)[mask.s0 & 15]; \
+ z.s1 = (mask.s1 & 31) < 16 ? ((TYPE *)&x)[mask.s1 & 31] : ((TYPE *)&y)[mask.s1 & 15]; \
+ z.s2 = (mask.s2 & 31) < 16 ? ((TYPE *)&x)[mask.s2 & 31] : ((TYPE *)&y)[mask.s2 & 15]; \
+ z.s3 = (mask.s3 & 31) < 16 ? ((TYPE *)&x)[mask.s3 & 31] : ((TYPE *)&y)[mask.s3 & 15]; \
return z; \
}
@@ -135,14 +136,14 @@ DEF(ulong)
#define DEC8X(TYPE, MASKTYPE) \
OVERLOADABLE TYPE##8 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##8 mask) { \
TYPE##8 z; \
- z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
- z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
- z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
- z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
- z.s4 = mask.s4 < 16 ? ((TYPE *)&x)[mask.s4] : ((TYPE *)&y)[mask.s4 & 15]; \
- z.s5 = mask.s5 < 16 ? ((TYPE *)&x)[mask.s5] : ((TYPE *)&y)[mask.s5 & 15]; \
- z.s6 = mask.s6 < 16 ? ((TYPE *)&x)[mask.s6] : ((TYPE *)&y)[mask.s6 & 15]; \
- z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
+ z.s0 = (mask.s0 & 31) < 16 ? ((TYPE *)&x)[mask.s0 & 31] : ((TYPE *)&y)[mask.s0 & 15]; \
+ z.s1 = (mask.s1 & 31) < 16 ? ((TYPE *)&x)[mask.s1 & 31] : ((TYPE *)&y)[mask.s1 & 15]; \
+ z.s2 = (mask.s2 & 31) < 16 ? ((TYPE *)&x)[mask.s2 & 31] : ((TYPE *)&y)[mask.s2 & 15]; \
+ z.s3 = (mask.s3 & 31) < 16 ? ((TYPE *)&x)[mask.s3 & 31] : ((TYPE *)&y)[mask.s3 & 15]; \
+ z.s4 = (mask.s4 & 31) < 16 ? ((TYPE *)&x)[mask.s4 & 31] : ((TYPE *)&y)[mask.s4 & 15]; \
+ z.s5 = (mask.s5 & 31) < 16 ? ((TYPE *)&x)[mask.s5 & 31] : ((TYPE *)&y)[mask.s5 & 15]; \
+ z.s6 = (mask.s6 & 31) < 16 ? ((TYPE *)&x)[mask.s6 & 31] : ((TYPE *)&y)[mask.s6 & 15]; \
+ z.s7 = (mask.s7 & 31) < 16 ? ((TYPE *)&x)[mask.s7 & 31] : ((TYPE *)&y)[mask.s7 & 15]; \
return z; \
}
@@ -154,22 +155,22 @@ DEF(ulong)
#define DEC16X(TYPE, MASKTYPE) \
OVERLOADABLE TYPE##16 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##16 mask) { \
TYPE##16 z; \
- z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
- z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
- z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
- z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
- z.s4 = mask.s4 < 16 ? ((TYPE *)&x)[mask.s4] : ((TYPE *)&y)[mask.s4 & 15]; \
- z.s5 = mask.s5 < 16 ? ((TYPE *)&x)[mask.s5] : ((TYPE *)&y)[mask.s5 & 15]; \
- z.s6 = mask.s6 < 16 ? ((TYPE *)&x)[mask.s6] : ((TYPE *)&y)[mask.s6 & 15]; \
- z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
- z.s8 = mask.s8 < 16 ? ((TYPE *)&x)[mask.s8] : ((TYPE *)&y)[mask.s8 & 15]; \
- z.s9 = mask.s9 < 16 ? ((TYPE *)&x)[mask.s9] : ((TYPE *)&y)[mask.s9 & 15]; \
- z.sA = mask.sA < 16 ? ((TYPE *)&x)[mask.sA] : ((TYPE *)&y)[mask.sA & 15]; \
- z.sB = mask.sB < 16 ? ((TYPE *)&x)[mask.sB] : ((TYPE *)&y)[mask.sB & 15]; \
- z.sC = mask.sC < 16 ? ((TYPE *)&x)[mask.sC] : ((TYPE *)&y)[mask.sC & 15]; \
- z.sD = mask.sD < 16 ? ((TYPE *)&x)[mask.sD] : ((TYPE *)&y)[mask.sD & 15]; \
- z.sE = mask.sE < 16 ? ((TYPE *)&x)[mask.sE] : ((TYPE *)&y)[mask.sE & 15]; \
- z.sF = mask.sF < 16 ? ((TYPE *)&x)[mask.sF] : ((TYPE *)&y)[mask.sF & 15]; \
+ z.s0 = (mask.s0 & 31) < 16 ? ((TYPE *)&x)[mask.s0 & 31] : ((TYPE *)&y)[mask.s0 & 15]; \
+ z.s1 = (mask.s1 & 31) < 16 ? ((TYPE *)&x)[mask.s1 & 31] : ((TYPE *)&y)[mask.s1 & 15]; \
+ z.s2 = (mask.s2 & 31) < 16 ? ((TYPE *)&x)[mask.s2 & 31] : ((TYPE *)&y)[mask.s2 & 15]; \
+ z.s3 = (mask.s3 & 31) < 16 ? ((TYPE *)&x)[mask.s3 & 31] : ((TYPE *)&y)[mask.s3 & 15]; \
+ z.s4 = (mask.s4 & 31) < 16 ? ((TYPE *)&x)[mask.s4 & 31] : ((TYPE *)&y)[mask.s4 & 15]; \
+ z.s5 = (mask.s5 & 31) < 16 ? ((TYPE *)&x)[mask.s5 & 31] : ((TYPE *)&y)[mask.s5 & 15]; \
+ z.s6 = (mask.s6 & 31) < 16 ? ((TYPE *)&x)[mask.s6 & 31] : ((TYPE *)&y)[mask.s6 & 15]; \
+ z.s7 = (mask.s7 & 31) < 16 ? ((TYPE *)&x)[mask.s7 & 31] : ((TYPE *)&y)[mask.s7 & 15]; \
+ z.s8 = (mask.s8 & 31) < 16 ? ((TYPE *)&x)[mask.s8 & 31] : ((TYPE *)&y)[mask.s8 & 15]; \
+ z.s9 = (mask.s9 & 31) < 16 ? ((TYPE *)&x)[mask.s9 & 31] : ((TYPE *)&y)[mask.s9 & 15]; \
+ z.sA = (mask.sA & 31) < 16 ? ((TYPE *)&x)[mask.sA & 31] : ((TYPE *)&y)[mask.sA & 15]; \
+ z.sB = (mask.sB & 31) < 16 ? ((TYPE *)&x)[mask.sB & 31] : ((TYPE *)&y)[mask.sB & 15]; \
+ z.sC = (mask.sC & 31) < 16 ? ((TYPE *)&x)[mask.sC & 31] : ((TYPE *)&y)[mask.sC & 15]; \
+ z.sD = (mask.sD & 31) < 16 ? ((TYPE *)&x)[mask.sD & 31] : ((TYPE *)&y)[mask.sD & 15]; \
+ z.sE = (mask.sE & 31) < 16 ? ((TYPE *)&x)[mask.sE & 31] : ((TYPE *)&y)[mask.sE & 15]; \
+ z.sF = (mask.sF & 31) < 16 ? ((TYPE *)&x)[mask.sF & 31] : ((TYPE *)&y)[mask.sF & 15]; \
return z; \
}
@@ -201,6 +202,7 @@ DEF(char)
DEF(uchar)
DEF(short)
DEF(ushort)
+DEF(half)
DEF(int)
DEF(uint)
DEF(float)
diff --git a/backend/src/libocl/src/ocl_sampler.ll b/backend/src/libocl/src/ocl_sampler.ll
new file mode 100644
index 0000000..6d39fdb
--- /dev/null
+++ b/backend/src/libocl/src/ocl_sampler.ll
@@ -0,0 +1,10 @@
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir"
+%opencl.sampler_t = type opaque
+
+declare %opencl.sampler_t addrspace(2)*@__gen_ocl_int_to_sampler(i32)
+
+define %opencl.sampler_t addrspace(2)*@__translate_sampler_initializer(i32 %s) {
+ %call = call %opencl.sampler_t addrspace(2)*@__gen_ocl_int_to_sampler(i32 %s)
+ ret %opencl.sampler_t addrspace(2)* %call
+}
diff --git a/backend/src/libocl/src/ocl_sampler_20.ll b/backend/src/libocl/src/ocl_sampler_20.ll
new file mode 100644
index 0000000..bea6d75
--- /dev/null
+++ b/backend/src/libocl/src/ocl_sampler_20.ll
@@ -0,0 +1,10 @@
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+%opencl.sampler_t = type opaque
+
+declare %opencl.sampler_t addrspace(2)*@__gen_ocl_int_to_sampler(i32)
+
+define %opencl.sampler_t addrspace(2)*@__translate_sampler_initializer(i32 %s) {
+ %call = call %opencl.sampler_t addrspace(2)*@__gen_ocl_int_to_sampler(i32 %s)
+ ret %opencl.sampler_t addrspace(2)* %call
+}
diff --git a/backend/src/llvm/ExpandLargeIntegers.cpp b/backend/src/llvm/ExpandLargeIntegers.cpp
index 60740f5..8515dc1 100644
--- a/backend/src/llvm/ExpandLargeIntegers.cpp
+++ b/backend/src/llvm/ExpandLargeIntegers.cpp
@@ -93,7 +93,7 @@
using namespace llvm;
-#if LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
#define DEBUG_TYPE "nacl-expand-ints"
#endif
@@ -766,7 +766,7 @@ static void convertInstruction(Instruction *Inst, ConversionState &State,
bool ExpandLargeIntegers::runOnFunction(Function &F) {
// Don't support changing the function arguments. Illegal function arguments
// should not be generated by clang.
-#if LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
for (const Argument &Arg : F.args())
#else
for (const Argument &Arg : F.getArgumentList())
@@ -789,7 +789,7 @@ bool ExpandLargeIntegers::runOnFunction(Function &F) {
// Only attempt to convert an instruction if its result or any of its
// operands are illegal.
bool ShouldConvert = shouldConvert(&I);
-#if LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
for (Value *Op : I.operands())
ShouldConvert |= shouldConvert(Op);
#else
diff --git a/backend/src/llvm/ExpandUtils.cpp b/backend/src/llvm/ExpandUtils.cpp
index a09d990..cb1736b 100644
--- a/backend/src/llvm/ExpandUtils.cpp
+++ b/backend/src/llvm/ExpandUtils.cpp
@@ -101,7 +101,11 @@ namespace llvm {
Function *RecreateFunction(Function *Func, FunctionType *NewType) {
Function *NewFunc = Function::Create(NewType, Func->getLinkage());
NewFunc->copyAttributesFrom(Func);
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ Func->getParent()->getFunctionList().insert(Func->getIterator(), NewFunc);
+#else
Func->getParent()->getFunctionList().insert(ilist_iterator<Function>(Func), NewFunc);
+#endif
NewFunc->takeName(Func);
NewFunc->getBasicBlockList().splice(NewFunc->begin(),
Func->getBasicBlockList());
diff --git a/backend/src/llvm/PromoteIntegers.cpp b/backend/src/llvm/PromoteIntegers.cpp
index a500311..d433771 100644
--- a/backend/src/llvm/PromoteIntegers.cpp
+++ b/backend/src/llvm/PromoteIntegers.cpp
@@ -605,8 +605,13 @@ static void convertInstruction(Instruction *Inst, ConversionState &State) {
for (SwitchInst::CaseIt I = Switch->case_begin(),
E = Switch->case_end();
I != E; ++I) {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 50
+ NewInst->addCase(cast<ConstantInt>(convertConstant(I->getCaseValue())),
+ I->getCaseSuccessor());
+#else
NewInst->addCase(cast<ConstantInt>(convertConstant(I.getCaseValue())),
I.getCaseSuccessor());
+#endif
}
Switch->eraseFromParent();
} else {
diff --git a/backend/src/llvm/StripAttributes.cpp b/backend/src/llvm/StripAttributes.cpp
index 9d07c29..99e5a71 100644
--- a/backend/src/llvm/StripAttributes.cpp
+++ b/backend/src/llvm/StripAttributes.cpp
@@ -79,10 +79,13 @@ namespace {
class StripAttributes : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
- StripAttributes() : FunctionPass(ID) {
+ StripAttributes(bool lastTime) : FunctionPass(ID),
+ lastTime(lastTime) {
}
virtual bool runOnFunction(Function &Func);
+ private:
+ bool lastTime; //last time all StripAttributes
};
}
@@ -93,7 +96,11 @@ bool StripAttributes::runOnFunction(Function &Func) {
Func.setLinkage(GlobalValue::ExternalLinkage);
if (!gbe::isKernelFunction(Func)) {
Func.addFnAttr(Attribute::AlwaysInline);
- Func.setLinkage(GlobalValue::LinkOnceAnyLinkage);
+ if (lastTime ||
+ (Func.getName().find("__gen_mem") == std::string::npos))
+ // Memcpy and memset functions could be deleted at last inline.
+ // Delete memcpy and memset functions for output llvm ir friendly.
+ Func.setLinkage(GlobalValue::LinkOnceAnyLinkage);
}
for (Function::iterator BB = Func.begin(), E = Func.end();
@@ -109,6 +116,6 @@ bool StripAttributes::runOnFunction(Function &Func) {
return true;
}
-FunctionPass *llvm::createStripAttributesPass() {
- return new StripAttributes();
+FunctionPass *llvm::createStripAttributesPass(bool lastTime) {
+ return new StripAttributes(lastTime);
}
diff --git a/backend/src/llvm/llvm_barrier_nodup.cpp b/backend/src/llvm/llvm_barrier_nodup.cpp
index 727e6bd..b8ffdf4 100644
--- a/backend/src/llvm/llvm_barrier_nodup.cpp
+++ b/backend/src/llvm/llvm_barrier_nodup.cpp
@@ -48,7 +48,12 @@ namespace gbe {
}
- virtual const char *getPassName() const {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ virtual StringRef getPassName() const
+#else
+ virtual const char *getPassName() const
+#endif
+ {
return "SPIR backend: set barrier no duplicate attr";
}
@@ -69,7 +74,11 @@ namespace gbe {
if (F.hasFnAttribute(Attribute::NoDuplicate)) {
auto attrs = F.getAttributes();
F.setAttributes(attrs.removeAttribute(M.getContext(),
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 50
+ AttributeList::FunctionIndex,
+#else
AttributeSet::FunctionIndex,
+#endif
Attribute::NoDuplicate));
changed = true;
}
diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 89d5e7c..ef56e4c 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -60,7 +60,7 @@ namespace gbe
return NULL;
}
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
oclLib = getLazyIRFileModule(FilePath, Err, ctx);
#else
oclLib = getLazyIRFileModule(FilePath, Err, ctx).release();
@@ -117,17 +117,28 @@ namespace gbe
std::string ErrInfo;// = "Not Materializable";
if (!fromSrc && newMF->isMaterializable()) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
- if (newMF->Materialize(&ErrInfo)) {
- printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), ErrInfo.c_str());
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ if (llvm::Error EC = newMF->materialize()) {
+ std::string Msg;
+ handleAllErrors(std::move(EC), [&](ErrorInfoBase &EIB) {
+ Msg = EIB.message();
+ });
+ printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), Msg.c_str());
return false;
}
-#else
+ Gvs.push_back((GlobalValue *)newMF);
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
if (std::error_code EC = newMF->materialize()) {
printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), EC.message().c_str());
return false;
}
Gvs.push_back((GlobalValue *)newMF);
+#else
+ if (newMF->Materialize(&ErrInfo)) {
+ printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), ErrInfo.c_str());
+ return false;
+ }
+
#endif
}
if (!materializedFuncCall(src, lib, *newMF, MFS, Gvs))
@@ -250,21 +261,30 @@ namespace gbe
}
std::string ErrInfo;// = "Not Materializable";
if (newMF->isMaterializable()) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
- if (newMF->Materialize(&ErrInfo)) {
- printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), ErrInfo.c_str());
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ if (llvm::Error EC = newMF->materialize()) {
+ std::string Msg;
+ handleAllErrors(std::move(EC), [&](ErrorInfoBase &EIB) {
+ Msg = EIB.message();
+ });
+ printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), Msg.c_str());
delete clonedLib;
return NULL;
}
- }
-#else
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
if (std::error_code EC = newMF->materialize()) {
printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), EC.message().c_str());
delete clonedLib;
return NULL;
}
- }
+#else
+ if (newMF->Materialize(&ErrInfo)) {
+ printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), ErrInfo.c_str());
+ delete clonedLib;
+ return NULL;
+ }
#endif
+ }
if (!materializedFuncCall(*mod, *clonedLib, *newMF, materializedFuncs, Gvs)) {
delete clonedLib;
@@ -287,12 +307,17 @@ namespace gbe
* pass to extract the functions and values in Gvs from the library module.
* After extract what we need and remove what we do not need, we use
* materializeAll to mark the module as materialized. */
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
/* Get all GlobalValue from module. */
Module::GlobalListType &GVlist = clonedLib->getGlobalList();
for(Module::global_iterator GVitr = GVlist.begin();GVitr != GVlist.end();++GVitr) {
GlobalValue * GV = &*GVitr;
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ ExitOnError ExitOnErr("Can not materialize the clonedLib: ");
+ ExitOnErr(clonedLib->materialize(GV));
+#else
clonedLib->materialize(GV);
+#endif
Gvs.push_back(GV);
}
llvm::legacy::PassManager Extract;
@@ -300,8 +325,13 @@ namespace gbe
Extract.add(createGVExtractionPass(Gvs, false));
Extract.run(*clonedLib);
/* Mark the library module as materialized for later use. */
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ ExitOnError ExitOnErr("Can not materialize the clonedLib: ");
+ ExitOnErr(clonedLib->materializeAll());
+#else
clonedLib->materializeAll();
#endif
+#endif
/* the SPIR binary datalayout maybe different with beignet's bitcode */
if(clonedLib->getDataLayout() != mod->getDataLayout())
@@ -309,23 +339,24 @@ namespace gbe
/* We use beignet's bitcode as dst because it will have a lot of
lazy functions which will not be loaded. */
- char* errorMsg;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
- if(LLVMLinkModules2(wrap(clonedLib), wrap(mod))) {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+ llvm::Module * linked_module = llvm::CloneModule((llvm::Module*)mod).release();
+ if(LLVMLinkModules2(wrap(clonedLib), wrap(linked_module))) {
#else
+ char* errorMsg;
if(LLVMLinkModules(wrap(clonedLib), wrap(mod), LLVMLinkerDestroySource, &errorMsg)) {
+ printf("Fatal Error: link the bitcode error:\n%s\n", errorMsg);
#endif
delete clonedLib;
- printf("Fatal Error: link the bitcode error:\n%s\n", errorMsg);
return NULL;
}
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
llvm::legacy::PassManager passes;
#else
llvm::PassManager passes;
#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
auto PreserveKernel = [=](const GlobalValue &GV) {
for(size_t i = 0;i < kernels.size(); ++i)
if(strcmp(GV.getName().data(), kernels[i]))
diff --git a/backend/src/llvm/llvm_device_enqueue.cpp b/backend/src/llvm/llvm_device_enqueue.cpp
index ee236de..58aa681 100644
--- a/backend/src/llvm/llvm_device_enqueue.cpp
+++ b/backend/src/llvm/llvm_device_enqueue.cpp
@@ -29,6 +29,7 @@ namespace gbe {
BitCastInst* bt = dyn_cast<BitCastInst>(I);
if (bt == NULL)
return NULL;
+//bt->dump();
Type* type = bt->getOperand(0)->getType();
if(!type->isPointerTy())
@@ -62,7 +63,7 @@ namespace gbe {
for (Value::use_iterator iter = v->use_begin(); iter != v->use_end(); ++iter) {
// After LLVM 3.5, use_iterator points to 'Use' instead of 'User',
// which is more straightforward.
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
User *theUser = *iter;
#else
User *theUser = iter->getUser();
@@ -84,7 +85,7 @@ namespace gbe {
Function* setFunctionAsKernel(Module *mod, Function *Fn)
{
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 9)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
LLVMContext &Context = mod->getContext();
Type *intTy = IntegerType::get(mod->getContext(), 32);
SmallVector<llvm::Metadata *, 5> kernelMDArgs;
@@ -112,7 +113,8 @@ namespace gbe {
ValueToValueMapTy VMap;
for (Function::arg_iterator I = Fn->arg_begin(), E = Fn->arg_end(); I != E; ++I) {
PointerType *ty = dyn_cast<PointerType>(I->getType());
- if(ty && ty->getAddressSpace() == 0) //Foce set the address space to global
+ //Foce set the address space to global
+ if(ty && (ty->getAddressSpace() == 0 || ty->getAddressSpace() == 4))
ty = PointerType::get(ty->getPointerElementType(), 1);
ParamTys.push_back(ty);
}
@@ -210,7 +212,7 @@ namespace gbe {
}
for (Value::use_iterator iter = bt->use_begin(); iter != bt->use_end(); ++iter) {
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
User *theUser = *iter;
#else
User *theUser = iter->getUser();
@@ -252,12 +254,13 @@ namespace gbe {
if(gep == NULL)
continue;
- BitCastInst* fnPointer = dyn_cast<BitCastInst>(gep->getOperand(0));
- if(fnPointer == NULL)
+ Value *fnPointer = gep->getOperand(0)->stripPointerCasts();
+
+ if(fnPointer == gep->getOperand(0))
continue;
- if(BitCastInst* bt = dyn_cast<BitCastInst>(fnPointer->getOperand(0))) {
- std::string fnName = blocks[bt->getOperand(0)];
+ if(blocks.find(fnPointer) != blocks.end()) {
+ std::string fnName = blocks[fnPointer];
Function* f = mod->getFunction(fnName);
CallInst *newCI = builder.CreateCall(f, args);
CI->replaceAllUsesWith(newCI);
@@ -266,7 +269,7 @@ namespace gbe {
}
//the function is global variable
- if(GlobalVariable* gv = dyn_cast<GlobalVariable>(fnPointer->getOperand(0))) {
+ if(GlobalVariable* gv = dyn_cast<GlobalVariable>(fnPointer)) {
Constant *c = gv->getInitializer();
ConstantExpr *expr = dyn_cast<ConstantExpr>(c->getOperand(3));
BitCastInst *bt = dyn_cast<BitCastInst>(expr->getAsInstruction());
@@ -277,7 +280,7 @@ namespace gbe {
continue;
}
- ld = dyn_cast<LoadInst>(fnPointer->getOperand(0));
+ ld = dyn_cast<LoadInst>(fnPointer);
if(ld == NULL)
continue;
@@ -298,15 +301,13 @@ namespace gbe {
if(AllocaInst *ai = dyn_cast<AllocaInst>(ld->getPointerOperand())) {
Value *v = NULL;
for (Value::use_iterator iter = ai->use_begin(); iter != ai->use_end(); ++iter) {
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
User *theUser = *iter;
#else
User *theUser = iter->getUser();
#endif
if(StoreInst *st = dyn_cast<StoreInst>(theUser)) {
- bt = dyn_cast<BitCastInst>(st->getValueOperand());
- if(bt)
- v = bt->getOperand(0);
+ v = st->getValueOperand()->stripPointerCasts();
}
}
if(blocks.find(v) == blocks.end()) {
@@ -339,23 +340,19 @@ namespace gbe {
Type *type = CI->getArgOperand(block_index)->getType();
if(type->isIntegerTy())
block_index = 6;
- Value *block = CI->getArgOperand(block_index);
- while(isa<BitCastInst>(block))
- block = dyn_cast<BitCastInst>(block)->getOperand(0);
+ Value *block = CI->getArgOperand(block_index)->stripPointerCasts();
LoadInst *ld = dyn_cast<LoadInst>(block);
Value *v = NULL;
if(ld) {
Value *block = ld->getPointerOperand();
for (Value::use_iterator iter = block->use_begin(); iter != block->use_end(); ++iter) {
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
User *theUser = *iter;
#else
User *theUser = iter->getUser();
#endif
if(StoreInst *st = dyn_cast<StoreInst>(theUser)) {
- BitCastInst *bt = dyn_cast<BitCastInst>(st->getValueOperand());
- if(bt)
- v = bt->getOperand(0);
+ v = st->getValueOperand()->stripPointerCasts();
}
}
if(blocks.find(v) == blocks.end()) {
@@ -378,15 +375,20 @@ namespace gbe {
if( fn->isVarArg() ) {
//enqueue function with slm, convert to __gen_enqueue_kernel_slm call
//store the slm information to a alloca address.
- int start = block_index + 1;
+ int start = block_index + 1 + 1; //the first is count, skip
int count = CI->getNumArgOperands() - start;
Type *intTy = IntegerType::get(mod->getContext(), 32);
+ Type *int64Ty = IntegerType::get(mod->getContext(), 64);
AllocaInst *AI = builder.CreateAlloca(intTy, ConstantInt::get(intTy, count));
for(uint32_t i = start; i < CI->getNumArgOperands(); i++) {
Value *ptr = builder.CreateGEP(AI, ConstantInt::get(intTy, i-start));
- builder.CreateStore(CI->getArgOperand(i), ptr);
+ Value *argSize = CI->getArgOperand(i);
+ if (argSize->getType() == int64Ty) {
+ argSize = builder.CreateTrunc(argSize, intTy);
+ }
+ builder.CreateStore(argSize, ptr);
}
SmallVector<Value*, 16> args(CI->op_begin(), CI->op_begin() + 3);
args.push_back(CI->getArgOperand(block_index));
@@ -394,8 +396,8 @@ namespace gbe {
args.push_back(AI);
std::vector<Type *> ParamTys;
- for (Value** I = args.begin(); I != args.end(); ++I)
- ParamTys.push_back((*I)->getType());
+ for (Value** iter = args.begin(); iter != args.end(); ++iter)
+ ParamTys.push_back((*iter)->getType());
CallInst* newCI = builder.CreateCall(cast<llvm::Function>(mod->getOrInsertFunction(
"__gen_enqueue_kernel_slm", FunctionType::get(intTy, ParamTys, false))), args);
CI->replaceAllUsesWith(newCI);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3fefa92..c93d89c 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -95,9 +95,9 @@
#define LLVM_VERSION_MINOR 0
#endif /* !defined(LLVM_VERSION_MINOR) */
-#if (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR < 3)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 33
#error "Only LLVM 3.3 and newer are supported"
-#endif /* (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 4) */
+#endif
using namespace llvm;
@@ -357,6 +357,16 @@ namespace gbe
GBE_ASSERT(! (isa<Constant>(value) && !isa<GlobalValue>(value)));
Type *type = value->getType();
auto typeID = type->getTypeID();
+ if (typeID == Type::PointerTyID)
+ {
+ Type *eltTy = dyn_cast<PointerType>(type)->getElementType();
+ if (eltTy->isStructTy()) {
+ StructType *strTy = dyn_cast<StructType>(eltTy);
+ if (!strTy->isLiteral() && strTy->getName().data() &&
+ strstr(strTy->getName().data(), "sampler"))
+ type = Type::getInt32Ty(value->getContext());
+ }
+ }
switch (typeID) {
case Type::IntegerTyID:
case Type::FloatTyID:
@@ -565,7 +575,7 @@ namespace gbe
has_errors(false),
legacyMode(true)
{
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
initializeLoopInfoWrapperPassPass(*PassRegistry::getPassRegistry());
#else
initializeLoopInfoPass(*PassRegistry::getPassRegistry());
@@ -573,10 +583,14 @@ namespace gbe
pass = PASS_EMIT_REGISTERS;
}
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ virtual llvm::StringRef getPassName() const { return "Gen Back-End"; }
+#else
virtual const char *getPassName() const { return "Gen Back-End"; }
+#endif
void getAnalysisUsage(AnalysisUsage &AU) const {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
AU.addRequired<LoopInfoWrapperPass>();
#else
AU.addRequired<LoopInfo>();
@@ -611,7 +625,7 @@ namespace gbe
if (legacyMode)
analyzePointerOrigin(F);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
#else
LI = &getAnalysis<LoopInfo>();
@@ -726,6 +740,8 @@ namespace gbe
DECL_VISIT_FN(AtomicCmpXchgInst, AtomicCmpXchgInst);
#undef DECL_VISIT_FN
+ // Emit rounding instructions from gen native function
+ void emitRoundingCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode);
// Emit unary instructions from gen native function
void emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode, ir::Type = ir::TYPE_FLOAT);
// Emit unary instructions from gen native function
@@ -745,9 +761,6 @@ namespace gbe
void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
void visitSwitchInst(SwitchInst &I) {NOT_SUPPORTED;}
void visitInvokeInst(InvokeInst &I) {NOT_SUPPORTED;}
-#if LLVM_VERSION_MINOR == 0
- void visitUnwindInst(UnwindInst &I) {NOT_SUPPORTED;}
-#endif /* __LLVM_30__ */
void visitResumeInst(ResumeInst &I) {NOT_SUPPORTED;}
void visitInlineAsm(CallInst &I) {NOT_SUPPORTED;}
void visitIndirectBrInst(IndirectBrInst &I) {NOT_SUPPORTED;}
@@ -766,7 +779,7 @@ namespace gbe
void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI);
void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) {
- if (unit.printfs.find(inst) == unit.printfs.end())
+ if (unit.printfs.find((void *)inst) == unit.printfs.end())
return NULL;
return unit.printfs[inst];
}
@@ -837,7 +850,7 @@ namespace gbe
for (Value::use_iterator iter = work->use_begin(); iter != work->use_end(); ++iter) {
// After LLVM 3.5, use_iterator points to 'Use' instead of 'User',
// which is more straightforward.
- #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+ #if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
User *theUser = *iter;
#else
User *theUser = iter->getUser();
@@ -961,7 +974,7 @@ namespace gbe
CallInst *ci = dyn_cast<CallInst>(theUser);
pointer = ci ? ci->getArgOperand(0) : NULL;
} else {
- theUser->dump();
+ //theUser->dump();
GBE_ASSERT(0 && "Unknown instruction operating on pointers\n");
}
@@ -1091,7 +1104,7 @@ namespace gbe
if (predBB->getTerminator())
Builder2.SetInsertPoint(predBB->getTerminator());
-#if (LLVM_VERSION_MAJOR== 3 && LLVM_VERSION_MINOR < 6)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 36
// llvm 3.5 and older version don't have CreateBitOrPointerCast() define
Type *srcTy = base->getType();
Type *dstTy = ptr->getType();
@@ -1109,7 +1122,7 @@ namespace gbe
pointerBaseMap.insert(std::make_pair(ptr, basePhi));
return basePhi;
} else {
- ptr->dump();
+ //ptr->dump();
GBE_ASSERT(0 && "Unhandled instruction in getPointerBase\n");
return ptr;
}
@@ -1190,7 +1203,7 @@ namespace gbe
BtiValueMap.insert(std::make_pair(Val, btiPhi));
return btiPhi;
} else {
- Val->dump();
+ //Val->dump();
GBE_ASSERT(0 && "Unhandled instruction in getBtiRegister\n");
return Val;
}
@@ -1250,7 +1263,7 @@ namespace gbe
uint32_t ops = clKernels->getNumOperands();
for(uint32_t x = 0; x < ops; x++) {
MDNode* node = clKernels->getOperand(x);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
Value * op = node->getOperand(0);
#else
auto *V = cast<ValueAsMetadata>(node->getOperand(0));
@@ -1274,7 +1287,7 @@ namespace gbe
MDNode *typeNameNode = NULL;
MDNode *typeBaseNameNode = NULL;
MDNode *typeQualNode = NULL;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
typeNameNode = F.getMetadata("kernel_arg_type");
typeBaseNameNode = F.getMetadata("kernel_arg_base_type");
typeQualNode = F.getMetadata("kernel_arg_type_qual");
@@ -1300,7 +1313,7 @@ namespace gbe
ir::FunctionArgument::InfoFromLLVM llvmInfo;
for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I, argID++) {
unsigned opID = argID;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 39
opID += 1;
#endif
@@ -1342,7 +1355,7 @@ namespace gbe
for (Value::use_iterator iter = work->use_begin(); iter != work->use_end(); ++iter) {
// After LLVM 3.5, use_iterator points to 'Use' instead of 'User',
// which is more straightforward.
- #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+ #if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
User *theUser = *iter;
#else
User *theUser = iter->getUser();
@@ -1505,15 +1518,15 @@ namespace gbe
Value *pointer = expr->getOperand(0);
if (expr->getOpcode() == Instruction::GetElementPtr) {
uint32_t constantOffset = 0;
- CompositeType* CompTy = cast<CompositeType>(pointer->getType());
+ Type* EltTy = pointer->getType();
for(uint32_t op=1; op<expr->getNumOperands(); ++op) {
int32_t TypeIndex;
ConstantInt* ConstOP = dyn_cast<ConstantInt>(expr->getOperand(op));
GBE_ASSERTM(ConstOP != NULL, "must be constant index");
TypeIndex = ConstOP->getZExtValue();
GBE_ASSERT(TypeIndex >= 0);
- constantOffset += getGEPConstOffset(unit, CompTy, TypeIndex);
- CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+ constantOffset += getGEPConstOffset(unit, pointer->getType(), TypeIndex);
+ EltTy = getEltType(EltTy, TypeIndex);
}
ir::Constant cc = unit.getConstantSet().getConstant(pointer->getName());
@@ -1644,7 +1657,7 @@ namespace gbe
}
default:
{
- c->dump();
+ //c->dump();
NOT_IMPLEMENTED;
}
}
@@ -1749,7 +1762,6 @@ namespace gbe
{
GBE_ASSERT(dyn_cast<ConstantExpr>(CPV) == NULL);
-#if LLVM_VERSION_MINOR > 0
ConstantDataSequential *seq = dyn_cast<ConstantDataSequential>(CPV);
if (seq) {
@@ -1772,7 +1784,6 @@ namespace gbe
GBE_ASSERTM(0, "Const data array never be half float\n");
}
} else
-#endif /* LLVM_VERSION_MINOR > 0 */
if (dyn_cast<ConstantAggregateZero>(CPV)) {
Type* Ty = CPV->getType();
@@ -1898,7 +1909,7 @@ namespace gbe
ir::ImmediateIndex GenWriter::processConstantImmIndex(Constant *CPV, int32_t index) {
if (dyn_cast<ConstantExpr>(CPV) == NULL)
return processConstantImmIndexImpl(CPV, index);
- CPV->dump();
+ //CPV->dump();
GBE_ASSERT(0 && "unsupported constant.\n");
return ctx.newImmediate((uint32_t)0);
}
@@ -2114,6 +2125,7 @@ namespace gbe
// Loop over the kernel metadatas to set the required work group size.
size_t reqd_wg_sz[3] = {0, 0, 0};
size_t hint_wg_sz[3] = {0, 0, 0};
+ size_t reqd_sg_sz = 0;
ir::FunctionArgument::InfoFromLLVM llvmInfo;
MDNode *addrSpaceNode = NULL;
MDNode *typeNameNode = NULL;
@@ -2124,7 +2136,7 @@ namespace gbe
std::string functionAttributes;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
/* LLVM 3.9 change kernel arg info as function metadata */
addrSpaceNode = F.getMetadata("kernel_arg_addr_space");
accessQualNode = F.getMetadata("kernel_arg_access_qual");
@@ -2209,6 +2221,27 @@ namespace gbe
functionAttributes += buffer;
functionAttributes += " ";
}
+ if ((attrNode = F.getMetadata("intel_reqd_sub_group_size"))) {
+ GBE_ASSERT(attrNode->getNumOperands() == 1);
+ ConstantInt *sz = mdconst::extract<ConstantInt>(attrNode->getOperand(0));
+ GBE_ASSERT(sz);
+ reqd_sg_sz = sz->getZExtValue();
+ if(!(reqd_sg_sz == 8 || reqd_sg_sz == 16)){
+ F.getContext().emitError("Required sub group size is illegal!");
+ ctx.getUnit().setValid(false);
+ return;
+ }
+ functionAttributes += "intel_reqd_sub_group_size";
+ std::stringstream param;
+ char buffer[100] = {0};
+ param << "(";
+ param << reqd_sg_sz;
+ param << ")";
+ param >> buffer;
+ functionAttributes += buffer;
+ functionAttributes += " ";
+ }
+
#else
/* First find the meta data belong to this function. */
MDNode *node = getKernelFunctionMetadata(&F);
@@ -2226,7 +2259,7 @@ namespace gbe
if (attrName->getString() == "reqd_work_group_size") {
GBE_ASSERT(attrNode->getNumOperands() == 4);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
@@ -2268,13 +2301,13 @@ namespace gbe
} else if (attrName->getString() == "vec_type_hint") {
GBE_ASSERT(attrNode->getNumOperands() == 3);
functionAttributes += attrName->getString();
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
Value* V = attrNode->getOperand(1);
#else
auto *Op1 = cast<ValueAsMetadata>(attrNode->getOperand(1));
Value *V = Op1 ? Op1->getValue() : NULL;
#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
ConstantInt *sign = dyn_cast<ConstantInt>(attrNode->getOperand(2));
#else
ConstantInt *sign = mdconst::extract<ConstantInt>(attrNode->getOperand(2));
@@ -2303,7 +2336,7 @@ namespace gbe
functionAttributes += " ";
} else if (attrName->getString() == "work_group_size_hint") {
GBE_ASSERT(attrNode->getNumOperands() == 4);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
@@ -2334,6 +2367,8 @@ namespace gbe
#endif /* LLVM 3.9 Function metadata */
ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
+ if (reqd_sg_sz)
+ ctx.setSimdWidth(reqd_sg_sz);
ctx.getFunction().setFunctionAttributes(functionAttributes);
// Loop over the arguments and output registers for them
@@ -2343,18 +2378,15 @@ namespace gbe
Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
// Insert a new register for each function argument
-#if LLVM_VERSION_MINOR <= 1
- const AttrListPtr &PAL = F.getAttributes();
-#endif /* LLVM_VERSION_MINOR <= 1 */
for (; I != E; ++I, ++argID) {
uint32_t opID = argID;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 39
opID += 1;
#endif
const std::string &argName = I->getName().str();
Type *type = I->getType();
if(addrSpaceNode) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode->getOperand(opID)))->getZExtValue();
#else
llvmInfo.addrSpace = (mdconst::extract<ConstantInt>(addrSpaceNode->getOperand(opID)))->getZExtValue();
@@ -2417,10 +2449,11 @@ namespace gbe
}
if (llvmInfo.isSamplerType()) {
- ctx.input(argName, ir::FunctionArgument::SAMPLER, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
+ ctx.input(argName, ir::FunctionArgument::SAMPLER, reg, llvmInfo, 4, 4, 0);
(void)ctx.getFunction().getSamplerSet()->append(reg, &ctx);
continue;
}
+
if(llvmInfo.isPipeType()) {
llvmInfo.typeSize = getTypeSize(F.getParent(),unit,llvmInfo.typeName);
ctx.input(argName, ir::FunctionArgument::PIPE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), BtiMap.find(&*I)->second);
@@ -2435,11 +2468,7 @@ namespace gbe
continue;
Type *pointed = pointerType->getElementType();
// By value structure
-#if LLVM_VERSION_MINOR <= 1
- if (PAL.paramHasAttr(argID+1, Attribute::ByVal)) {
-#else
if (I->hasByValAttr()) {
-#endif /* LLVM_VERSION_MINOR <= 1 */
const size_t structSize = getTypeByteSize(unit, pointed);
ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, llvmInfo, structSize, getAlignmentByte(unit, type), 0);
}
@@ -2925,7 +2954,7 @@ namespace gbe
const Instruction *insn = NULL;
for(Value::const_use_iterator iter = v->use_begin(); iter != v->use_end(); ++iter) {
// After LLVM 3.5, use_iterator points to 'Use' instead of 'User', which is more straightforward.
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR < 35
const User *theUser = *iter;
#else
const User *theUser = iter->getUser();
@@ -2980,10 +3009,12 @@ namespace gbe
this->newRegister(const_cast<GlobalVariable*>(&v));
ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
- ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
if (!legacyMode) {
- ctx.ADD(getType(ctx, v.getType()), reg, ir::ocl::constant_addrspace, reg);
- }
+ ir::Register regload = ctx.reg(getFamily(getType(ctx, v.getType())));
+ ctx.LOADI(getType(ctx, v.getType()), regload, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
+ ctx.ADD(getType(ctx, v.getType()), reg, ir::ocl::constant_addrspace, regload);
+ } else
+ ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
}
} else if(addrSpace == ir::MEM_PRIVATE) {
this->newRegister(const_cast<GlobalVariable*>(&v));
@@ -3163,15 +3194,9 @@ namespace gbe
void GenWriter::emitFunction(Function &F)
{
switch (F.getCallingConv()) {
-#if LLVM_VERSION_MINOR <= 2
- case CallingConv::PTX_Device: // we do not emit device function
- return;
- case CallingConv::PTX_Kernel:
-#else
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::SPIR_KERNEL:
-#endif
break;
default:
GBE_ASSERTM(false, "Unsupported calling convention");
@@ -3401,11 +3426,12 @@ namespace gbe
const ir::Register src0 = this->getRegister(I.getOperand(0));
const ir::Register src1 = this->getRegister(I.getOperand(1));
const ir::Register tmp = ctx.reg(getFamily(ctx, I.getType()));
+ const ir::Register tmp1 = ctx.reg(getFamily(ctx, I.getType()));
Value *cv = ConstantInt::get(I.getType(), 1);
switch (I.getPredicate()) {
case ICmpInst::FCMP_OEQ: ctx.EQ(type, dst, src0, src1); break;
- case ICmpInst::FCMP_ONE: ctx.NE(type, dst, src0, src1); break;
+ case ICmpInst::FCMP_UNE: ctx.NE(type, dst, src0, src1); break;
case ICmpInst::FCMP_OLE: ctx.LE(type, dst, src0, src1); break;
case ICmpInst::FCMP_OGE: ctx.GE(type, dst, src0, src1); break;
case ICmpInst::FCMP_OLT: ctx.LT(type, dst, src0, src1); break;
@@ -3451,9 +3477,10 @@ namespace gbe
ctx.GT(type, tmp, src0, src1);
ctx.XOR(insnType, dst, tmp, getRegister(cv));
break;
- case ICmpInst::FCMP_UNE:
- ctx.EQ(type, tmp, src0, src1);
- ctx.XOR(insnType, dst, tmp, getRegister(cv));
+ case ICmpInst::FCMP_ONE:
+ ctx.LT(type, tmp, src0, src1);
+ ctx.GT(type, tmp1, src0, src1);
+ ctx.OR(insnType, dst, tmp, tmp1);
break;
case ICmpInst::FCMP_TRUE:
ctx.MOV(insnType, dst, getRegister(cv));
@@ -3788,14 +3815,12 @@ namespace gbe
break;
case Intrinsic::stackrestore:
break;
-#if LLVM_VERSION_MINOR >= 2
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
break;
case Intrinsic::fmuladd:
this->newRegister(&I);
break;
-#endif /* LLVM_VERSION_MINOR >= 2 */
case Intrinsic::debugtrap:
case Intrinsic::trap:
case Intrinsic::dbg_value:
@@ -4081,6 +4106,15 @@ namespace gbe
regTranslator.newValueProxy(srcValue, dst);
break;
}
+ case GEN_OCL_INT_TO_SAMPLER:
+ case GEN_OCL_SAMPLER_TO_INT:
+ {
+ Value *srcValue = I.getOperand(0);
+ //srcValue->dump();
+ //dst->dump();
+ regTranslator.newValueProxy(srcValue, dst);
+ break;
+ }
case GEN_OCL_ENQUEUE_GET_ENQUEUE_INFO_ADDR:
regTranslator.newScalarProxy(ir::ocl::enqueuebufptr, dst);
break;
@@ -4122,6 +4156,21 @@ namespace gbe
};
}
+ void GenWriter::emitRoundingCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode) {
+ if (I.getType()->isHalfTy()) {
+ const ir::Register src = this->getRegister(I.getOperand(0));
+ const ir::Register srcFloat = ctx.reg(ir::FAMILY_DWORD);
+ const ir::Register dstFloat = ctx.reg(ir::FAMILY_DWORD);
+ const ir::Register dst = this->getRegister(&I);
+ ctx.F16TO32(ir::TYPE_FLOAT, ir::TYPE_U16, srcFloat, src);
+ ctx.ALU1(opcode, ir::TYPE_FLOAT, dstFloat, srcFloat);
+ ctx.F32TO16(ir::TYPE_U16, ir::TYPE_FLOAT, dst, dstFloat);
+ } else {
+ GBE_ASSERT(I.getType()->isFloatTy());
+ this->emitUnaryCallInst(I,CS,opcode);
+ }
+ }
+
void GenWriter::emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode, ir::Type type) {
CallSite::arg_iterator AI = CS.arg_begin();
#if GBE_DEBUG
@@ -4499,10 +4548,19 @@ namespace gbe
/* append a new sampler. should be called before any reference to
* a sampler_t value. */
uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ CallInst *TC = dyn_cast<CallInst>(*AI);
+ Constant *CPV = TC ? dyn_cast<Constant>(TC->getOperand(0)) : NULL;
+#else
Constant *CPV = dyn_cast<Constant>(*AI);
+#endif
uint8_t index;
if (CPV != NULL)
{
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ // Check if the Callee is sampler convert function
+ GBE_ASSERT(TC->getCalledFunction()->getName().str() == "__gen_ocl_int_to_sampler");
+#endif
// This is not a kernel argument sampler, we need to append it to sampler set,
// and allocate a sampler slot for it.
const ir::Immediate &x = processConstantImm(CPV);
@@ -4549,11 +4607,9 @@ namespace gbe
ctx.MOV(ir::getType(family), dst, src);
}
break;
-#if LLVM_VERSION_MINOR >= 2
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
break;
-#endif /* LLVM_VERSION_MINOR >= 2 */
case Intrinsic::debugtrap:
case Intrinsic::trap:
case Intrinsic::dbg_value:
@@ -4699,10 +4755,10 @@ namespace gbe
}
break;
case Intrinsic::sqrt: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
- case Intrinsic::ceil: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
- case Intrinsic::trunc: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
- case Intrinsic::rint: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
- case Intrinsic::floor: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
+ case Intrinsic::ceil: this->emitRoundingCallInst(I,CS,ir::OP_RNDU); break;
+ case Intrinsic::trunc: this->emitRoundingCallInst(I,CS,ir::OP_RNDZ); break;
+ case Intrinsic::rint: this->emitRoundingCallInst(I,CS,ir::OP_RNDE); break;
+ case Intrinsic::floor: this->emitRoundingCallInst(I,CS,ir::OP_RNDD); break;
case Intrinsic::sin: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
case Intrinsic::cos: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
case Intrinsic::log2: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
@@ -5484,15 +5540,21 @@ namespace gbe
case GEN_OCL_GET_PIPE:
case GEN_OCL_MAKE_RID:
case GEN_OCL_GET_RID:
+ case GEN_OCL_INT_TO_SAMPLER:
+ case GEN_OCL_SAMPLER_TO_INT:
{
break;
}
case GEN_OCL_ENQUEUE_SET_NDRANGE_INFO:
{
GBE_ASSERT(AI != AE);
+ Value *dstValue;
+ if(I.hasStructRetAttr())
+ dstValue = *AI++;
+ else
+ dstValue = &I;
Value *srcValue = *AI;
++AI;
- Value *dstValue = &I;
regTranslator.newValueProxy(srcValue, dstValue);
break;
}
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 1ab77c9..b4715b1 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -46,7 +46,7 @@ namespace llvm {
FunctionPass *createExpandConstantExprPass();
FunctionPass *createExpandLargeIntegersPass();
FunctionPass *createPromoteIntegersPass();
- FunctionPass *createStripAttributesPass();
+ FunctionPass *createStripAttributesPass(bool lastTime);
// Copy debug information from Original to New, and return New.
template <typename T> T *CopyDebug(T *New, llvm::Instruction *Original) {
New->setDebugLoc(Original->getDebugLoc());
@@ -82,9 +82,9 @@ namespace gbe
auto it = map.find(symbol);
if (it == map.end()) {
- int status;
+ int status = 0; /* set for libcxxrt */
char *realName = abi::__cxa_demangle(symbol.c_str(), NULL, NULL, &status);
- if (status == 0) {
+ if (realName) {
std::string realFnName(realName), stripName;
stripName = realFnName.substr(0, realFnName.find("("));
it = map.find(stripName);
@@ -118,7 +118,10 @@ namespace gbe
uint32_t getTypeByteSize(const ir::Unit &unit, llvm::Type* Ty);
/*! Get GEP constant offset for the specified operand.*/
- int32_t getGEPConstOffset(const ir::Unit &unit, llvm::CompositeType *CompTy, int32_t TypeIndex);
+ int32_t getGEPConstOffset(const ir::Unit &unit, llvm::Type *eltTy, int32_t TypeIndex);
+
+ /*! Get element type for a type.*/
+ llvm::Type* getEltType(llvm::Type *eltTy, uint32_t index = 0);
/*! whether this is a kernel function */
bool isKernelFunction(const llvm::Function &f);
@@ -146,7 +149,7 @@ namespace gbe
/*! Insert the time stamp for profiling. */
llvm::FunctionPass* createProfilingInserterPass(int profilingType, ir::Unit &unit);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
/* customized loop unrolling pass. */
llvm::LoopPass *createCustomLoopUnrollPass();
#endif
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 86485da..08087cb 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -266,3 +266,7 @@ DECL_LLVM_GEN_FUNCTION(MAKE_RID, __gen_ocl_make_rid)
DECL_LLVM_GEN_FUNCTION(ENQUEUE_SET_NDRANGE_INFO, __gen_ocl_set_ndrange_info)
DECL_LLVM_GEN_FUNCTION(ENQUEUE_GET_NDRANGE_INFO, __gen_ocl_get_ndrange_info)
DECL_LLVM_GEN_FUNCTION(ENQUEUE_GET_ENQUEUE_INFO_ADDR, __gen_ocl_get_enqueue_info_addr)
+
+// sampler helper functions
+DECL_LLVM_GEN_FUNCTION(SAMPLER_TO_INT, __gen_ocl_sampler_to_int)
+DECL_LLVM_GEN_FUNCTION(INT_TO_SAMPLER, __gen_ocl_int_to_sampler)
diff --git a/backend/src/llvm/llvm_includes.hpp b/backend/src/llvm/llvm_includes.hpp
index 0b80979..184553a 100644
--- a/backend/src/llvm/llvm_includes.hpp
+++ b/backend/src/llvm/llvm_includes.hpp
@@ -24,6 +24,7 @@
#ifndef __GBE_IR_LLVM_INCLUDES_HPP__
#define __GBE_IR_LLVM_INCLUDES_HPP__
+#ifdef GBE_COMPILER_AVAILABLE
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
@@ -75,7 +76,12 @@
#include "llvm-c/Linker.h"
#include "llvm/IRReader/IRReader.h"
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+#include <llvm/Bitcode/BitcodeWriter.h>
+//#include <llvm/Bitcode/BitcodeReader.h>
+#else
#include "llvm/Bitcode/ReaderWriter.h"
+#endif
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -91,7 +97,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
#include "llvm/IR/Mangler.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CFG.h"
@@ -111,7 +117,7 @@
#include "llvm/Target/Mangler.h"
#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#else
@@ -122,14 +128,20 @@
#include <clang/CodeGen/CodeGenAction.h>
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/Scalar/GVN.h"
#endif
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
+#include "llvm/Support/Error.h"
+#endif
+
+#endif /*GBE_COMPILER_AVAILABLE */
+
#endif /* __GBE_IR_LLVM_INCLUDES_HPP__ */
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
index f01bb51..57c933f 100644
--- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -40,7 +40,12 @@ namespace gbe {
}
- virtual const char *getPassName() const {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ virtual StringRef getPassName() const
+#else
+ virtual const char *getPassName() const
+#endif
+ {
return "SPIR backend: lowering instrinsics";
}
static char convertSpaceToName(Value *val) {
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
index e797e98..5aa38be 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -35,7 +35,7 @@ namespace gbe {
GenLoadStoreOptimization() : BasicBlockPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
#else
@@ -46,12 +46,12 @@ namespace gbe {
}
virtual bool runOnBasicBlock(BasicBlock &BB) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
#else
SE = &getAnalysis<ScalarEvolution>();
#endif
- #if LLVM_VERSION_MINOR >= 7
+ #if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
TD = &BB.getModule()->getDataLayout();
#elif LLVM_VERSION_MINOR >= 5
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
@@ -75,8 +75,12 @@ namespace gbe {
const BasicBlock::iterator &start,
unsigned maxVecSize,
bool isLoad);
-
- virtual const char *getPassName() const {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ virtual StringRef getPassName() const
+#else
+ virtual const char *getPassName() const
+#endif
+ {
return "Merge compatible Load/stores for Gen";
}
};
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index c5f3ffe..10752a3 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -42,7 +42,7 @@ namespace gbe
{
bool isKernelFunction(const llvm::Function &F) {
bool bKernel = false;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
bKernel = F.getMetadata("kernel_arg_name") != NULL;
#else
const Module *module = F.getParent();
@@ -53,7 +53,7 @@ namespace gbe
uint32_t ops = md.getNumOperands();
for(uint32_t x = 0; x < ops; x++) {
MDNode* node = md.getOperand(x);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR <= 35
Value * op = node->getOperand(0);
#else
Value * op = cast<ValueAsMetadata>(node->getOperand(0))->getValue();
@@ -74,7 +74,7 @@ namespace gbe
if(ops > 0) {
uint32_t major = 0, minor = 0;
MDNode* node = version->getOperand(0);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
major = mdconst::extract<ConstantInt>(node->getOperand(0))->getZExtValue();
minor = mdconst::extract<ConstantInt>(node->getOperand(1))->getZExtValue();
#else
@@ -180,12 +180,23 @@ namespace gbe
return size_bit/8;
}
- int32_t getGEPConstOffset(const ir::Unit &unit, CompositeType *CompTy, int32_t TypeIndex) {
+ Type* getEltType(Type* eltTy, uint32_t index) {
+ Type *elementType = NULL;
+ if (PointerType* ptrType = dyn_cast<PointerType>(eltTy))
+ elementType = ptrType->getElementType();
+ else if(SequentialType * seqType = dyn_cast<SequentialType>(eltTy))
+ elementType = seqType->getElementType();
+ else if(CompositeType * compTy= dyn_cast<CompositeType>(eltTy))
+ elementType = compTy->getTypeAtIndex(index);
+ GBE_ASSERT(elementType);
+ return elementType;
+ }
+
+ int32_t getGEPConstOffset(const ir::Unit &unit, Type *eltTy, int32_t TypeIndex) {
int32_t offset = 0;
- SequentialType * seqType = dyn_cast<SequentialType>(CompTy);
- if (seqType != NULL) {
+ if (!eltTy->isStructTy()) {
if (TypeIndex != 0) {
- Type *elementType = seqType->getElementType();
+ Type *elementType = getEltType(eltTy);
uint32_t elementSize = getTypeByteSize(unit, elementType);
uint32_t align = getAlignmentByte(unit, elementType);
elementSize += getPadding(elementSize, align);
@@ -193,17 +204,16 @@ namespace gbe
}
} else {
int32_t step = TypeIndex > 0 ? 1 : -1;
- GBE_ASSERT(CompTy->isStructTy());
for(int32_t ty_i=0; ty_i != TypeIndex; ty_i += step)
{
- Type* elementType = CompTy->getTypeAtIndex(ty_i);
+ Type* elementType = getEltType(eltTy, ty_i);
uint32_t align = getAlignmentByte(unit, elementType);
offset += getPadding(offset, align * step);
offset += getTypeByteSize(unit, elementType) * step;
}
//add getPaddingding for accessed type
- const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
+ const uint32_t align = getAlignmentByte(unit, getEltType(eltTy ,TypeIndex));
offset += getPadding(offset, align * step);
}
return offset;
@@ -222,7 +232,11 @@ namespace gbe
AU.setPreservesCFG();
}
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ virtual StringRef getPassName() const {
+#else
virtual const char *getPassName() const {
+#endif
return "SPIR backend: insert special spir instructions";
}
@@ -247,8 +261,8 @@ namespace gbe
{
const uint32_t ptrSize = unit.getPointerSize();
Value* parentPointer = GEPInst->getOperand(0);
- CompositeType* CompTy = parentPointer ? cast<CompositeType>(parentPointer->getType()) : NULL;
- if(!CompTy)
+ Type* eltTy = parentPointer ? parentPointer->getType() : NULL;
+ if(!eltTy)
return false;
Value* currentAddrInst =
@@ -262,14 +276,15 @@ namespace gbe
ConstantInt* ConstOP = dyn_cast<ConstantInt>(GEPInst->getOperand(op));
if (ConstOP != NULL) {
TypeIndex = ConstOP->getZExtValue();
- constantOffset += getGEPConstOffset(unit, CompTy, TypeIndex);
+ constantOffset += getGEPConstOffset(unit, eltTy, TypeIndex);
}
else {
// we only have array/vectors here,
// therefore all elements have the same size
TypeIndex = 0;
- Type* elementType = CompTy->getTypeAtIndex(TypeIndex);
+ Type* elementType = getEltType(eltTy);
+
uint32_t size = getTypeByteSize(unit, elementType);
//add padding
@@ -326,7 +341,7 @@ namespace gbe
}
//step down in type hirachy
- CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+ eltTy = getEltType(eltTy, TypeIndex);
}
//insert addition of new offset before GEPInst when it is not zero
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index 800f343..6bb7c52 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -309,7 +309,11 @@ error:
bool parseOnePrintfInstruction(CallInst * call);
bool generateOneParameterInst(PrintfSlot& slot, Value* arg, Value*& new_arg);
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ virtual StringRef getPassName() const
+#else
virtual const char *getPassName() const
+#endif
{
return "Printf Parser";
}
@@ -381,7 +385,7 @@ error:
}
GBE_ASSERT(unit.printfs.find(call) == unit.printfs.end());
- unit.printfs.insert(std::pair<llvm::CallInst*, PrintfSet::PrintfFmt*>(call, printf_fmt));
+ unit.printfs.insert(std::pair<void *, PrintfSet::PrintfFmt*>((void *)call, printf_fmt));
return true;
}
@@ -389,15 +393,9 @@ error:
{
bool hasPrintf = false;
switch (F.getCallingConv()) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
- case CallingConv::PTX_Device:
- return false;
- case CallingConv::PTX_Kernel:
-#else
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::SPIR_KERNEL:
-#endif
break;
default:
GBE_ASSERTM(false, "Unsupported calling convention");
@@ -521,7 +519,11 @@ error:
case Type::FloatTyID: {
/* llvm 3.6 will give a undef value for NAN. */
if (dyn_cast<llvm::UndefValue>(arg)) {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ APFloat nan = APFloat::getNaN(APFloat::IEEEsingle(), false);
+#else
APFloat nan = APFloat::getNaN(APFloat::IEEEsingle, false);
+#endif
new_arg = ConstantFP::get(module->getContext(), nan);
}
diff --git a/backend/src/llvm/llvm_profiling.cpp b/backend/src/llvm/llvm_profiling.cpp
index 96c95ee..2d2ee11 100644
--- a/backend/src/llvm/llvm_profiling.cpp
+++ b/backend/src/llvm/llvm_profiling.cpp
@@ -26,29 +26,15 @@
#include <stdlib.h>
#include "llvm/Config/llvm-config.h"
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/Function.h"
-#include "llvm/InstrTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#else
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
-#endif /* LLVM_VERSION_MINOR <= 2 */
#include "llvm/Pass.h"
-#if LLVM_VERSION_MINOR <= 1
-#include "llvm/Support/IRBuilder.h"
-#elif LLVM_VERSION_MINOR == 2
-#include "llvm/IRBuilder.h"
-#else
#include "llvm/IR/IRBuilder.h"
-#endif /* LLVM_VERSION_MINOR <= 1 */
-#if LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CFG.h"
#else
@@ -97,7 +83,11 @@ namespace gbe
{
}
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ virtual StringRef getPassName() const
+#else
virtual const char *getPassName() const
+#endif
{
return "Timestamp Parser";
}
@@ -111,15 +101,9 @@ namespace gbe
int pointNum = 0;
switch (F.getCallingConv()) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
- case CallingConv::PTX_Device:
- return false;
- case CallingConv::PTX_Kernel:
-#else
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::SPIR_KERNEL:
-#endif
break;
default:
GBE_ASSERTM(false, "Unsupported calling convention");
@@ -178,12 +162,19 @@ namespace gbe
/* Add the timestamp store function call. */
// __gen_ocl_store_timestamp(int nth, int type);
Value *Args[2] = {ConstantInt::get(intTy, pointNum++), ConstantInt::get(intTy, profilingType)};
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 50
builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
"__gen_ocl_calc_timestamp", Type::getVoidTy(module->getContext()),
IntegerType::getInt32Ty(module->getContext()),
+ IntegerType::getInt32Ty(module->getContext()))),
+ ArrayRef<Value*>(Args));
+#else
+ builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
+ "__gen_ocl_calc_timestamp", Type::getVoidTy(module->getContext()),
IntegerType::getInt32Ty(module->getContext()),
- NULL)),
+ IntegerType::getInt32Ty(module->getContext()), nullptr)),
ArrayRef<Value*>(Args));
+#endif
}
/* We insert one store_profiling at the end of the last block to hold the place. */
llvm::Function::iterator BE = F.end();
@@ -193,12 +184,19 @@ namespace gbe
builder->SetInsertPoint(&*retInst);
Value *Args2[2] = {profilingBuf, ConstantInt::get(intTy, profilingType)};
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 50
+ builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
+ "__gen_ocl_store_profiling", Type::getVoidTy(module->getContext()),
+ ptrTy,
+ IntegerType::getInt32Ty(module->getContext()))),
+ ArrayRef<Value*>(Args2));
+#else
builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
"__gen_ocl_store_profiling", Type::getVoidTy(module->getContext()),
ptrTy,
- IntegerType::getInt32Ty(module->getContext()),
- NULL)),
+ IntegerType::getInt32Ty(module->getContext()), nullptr)),
ArrayRef<Value*>(Args2));
+#endif
delete builder;
return changed;
diff --git a/backend/src/llvm/llvm_sampler_fix.cpp b/backend/src/llvm/llvm_sampler_fix.cpp
index de7ebdb..c9ec817 100644
--- a/backend/src/llvm/llvm_sampler_fix.cpp
+++ b/backend/src/llvm/llvm_sampler_fix.cpp
@@ -33,7 +33,7 @@ namespace gbe {
class SamplerFix : public FunctionPass {
public:
SamplerFix() : FunctionPass(ID) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
#else
initializeDominatorTreePass(*PassRegistry::getPassRegistry());
@@ -55,9 +55,17 @@ namespace gbe {
// ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST));
bool needFix = true;
Value *needFixVal;
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ CallInst *init = dyn_cast<CallInst>(I->getOperand(0));
+ if (init && init->getCalledValue()->getName().compare("__translate_sampler_initializer"))
+ {
+ const ConstantInt *ci = dyn_cast<ConstantInt>(init->getOperand(0));
+ uint32_t samplerInt = ci->getZExtValue();
+#else
if (dyn_cast<ConstantInt>(I->getOperand(0))) {
const ConstantInt *ci = dyn_cast<ConstantInt>(I->getOperand(0));
uint32_t samplerInt = ci->getZExtValue();
+#endif
needFix = ((samplerInt & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP &&
(samplerInt & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST);
needFixVal = ConstantInt::get(boolTy, needFix);
@@ -65,14 +73,28 @@ namespace gbe {
IRBuilder<> Builder(I->getParent());
Builder.SetInsertPoint(I);
+
Value *addressMask = ConstantInt::get(i32Ty, __CLK_ADDRESS_MASK);
- Value *addressMode = Builder.CreateAnd(I->getOperand(0), addressMask);
Value *clampInt = ConstantInt::get(i32Ty, CLK_ADDRESS_CLAMP);
- Value *isClampMode = Builder.CreateICmpEQ(addressMode, clampInt);
Value *filterMask = ConstantInt::get(i32Ty, __CLK_FILTER_MASK);
- Value *filterMode = Builder.CreateAnd(I->getOperand(0), filterMask);
Value *nearestInt = ConstantInt::get(i32Ty, CLK_FILTER_NEAREST);
+
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ Module *M = I->getParent()->getParent()->getParent();
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 50
+ Value* samplerCvt = M->getOrInsertFunction("__gen_ocl_sampler_to_int", i32Ty, I->getOperand(0)->getType());
+#else
+ Value* samplerCvt = M->getOrInsertFunction("__gen_ocl_sampler_to_int", i32Ty, I->getOperand(0)->getType(), nullptr);
+#endif
+ Value *samplerVal = Builder.CreateCall(samplerCvt, {I->getOperand(0)});
+#else
+ Value *samplerVal = I->getOperand(0);
+#endif
+ Value *addressMode = Builder.CreateAnd(samplerVal, addressMask);
+ Value *isClampMode = Builder.CreateICmpEQ(addressMode, clampInt);
+ Value *filterMode = Builder.CreateAnd(samplerVal, filterMask);
Value *isNearestMode = Builder.CreateICmpEQ(filterMode, nearestInt);
+
needFixVal = Builder.CreateAnd(isClampMode, isNearestMode);
}
@@ -83,16 +105,35 @@ namespace gbe {
// return ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0);
bool needFix = true;
Value *needFixVal;
+ #if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ CallInst *init = dyn_cast<CallInst>(I->getOperand(0));
+ if (init && init->getCalledValue()->getName().compare("__translate_sampler_initializer"))
+ {
+ const ConstantInt *ci = dyn_cast<ConstantInt>(init->getOperand(0));
+ uint32_t samplerInt = ci->getZExtValue();
+#else
if (dyn_cast<ConstantInt>(I->getOperand(0))) {
const ConstantInt *ci = dyn_cast<ConstantInt>(I->getOperand(0));
uint32_t samplerInt = ci->getZExtValue();
+#endif
needFix = samplerInt & CLK_NORMALIZED_COORDS_TRUE;
needFixVal = ConstantInt::get(boolTy, needFix);
} else {
IRBuilder<> Builder(I->getParent());
Builder.SetInsertPoint(I);
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ Module *M = I->getParent()->getParent()->getParent();
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 50
+ Value* samplerCvt = M->getOrInsertFunction("__gen_ocl_sampler_to_int", i32Ty, I->getOperand(0)->getType());
+#else
+ Value* samplerCvt = M->getOrInsertFunction("__gen_ocl_sampler_to_int", i32Ty, I->getOperand(0)->getType(), nullptr);
+#endif
+ Value *samplerVal = Builder.CreateCall(samplerCvt, {I->getOperand(0)});
+#else
+ Value *samplerVal = I->getOperand(0);
+#endif
Value *normalizeMask = ConstantInt::get(i32Ty, CLK_NORMALIZED_COORDS_TRUE);
- Value *normalizeMode = Builder.CreateAnd(I->getOperand(0), normalizeMask);
+ Value *normalizeMode = Builder.CreateAnd(samplerVal, normalizeMask);
needFixVal = Builder.CreateICmpEQ(normalizeMode, ConstantInt::get(i32Ty, 0));
}
I->replaceAllUsesWith(needFixVal);
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 8850abb..e9a2a66 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -96,7 +96,7 @@ namespace gbe {
Scalarize() : FunctionPass(ID)
{
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
#else
initializeDominatorTreePass(*PassRegistry::getPassRegistry());
@@ -873,15 +873,9 @@ namespace gbe {
bool Scalarize::runOnFunction(Function& F)
{
switch (F.getCallingConv()) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
- case CallingConv::PTX_Device:
- return false;
- case CallingConv::PTX_Kernel:
-#else
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::SPIR_KERNEL:
-#endif
break;
default:
GBE_ASSERTM(false, "Unsupported calling convention");
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index bef4df1..7f7deff 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -46,14 +46,7 @@ namespace gbe
BVAR(OCL_OUTPUT_CFG_GEN_IR, false);
using namespace llvm;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
- llvm::LLVMContext& GBEGetLLVMContext() {
- static llvm::LLVMContext GBEContext;
- return GBEContext;
- }
-#endif
-
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
#define TARGETLIBRARY TargetLibraryInfoImpl
#else
#define TARGETLIBRARY TargetLibraryInfo
@@ -61,32 +54,32 @@ namespace gbe
void runFuntionPass(Module &mod, TARGETLIBRARY *libraryInfo, const DataLayout &DL)
{
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
legacy::FunctionPassManager FPM(&mod);
#else
FunctionPassManager FPM(&mod);
#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
FPM.add(new DataLayoutPass());
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR == 35
FPM.add(new DataLayoutPass(DL));
#else
FPM.add(new DataLayout(DL));
#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
FPM.add(createVerifierPass(true));
#else
FPM.add(createVerifierPass());
#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
FPM.add(new TargetLibraryInfoWrapperPass(*libraryInfo));
#else
FPM.add(new TargetLibraryInfo(*libraryInfo));
#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
FPM.add(createTypeBasedAAWrapperPass());
FPM.add(createBasicAAWrapperPass());
#else
@@ -108,27 +101,27 @@ namespace gbe
void runModulePass(Module &mod, TARGETLIBRARY *libraryInfo, const DataLayout &DL, int optLevel, bool strictMath)
{
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
legacy::PassManager MPM;
#else
PassManager MPM;
#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
MPM.add(new DataLayoutPass());
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR == 35
MPM.add(new DataLayoutPass(DL));
#else
MPM.add(new DataLayout(DL));
#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
MPM.add(new TargetLibraryInfoWrapperPass(*libraryInfo));
#else
MPM.add(new TargetLibraryInfo(*libraryInfo));
#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
MPM.add(createTypeBasedAAWrapperPass());
MPM.add(createBasicAAWrapperPass());
#else
@@ -139,7 +132,7 @@ namespace gbe
MPM.add(createBarrierNodupPass(false)); // remove noduplicate fnAttr before inlining.
MPM.add(createFunctionInliningPass(20000));
MPM.add(createBarrierNodupPass(true)); // restore noduplicate fnAttr after inlining.
- MPM.add(createStripAttributesPass()); // Strip unsupported attributes and calling conventions.
+ MPM.add(createStripAttributesPass(false)); // Strip unsupported attributes and calling conventions.
MPM.add(createSamplerFixPass());
MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
@@ -149,9 +142,9 @@ namespace gbe
MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
MPM.add(createPruneEHPass()); // Remove dead EH info
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
MPM.add(createPostOrderFunctionAttrsLegacyPass());
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
MPM.add(createPostOrderFunctionAttrsPass()); // Set readonly/readnone attrs
#else
MPM.add(createFunctionAttrsPass()); // Set readonly/readnone attrs
@@ -159,7 +152,7 @@ namespace gbe
//MPM.add(createScalarReplAggregatesPass(64, true, -1, -1, 64))
if(optLevel > 0)
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
MPM.add(createSROAPass());
#else
MPM.add(createSROAPass(/*RequiresDomTree*/ false));
@@ -182,14 +175,14 @@ namespace gbe
MPM.add(createLoopDeletionPass()); // Delete dead loops
MPM.add(createLoopUnrollPass(640)); //1024, 32, 1024, 512)); //Unroll loops
if(optLevel > 0) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
MPM.add(createSROAPass());
#else
MPM.add(createSROAPass(/*RequiresDomTree*/ false));
#endif
MPM.add(createGVNPass()); // Remove redundancies
}
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
// FIXME Workaround: we find that CustomLoopUnroll may increase register pressure greatly,
// and it may even make som cl kernel cannot compile because of limited scratch memory for spill.
// As we observe this under strict math. So we disable CustomLoopUnroll if strict math is enabled.
@@ -199,7 +192,7 @@ namespace gbe
#endif
MPM.add(createLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll loops
if(optLevel > 0) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
MPM.add(createSROAPass());
#else
MPM.add(createSROAPass(/*RequiresDomTree*/ false));
@@ -230,7 +223,7 @@ namespace gbe
}
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
#define OUTPUT_BITCODE(STAGE, MOD) do { \
legacy::PassManager passes__; \
if (OCL_OUTPUT_LLVM_##STAGE) { \
@@ -238,7 +231,7 @@ namespace gbe
passes__.run(MOD); \
} \
}while(0)
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
#define OUTPUT_BITCODE(STAGE, MOD) do { \
PassManager passes__; \
if (OCL_OUTPUT_LLVM_##STAGE) { \
@@ -288,7 +281,7 @@ namespace gbe
dc->process(diagnostic);
}
- bool llvmToGen(ir::Unit &unit, const char *fileName,const void* module,
+ bool llvmToGen(ir::Unit &unit, const void* module,
int optLevel, bool strictMath, int profiling, std::string &errors)
{
std::string errInfo;
@@ -296,29 +289,15 @@ namespace gbe
if (OCL_OUTPUT_LLVM_BEFORE_LINK || OCL_OUTPUT_LLVM_AFTER_LINK || OCL_OUTPUT_LLVM_AFTER_GEN)
o = std::unique_ptr<llvm::raw_fd_ostream>(new llvm::raw_fd_ostream(fileno(stdout), false));
- // Get the module from its file
- llvm::SMDiagnostic Err;
-
Module* cl_mod = NULL;
if (module) {
cl_mod = reinterpret_cast<Module*>(const_cast<void*>(module));
- } else if (fileName){
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
- llvm::LLVMContext& c = GBEGetLLVMContext();
-#else
- llvm::LLVMContext& c = llvm::getGlobalContext();
-#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
- cl_mod = parseIRFile(fileName, Err, c).release();
-#else
- cl_mod = ParseIRFile(fileName, Err, c);
-#endif
}
if (!cl_mod) return false;
OUTPUT_BITCODE(BEFORE_LINK, (*cl_mod));
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
legacy::PassManager passes__;
#else
PassManager passes__;
@@ -335,8 +314,7 @@ namespace gbe
/* Before do any thing, we first filter in all CL functions in bitcode. */
/* Also set unit's pointer size in runBitCodeLinker */
M.reset(runBitCodeLinker(cl_mod, strictMath, unit));
- if (!module)
- delete cl_mod;
+
if (M.get() == 0)
return true;
@@ -346,7 +324,7 @@ namespace gbe
gbeDiagnosticContext dc;
mod.getContext().setDiagnosticHandler(&gbeDiagnosticHandler,&dc);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
mod.setDataLayout(DL);
#endif
Triple TargetTriple(mod.getTargetTriple());
@@ -357,24 +335,24 @@ namespace gbe
runFuntionPass(mod, libraryInfo, DL);
runModulePass(mod, libraryInfo, DL, optLevel, strictMath);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
legacy::PassManager passes;
#else
PassManager passes;
#endif
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
passes.add(new DataLayoutPass());
-#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
+#elif LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR == 35
passes.add(new DataLayoutPass(DL));
#else
passes.add(new DataLayout(DL));
#endif
// Print the code before further optimizations
passes.add(createIntrinsicLoweringPass());
- passes.add(createStripAttributesPass()); // Strip unsupported attributes and calling conventions.
+ passes.add(createStripAttributesPass(true)); // Strip unsupported attributes and calling conventions.
passes.add(createFunctionInliningPass(20000));
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
passes.add(createSROAPass());
#else
passes.add(createScalarReplAggregatesPass(64, true, -1, -1, 64));
@@ -402,9 +380,17 @@ namespace gbe
passes.add(createScalarizePass()); // Expand all vector ops
if(OCL_OUTPUT_CFG)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ passes.add(createCFGPrinterLegacyPassPass());
+#else
passes.add(createCFGPrinterPass());
+#endif
if(OCL_OUTPUT_CFG_ONLY)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ passes.add(createCFGOnlyPrinterLegacyPassPass());
+#else
passes.add(createCFGOnlyPrinterPass());
+#endif
passes.add(createGenPass(unit));
passes.run(mod);
errors = dc.str();
diff --git a/backend/src/llvm/llvm_to_gen.hpp b/backend/src/llvm/llvm_to_gen.hpp
index d3928c6..d2247bb 100644
--- a/backend/src/llvm/llvm_to_gen.hpp
+++ b/backend/src/llvm/llvm_to_gen.hpp
@@ -23,7 +23,7 @@
*/
#ifndef __GBE_IR_LLVM_TO_GEN_HPP__
#define __GBE_IR_LLVM_TO_GEN_HPP__
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 39
#include "llvm/IR/LLVMContext.h"
#endif
@@ -35,12 +35,8 @@ namespace gbe {
/*! Convert the LLVM IR code to a GEN IR code,
optLevel 0 equal to clang -O1 and 1 equal to clang -O2*/
- bool llvmToGen(ir::Unit &unit, const char *fileName, const void* module,
+ bool llvmToGen(ir::Unit &unit, const void* module,
int optLevel, bool strictMath, int profiling, std::string &errors);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
- extern llvm::LLVMContext& GBEGetLLVMContext();
-#endif
-
} /* namespace gbe */
#endif /* __GBE_IR_LLVM_TO_GEN_HPP__ */
diff --git a/backend/src/llvm/llvm_unroll.cpp b/backend/src/llvm/llvm_unroll.cpp
index e24dc4f..107d793 100644
--- a/backend/src/llvm/llvm_unroll.cpp
+++ b/backend/src/llvm/llvm_unroll.cpp
@@ -16,7 +16,7 @@
*/
#include "llvm/Config/llvm-config.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
#include <set>
#include "llvm_includes.hpp"
@@ -36,7 +36,7 @@ namespace gbe {
LoopPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const {
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 7)
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
#else
@@ -47,7 +47,7 @@ namespace gbe {
AU.addPreservedID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addPreservedID(LCSSAID);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
#else
@@ -91,7 +91,7 @@ namespace gbe {
assert(MD->getNumOperands() == 2 &&
"Unroll count hint metadata should have two operands.");
unsigned Count;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
Count = mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
#else
Count = cast<ConstantInt>(MD->getOperand(1))->getZExtValue();
@@ -105,7 +105,7 @@ namespace gbe {
void setUnrollID(Loop *L, bool enable) {
assert(enable);
LLVMContext &Context = L->getHeader()->getContext();
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 36
SmallVector<Metadata *, 2> forceUnroll;
forceUnroll.push_back(MDString::get(Context, "llvm.loop.unroll.enable"));
MDNode *forceUnrollNode = MDNode::get(Context, forceUnroll);
@@ -169,7 +169,7 @@ namespace gbe {
// be unrolled.
bool handleParentLoops(Loop *L, LPPassManager &LPM) {
Loop *currL = L;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
#else
@@ -205,7 +205,7 @@ namespace gbe {
if (parentTripCount != 0 && currTripCount * parentTripCount > 32) {
//Don't change the unrollID if doesn't force unroll.
//setUnrollID(parentL, false);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
loopInfo.markAsRemoved(parentL);
#else
LPM.deleteLoopFromQueue(parentL);
@@ -238,7 +238,12 @@ namespace gbe {
return true;
}
- virtual const char *getPassName() const {
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
+ virtual StringRef getPassName() const
+#else
+ virtual const char *getPassName() const
+#endif
+ {
return "SPIR backend: custom loop unrolling pass";
}
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index b1019da..75039d4 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -283,7 +283,7 @@ Documents for OpenCL application developers
- [[OpenGL Buffer Sharing|Beignet/howto/gl-buffer-sharing-howto]]
- [[Video Motion Estimation|Beignet/howto/video-motion-estimation-howto]]
- [[Stand Alone Unit Test|Beignet/howto/stand-alone-utest-howto]]
-- [[Android build|Beignet/android-build-howto]]
+- [[Android build|Beignet/howto/android-build-howto]]
The wiki URL is as below:
[http://www.freedesktop.org/wiki/Software/Beignet/](http://www.freedesktop.org/wiki/Software/Beignet/)
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
index 601a07f..5ec3a45 100644
--- a/docs/NEWS.mdwn
+++ b/docs/NEWS.mdwn
@@ -1,5 +1,8 @@
# News
+## Oct 26, 2017
+[Beignet 1.3.2](https://01.org/beignet/downloads/beignet-1.3.2-2017-10-26) is released. This is a bug-fix release.
+
## Mar 13, 2017
[Beignet 1.3.1](https://01.org/beignet/downloads/beignet-1.3.1-2017-03-13) is released. This is a bug-fix release.
diff --git a/docs/howto/gl-buffer-sharing-howto.mdwn b/docs/howto/gl-buffer-sharing-howto.mdwn
index 6b3a751..5fb3d87 100644
--- a/docs/howto/gl-buffer-sharing-howto.mdwn
+++ b/docs/howto/gl-buffer-sharing-howto.mdwn
@@ -1,16 +1,16 @@
GL Buffer Sharing HowTo
=========================
-Beignet now support cl_khr_gl_sharing partially(the most commonly used part), which is an offcial
+Beignet now supports cl_khr_gl_sharing partially (the most commonly used part), which is an official
extension of Khronos OpenCL. With this extension, Beignet can create memory object from OpenGL/OpenGL
-ES buffer, texture or renderbuffer object with zero-copy. Currently, we just support create memory
-object from GL buffer object or 2d texture(the most common target type). We will support creating
+ES buffer, texture or renderbuffer object with zero-copy. Currently, we just support creating memory
+object from GL buffer object or 2d texture (the most common target type). We will support creating
from other GL target type if necessary.
Prerequisite
------------
-Mesa GL library and Mesa EGL libray are required. Both version should be greater or equal than
+Mesa GL library and Mesa EGL library are required. Both version should be greater or equal than
13.0.0.
Steps
@@ -18,7 +18,7 @@ Steps
A typical procedure of using cl_khr_gl_sharing is as below:
-- Basic egl routine(eglGetDisplay, eglInitialize, eglCreateContext...).
+- Basic egl routine (eglGetDisplay, eglInitialize, eglCreateContext...).
- Create GL 2d texture in normal OpenGL way.
@@ -44,7 +44,7 @@ A typical procedure of using cl_khr_gl_sharing is as below:
- Access this cl image object as an usual cl image object.
-- Relase cl image object by calling clEnqueueReleaseGLObjects.
+- Release cl image object by calling clEnqueueReleaseGLObjects.
- Ensure any pending OpenCL operations which access this cl image object have completed by clFinish.
@@ -54,7 +54,7 @@ Sample code
-----------
We have developed an example showing how to utilize cl_khr_gl_sharing in examples/gl_buffer_sharing
-directory. A cl image object is created from a gl 2d texutre and processed by OpenCL kernel, then
+directory. A cl image object is created from a gl 2d texture and processed by OpenCL kernel, then
is shown on screen.
Steps to build and run this example:
diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h
index 47bae46..3cb8515 100644
--- a/include/CL/cl_intel.h
+++ b/include/CL/cl_intel.h
@@ -197,6 +197,12 @@ typedef CL_API_ENTRY cl_int
void* /*param_value*/,
size_t* /*param_value_size_ret*/ );
#endif
+
+/* cl_intel_required_subgroup_size extension*/
+#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
+#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109
+#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A
+
#ifdef __cplusplus
}
#endif
diff --git a/kernels/compiler_if_else.cl b/kernels/compiler_if_else.cl
index 7ae8f99..c8b2cb8 100644
--- a/kernels/compiler_if_else.cl
+++ b/kernels/compiler_if_else.cl
@@ -5,10 +5,8 @@ compiler_if_else(__global int *src, __global int *dst)
dst[id] = src[id];
if (dst[id] >= 0) {
dst[id] = src[id+1];
- src[id] = 1;
} else {
dst[id]--;
- src[id] = 2;
}
}
diff --git a/kernels/compiler_remove_negative_add.cl b/kernels/compiler_remove_negative_add.cl
new file mode 100644
index 0000000..d6f7270
--- /dev/null
+++ b/kernels/compiler_remove_negative_add.cl
@@ -0,0 +1,4 @@
+kernel void compiler_remove_negative_add(global float *src, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = exp2(-src[i]);
+};
diff --git a/kernels/compiler_reqd_sub_group_size.cl b/kernels/compiler_reqd_sub_group_size.cl
new file mode 100644
index 0000000..0ce70e9
--- /dev/null
+++ b/kernels/compiler_reqd_sub_group_size.cl
@@ -0,0 +1,5 @@
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
+__kernel void compiler_reqd_sub_group_size(global int* src)
+{
+
+}
diff --git a/kernels/compiler_sqrt_div.cl b/kernels/compiler_sqrt_div.cl
new file mode 100644
index 0000000..7d5a2f0
--- /dev/null
+++ b/kernels/compiler_sqrt_div.cl
@@ -0,0 +1,8 @@
+kernel void compiler_sqrt_div(global float *src, global float *dst) {
+ int i = get_global_id(0);
+ float tmp = sqrt(src[i]);
+ dst[i*4] = 1.0f/tmp;
+ dst[i*4+1] = (float)i/tmp;
+ dst[i*4+2] = 2.0f/tmp;
+ dst[i*4+3] = 1.0f/tmp + tmp;
+};
diff --git a/kernels/test_fill_gl_image.cl b/kernels/test_fill_gl_image.cl
index 4250a57..7b5dce7 100644
--- a/kernels/test_fill_gl_image.cl
+++ b/kernels/test_fill_gl_image.cl
@@ -1,5 +1,5 @@
__kernel void
-test_fill_gl_image(image2d_t img, int color)
+test_fill_gl_image(write_only image2d_t img, int color)
{
int2 coord;
float4 color_v4;
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f3c4632..0cd41fc 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,13 +18,13 @@ foreach (KF ${KERNEL_FILES})
OUTPUT ${output_file}
COMMAND rm -rf ${output_file}
COMMAND ${GBE_BIN_GENERATER} -s -o${output_file} -t${GEN_PCI_ID} ${input_file}
- DEPENDS ${input_file} ${GBE_BIN_FILE})
+ DEPENDS ${input_file} ${GBE_BIN_FILE} beignet_bitcode)
else(GEN_PCI_ID)
add_custom_command(
OUTPUT ${output_file}
COMMAND rm -rf ${output_file}
COMMAND ${GBE_BIN_GENERATER} -s -o${output_file} ${input_file}
- DEPENDS ${input_file} ${GBE_BIN_FILE})
+ DEPENDS ${input_file} ${GBE_BIN_FILE} beignet_bitcode)
endif(GEN_PCI_ID)
endforeach (KF)
endmacro (MakeKernelBinStr)
diff --git a/src/cl_context.c b/src/cl_context.c
index 1ba2302..fef36f8 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -358,29 +358,37 @@ cl_context_delete(cl_context ctx)
if (UNLIKELY(ctx == NULL))
return;
+ int internal_ctx_refs = 1;
+ // determine how many ctx refs are held by internal_prgs and built_in_prgs
+ for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
+ if (ctx->internal_kernels[i] && ctx->internal_prgs[i])
+ ++internal_ctx_refs;
+ }
+
/* We are not done yet */
- if (CL_OBJECT_DEC_REF(ctx) > 1)
+ if (CL_OBJECT_DEC_REF(ctx) > internal_ctx_refs)
return;
+ // create a temporary extra ref here so cl_program_delete doesn't
+ // attempt a recursive full cl_context_delete when cleaning up
+ // our internal programs
+ CL_OBJECT_INC_REF(ctx);
+
/* delete the internal programs. */
for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
if (ctx->internal_kernels[i]) {
- cl_kernel_delete(ctx->internal_kernels[i]);
+ cl_kernel k = ctx->internal_kernels[i];
ctx->internal_kernels[i] = NULL;
+ cl_kernel_delete(k);
assert(ctx->internal_prgs[i]);
- cl_program_delete(ctx->internal_prgs[i]);
+ cl_program p = ctx->internal_prgs[i];
ctx->internal_prgs[i] = NULL;
- }
-
- if (ctx->built_in_kernels[i]) {
- cl_kernel_delete(ctx->built_in_kernels[i]);
- ctx->built_in_kernels[i] = NULL;
+ cl_program_delete(p);
}
}
- cl_program_delete(ctx->built_in_prgs);
- ctx->built_in_prgs = NULL;
+ CL_OBJECT_DEC_REF(ctx);
cl_free(ctx->prop_user);
cl_free(ctx->devices);
@@ -427,32 +435,18 @@ cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
ctx->internal_prgs[index]->is_built = 1;
- /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, different kernel. */
- if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
- int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
- for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
- if (index != i) {
- assert(ctx->internal_prgs[i] == NULL);
- assert(ctx->internal_kernels[i] == NULL);
- cl_program_add_ref(ctx->internal_prgs[index]);
- ctx->internal_prgs[i] = ctx->internal_prgs[index];
- }
-
- if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
- ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ if (index == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+ ctx->internal_kernels[index] = cl_program_create_kernel(ctx->internal_prgs[index],
"__cl_fill_region_align8_2", NULL);
- } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
- ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ } else if (index == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+ ctx->internal_kernels[index] = cl_program_create_kernel(ctx->internal_prgs[index],
"__cl_fill_region_align8_4", NULL);
- } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
- ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ } else if (index == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+ ctx->internal_kernels[index] = cl_program_create_kernel(ctx->internal_prgs[index],
"__cl_fill_region_align8_8", NULL);
- } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
- ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ } else if (index == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+ ctx->internal_kernels[index] = cl_program_create_kernel(ctx->internal_prgs[index],
"__cl_fill_region_align8_16", NULL);
- } else
- assert(0);
- }
} else {
ctx->internal_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
}
diff --git a/src/cl_context.h b/src/cl_context.h
index 4812afd..a46f2f5 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -120,8 +120,6 @@ struct _cl_context {
/* All programs internal used, for example clEnqueuexxx api use */
cl_kernel internal_kernels[CL_INTERNAL_KERNEL_MAX];
/* All kernels for clenqueuexxx api, for example clEnqueuexxx api use */
- cl_program built_in_prgs; /*all built-in kernels belongs to this program only*/
- cl_kernel built_in_kernels[CL_INTERNAL_KERNEL_MAX];
uint32_t ver; /* Gen version */
struct _cl_context_prop props;
cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index f3c5204..123b619 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -247,7 +247,9 @@
/* SKL */
#define PCI_CHIP_SKYLAKE_ULT_GT1 0x1906 /* Intel(R) Skylake ULT - GT1 */
#define PCI_CHIP_SKYLAKE_ULT_GT2 0x1916 /* Intel(R) Skylake ULT - GT2 */
-#define PCI_CHIP_SKYLAKE_ULT_GT3 0x1926 /* Intel(R) Skylake ULT - GT3 */
+#define PCI_CHIP_SKYLAKE_ULT_GT3 0x1923 /* Intel(R) Skylake ULT - GT3 */
+#define PCI_CHIP_SKYLAKE_ULT_GT3E1 0x1926 /* Intel(R) Skylake ULT - GT3E */
+#define PCI_CHIP_SKYLAKE_ULT_GT3E2 0x1927 /* Intel(R) Skylake ULT - GT3E */
#define PCI_CHIP_SKYLAKE_ULT_GT2F 0x1921 /* Intel(R) Skylake ULT - GT2F */
#define PCI_CHIP_SKYLAKE_ULX_GT1 0x190E /* Intel(R) Skylake ULX - GT1 */
#define PCI_CHIP_SKYLAKE_ULX_GT2 0x191E /* Intel(R) Skylake ULX - GT2 */
@@ -284,6 +286,8 @@
#define IS_SKL_GT3(devid) \
(devid == PCI_CHIP_SKYLAKE_ULT_GT3 || \
+ devid == PCI_CHIP_SKYLAKE_ULT_GT3E1 || \
+ devid == PCI_CHIP_SKYLAKE_ULT_GT3E2 || \
devid == PCI_CHIP_SKYLAKE_HALO_GT3 || \
devid == PCI_CHIP_SKYLAKE_SRV_GT3 || \
devid == PCI_CHIP_SKYLAKE_MEDIA_SRV_GT3)
@@ -361,7 +365,14 @@
#define IS_KABYLAKE(devid) (IS_KBL_GT1(devid) || IS_KBL_GT15(devid) || IS_KBL_GT2(devid) || IS_KBL_GT3(devid) || IS_KBL_GT4(devid))
-#define IS_GEN9(devid) (IS_SKYLAKE(devid) || IS_BROXTON(devid) || IS_KABYLAKE(devid))
+#define PCI_CHIP_GLK_3x6 0x3184
+#define PCI_CHIP_GLK_2x6 0x3185
+
+#define IS_GEMINILAKE(devid) \
+ (devid == PCI_CHIP_GLK_3x6 || \
+ devid == PCI_CHIP_GLK_2x6)
+
+#define IS_GEN9(devid) (IS_SKYLAKE(devid) || IS_BROXTON(devid) || IS_KABYLAKE(devid) || IS_GEMINILAKE(devid))
#define MAX_OCLVERSION(devid) (IS_GEN9(devid) ? 200 : 120)
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index d4f4208..76e77d7 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -254,6 +254,26 @@ static struct _cl_device_id intel_kbl_gt4_device = {
#include "cl_gen9_device.h"
};
+static struct _cl_device_id intel_glk18eu_device = {
+ .max_compute_unit = 18,
+ .max_thread_per_unit = 6,
+ .sub_slice_count = 3,
+ .max_work_item_sizes = {512, 512, 512},
+ .max_work_group_size = 512,
+ .max_clock_frequency = 1000,
+#include "cl_gen9_device.h"
+};
+
+static struct _cl_device_id intel_glk12eu_device = {
+ .max_compute_unit = 12,
+ .max_thread_per_unit = 6,
+ .sub_slice_count = 2,
+ .max_work_item_sizes = {512, 512, 512},
+ .max_work_group_size = 512,
+ .max_clock_frequency = 1000,
+#include "cl_gen9_device.h"
+};
+
LOCAL cl_device_id
cl_get_gt_device(cl_device_type device_type)
{
@@ -585,6 +605,10 @@ skl_gt2_break:
case PCI_CHIP_SKYLAKE_ULT_GT3:
DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake ULT GT3");
+ case PCI_CHIP_SKYLAKE_ULT_GT3E1:
+ DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake ULT GT3E");
+ case PCI_CHIP_SKYLAKE_ULT_GT3E2:
+ DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake ULT GT3E");
case PCI_CHIP_SKYLAKE_HALO_GT3:
DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake Halo GT3");
case PCI_CHIP_SKYLAKE_SRV_GT3:
@@ -737,6 +761,26 @@ kbl_gt4_break:
cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
break;
+ case PCI_CHIP_GLK_3x6:
+ DECL_INFO_STRING(glk18eu_break, intel_bxt18eu_device, name, "Intel(R) HD Graphics Geminilake(3x6)");
+glk18eu_break:
+ intel_glk18eu_device.device_id = device_id;
+ intel_glk18eu_device.platform = cl_get_platform_default();
+ ret = &intel_glk18eu_device;
+ cl_intel_platform_get_default_extension(ret);
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+ break;
+
+ case PCI_CHIP_GLK_2x6:
+ DECL_INFO_STRING(glk12eu_break, intel_bxt12eu_device, name, "Intel(R) HD Graphics Geminilake(2x6)");
+glk12eu_break:
+ intel_glk12eu_device.device_id = device_id;
+ intel_glk12eu_device.platform = cl_get_platform_default();
+ ret = &intel_glk12eu_device;
+ cl_intel_platform_get_default_extension(ret);
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+ break;
+
case PCI_CHIP_SANDYBRIDGE_BRIDGE:
case PCI_CHIP_SANDYBRIDGE_GT1:
case PCI_CHIP_SANDYBRIDGE_GT2:
@@ -942,7 +986,9 @@ LOCAL cl_bool is_gen_device(cl_device_id device) {
device == &intel_kbl_gt15_device ||
device == &intel_kbl_gt2_device ||
device == &intel_kbl_gt3_device ||
- device == &intel_kbl_gt4_device;
+ device == &intel_kbl_gt4_device ||
+ device == &intel_glk18eu_device ||
+ device == &intel_glk12eu_device;
}
LOCAL cl_int
@@ -1333,6 +1379,10 @@ cl_get_device_info(cl_device_id device,
src_ptr = device->driver_version;
src_size = device->driver_version_sz;
break;
+ case CL_DEVICE_SUB_GROUP_SIZES_INTEL:
+ src_ptr = device->sub_group_sizes;
+ src_size = device->sub_group_sizes_sz;
+ break;
default:
return CL_INVALID_VALUE;
@@ -1363,7 +1413,8 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
|| device == &intel_skl_gt3_device || device == &intel_skl_gt4_device
|| device == &intel_bxt18eu_device || device == &intel_bxt12eu_device || device == &intel_kbl_gt1_device
|| device == &intel_kbl_gt2_device || device == &intel_kbl_gt3_device
- || device == &intel_kbl_gt4_device || device == &intel_kbl_gt15_device) {
+ || device == &intel_kbl_gt4_device || device == &intel_kbl_gt15_device
+ || device == &intel_glk18eu_device || device == &intel_glk12eu_device) {
*ver = 9;
} else
return CL_INVALID_VALUE;
@@ -1475,6 +1526,7 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
DECL_FIELD(COMPILE_WORK_GROUP_SIZE, kernel->compile_wg_sz)
DECL_FIELD(PRIVATE_MEM_SIZE, kernel->stack_size)
case CL_KERNEL_GLOBAL_WORK_SIZE:
+ {
dimension = cl_check_builtin_kernel_dimension(kernel, device);
if ( !dimension ) return CL_INVALID_VALUE;
if (param_value_size_ret != NULL)
@@ -1492,6 +1544,18 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
return CL_SUCCESS;
}
return CL_SUCCESS;
+ }
+ case CL_KERNEL_SPILL_MEM_SIZE_INTEL:
+ {
+ if (param_value && param_value_size < sizeof(cl_ulong))
+ return CL_INVALID_VALUE;
+ if (param_value_size_ret != NULL)
+ *param_value_size_ret = sizeof(cl_ulong);
+ if (param_value)
+ *(cl_ulong*)param_value = (cl_ulong)interp_kernel_get_scratch_size(kernel->opaque);
+ return CL_SUCCESS;
+ }
+
default:
return CL_INVALID_VALUE;
};
@@ -1575,6 +1639,16 @@ cl_get_kernel_subgroup_info(cl_kernel kernel,
}
break;
}
+ case CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL:
+ {
+ if (param_value && param_value_size < sizeof(size_t))
+ return CL_INVALID_VALUE;
+ if (param_value_size_ret != NULL)
+ *param_value_size_ret = sizeof(size_t);
+ if (param_value)
+ *(size_t*)param_value = interp_kernel_get_simd_width(kernel->opaque);
+ return CL_SUCCESS;
+ }
default:
return CL_INVALID_VALUE;
};
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index 6b8f2eb..93bd2f1 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -136,6 +136,8 @@ struct _cl_device_id {
uint32_t atomic_test_result;
cl_uint image_pitch_alignment;
cl_uint image_base_address_alignment;
+ size_t sub_group_sizes[2];
+ size_t sub_group_sizes_sz;
//inited as NULL, created only when cmrt kernel is used
void* cmrt_device; //realtype: CmDevice*
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index a3c71ca..56099ad 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -42,7 +42,7 @@ void check_opt1_extension(cl_extensions_t *extensions)
{
if (id == EXT_ID(khr_icd))
extensions->extensions[id].base.ext_enabled = 1;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 35
if (id == EXT_ID(khr_spir))
extensions->extensions[id].base.ext_enabled = 1;
#endif
@@ -69,8 +69,16 @@ check_intel_extension(cl_extensions_t *extensions)
{
int id;
for(id = INTEL_EXT_START_ID; id <= INTEL_EXT_END_ID; id++)
+ {
if(id != EXT_ID(intel_motion_estimation))
extensions->extensions[id].base.ext_enabled = 1;
+ if(id == EXT_ID(intel_required_subgroup_size))
+#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR > 40
+ extensions->extensions[id].base.ext_enabled = 1;
+#else
+ extensions->extensions[id].base.ext_enabled = 0;
+#endif
+ }
}
void
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index 52a4953..e6c64ba 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -29,7 +29,8 @@
DECL_EXT(intel_accelerator) \
DECL_EXT(intel_motion_estimation) \
DECL_EXT(intel_subgroups) \
- DECL_EXT(intel_subgroups_short)
+ DECL_EXT(intel_subgroups_short) \
+ DECL_EXT(intel_required_subgroup_size)
#define DECL_GL_EXTENSIONS \
DECL_EXT(khr_gl_sharing)\
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
index f190b0d..0379b3e 100644
--- a/src/cl_gbe_loader.cpp
+++ b/src/cl_gbe_loader.cpp
@@ -24,6 +24,7 @@
//function pointer from libgbe.so
gbe_program_new_from_source_cb *compiler_program_new_from_source = NULL;
+gbe_program_new_from_llvm_file_cb *compiler_program_new_from_llvm_file = NULL;
gbe_program_compile_from_source_cb *compiler_program_compile_from_source = NULL;
gbe_program_new_gen_program_cb *compiler_program_new_gen_program = NULL;
gbe_program_link_program_cb *compiler_program_link_program = NULL;
@@ -298,6 +299,10 @@ struct GbeLoaderInitializer
if (compiler_program_new_from_source == NULL)
return;
+ compiler_program_new_from_llvm_file = *(gbe_program_new_from_llvm_file_cb **)dlsym(dlhCompiler, "gbe_program_new_from_llvm_file");
+ if (compiler_program_new_from_llvm_file == NULL)
+ return;
+
compiler_program_compile_from_source = *(gbe_program_compile_from_source_cb **)dlsym(dlhCompiler, "gbe_program_compile_from_source");
if (compiler_program_compile_from_source == NULL)
return;
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
index df885d2..df85f1e 100644
--- a/src/cl_gbe_loader.h
+++ b/src/cl_gbe_loader.h
@@ -25,6 +25,7 @@
extern "C" {
#endif
extern gbe_program_new_from_source_cb *compiler_program_new_from_source;
+extern gbe_program_new_from_llvm_file_cb *compiler_program_new_from_llvm_file;
extern gbe_program_compile_from_source_cb *compiler_program_compile_from_source;
extern gbe_program_new_gen_program_cb *compiler_program_new_gen_program;
extern gbe_program_link_program_cb *compiler_program_link_program;
diff --git a/src/cl_gl_api.c b/src/cl_gl_api.c
index 897edb4..61b3ab8 100644
--- a/src/cl_gl_api.c
+++ b/src/cl_gl_api.c
@@ -35,6 +35,8 @@
#include "cl_sampler.h"
#include "cl_alloc.h"
#include "cl_utils.h"
+#include "cl_enqueue.h"
+#include "cl_event.h"
#include "CL/cl.h"
#include "CL/cl_gl.h"
@@ -134,6 +136,80 @@ cl_int clEnqueueAcquireGLObjects (cl_command_queue command_queue,
cl_event *event)
{
cl_int err = CL_SUCCESS;
+ cl_int e_status, i;
+ cl_event e = NULL;
+ enqueue_data *data = NULL;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (UNLIKELY(command_queue->ctx->props.gl_type == CL_GL_NOSHARE)) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if ((num_objects == 0 && mem_objects != NULL) ||
+ (num_objects > 0 && mem_objects == NULL)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ for (i = 0; i < num_objects; i++) {
+ if (!cl_mem_image(mem_objects[i])) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+ if (!IS_GL_IMAGE(mem_objects[i])) {
+ err = CL_INVALID_GL_OBJECT;
+ break;
+ }
+ }
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_ACQUIRE_GL_OBJECTS, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e_status = cl_event_is_ready(e);
+
+ data = &e->exec_data;
+ data->type = EnqueueReturnSuccesss;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_SUBMITTED, CL_TRUE); // Submit to get the address.
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
return err;
}
@@ -146,5 +222,79 @@ cl_int clEnqueueReleaseGLObjects (cl_command_queue command_queue,
cl_event *event)
{
cl_int err = CL_SUCCESS;
+ cl_int e_status, i;
+ cl_event e = NULL;
+ enqueue_data *data = NULL;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (UNLIKELY(command_queue->ctx->props.gl_type == CL_GL_NOSHARE)) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if ((num_objects == 0 && mem_objects != NULL) ||
+ (num_objects > 0 && mem_objects == NULL)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ for (i = 0; i < num_objects; i++) {
+ if (!cl_mem_image(mem_objects[i])) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+ if (!IS_GL_IMAGE(mem_objects[i])) {
+ err = CL_INVALID_GL_OBJECT;
+ break;
+ }
+ }
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_ACQUIRE_GL_OBJECTS, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e_status = cl_event_is_ready(e);
+
+ data = &e->exec_data;
+ data->type = EnqueueReturnSuccesss;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_SUBMITTED, CL_TRUE); // Submit to get the address.
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
return err;
}
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index ca4f3c5..f6cb5f8 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -153,4 +153,6 @@ DECL_INFO_STRING(spir_versions, "1.2")
.partition_type = {0},
.image_pitch_alignment = 1,
.image_base_address_alignment = 4096,
+.sub_group_sizes = {8, 16},
+.sub_group_sizes_sz = sizeof(size_t) * 2,
.cmrt_device = NULL
diff --git a/src/cl_program.c b/src/cl_program.c
index 363aed5..3cb5906 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -371,6 +371,7 @@ cl_program_create_with_built_in_kernles(cl_context ctx,
cl_int * errcode_ret)
{
cl_int err = CL_SUCCESS;
+ cl_program built_in_prgs = NULL;
assert(ctx);
INVALID_DEVICE_IF (num_devices != 1);
@@ -382,54 +383,24 @@ cl_program_create_with_built_in_kernles(cl_context ctx,
extern size_t cl_internal_built_in_kernel_str_size;
char* p_built_in_kernel_str =cl_internal_built_in_kernel_str;
- ctx->built_in_prgs = cl_program_create_from_binary(ctx, 1,
- &ctx->devices[0],
- (size_t*)&cl_internal_built_in_kernel_str_size,
- (const unsigned char **)&p_built_in_kernel_str,
- &binary_status, &err);
- if (!ctx->built_in_prgs)
+ built_in_prgs = cl_program_create_from_binary(ctx, 1,
+ &ctx->devices[0],
+ (size_t*)&cl_internal_built_in_kernel_str_size,
+ (const unsigned char **)&p_built_in_kernel_str,
+ &binary_status, &err);
+ if (!built_in_prgs)
return NULL;
- err = cl_program_build(ctx->built_in_prgs, NULL);
+ err = cl_program_build(built_in_prgs, NULL);
if (err != CL_SUCCESS)
return NULL;
- ctx->built_in_prgs->is_built = 1;
-
- char delims[] = ";";
- char* saveptr = NULL;
- char* local_kernel_names;
- char* kernel = NULL;
- char* matched_kernel;
- int i = 0;
-
- //copy the content to local_kernel_names to protect the kernel_names.
- TRY_ALLOC(local_kernel_names, cl_calloc(strlen(kernel_names)+1, sizeof(char) ) );
- memcpy(local_kernel_names, kernel_names, strlen(kernel_names)+1);
-
- kernel = strtok_r( local_kernel_names, delims , &saveptr);
- while( kernel != NULL ) {
- matched_kernel = strstr(ctx->devices[0]->built_in_kernels, kernel);
- if(matched_kernel){
- for (i = 0; i < ctx->built_in_prgs->ker_n; ++i) {
- assert(ctx->built_in_prgs->ker[i]);
- const char *ker_name = cl_kernel_get_name(ctx->built_in_prgs->ker[i]);
- if (ker_name != NULL && strcmp(ker_name, kernel) == 0) {
- break;
- }
- }
-
- ctx->built_in_kernels[i] = cl_program_create_kernel(ctx->built_in_prgs, kernel, NULL);
- }
- kernel = strtok_r((char*)saveptr , delims, &saveptr );
- }
-
- cl_free(local_kernel_names);
+ built_in_prgs->is_built = 1;
exit:
if (errcode_ret)
*errcode_ret = err;
- return ctx->built_in_prgs;
+ return built_in_prgs;
error:
goto exit;
@@ -458,7 +429,7 @@ cl_program_create_from_llvm(cl_context ctx,
goto error;
}
- program->opaque = compiler_program_new_from_llvm(ctx->devices[0]->device_id, file_name, NULL, NULL, NULL, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1, NULL);
+ program->opaque = compiler_program_new_from_llvm_file(ctx->devices[0]->device_id, file_name, program->build_log_max_sz, program->build_log, &program->build_log_sz);
if (UNLIKELY(program->opaque == NULL)) {
err = CL_INVALID_PROGRAM;
goto error;
@@ -675,7 +646,8 @@ cl_program_build(cl_program p, const char *options)
memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
copyed += sz;
}
- if ((err = get_program_global_data(p)) != CL_SUCCESS)
+ uint32_t ocl_version = interp_kernel_get_ocl_version(interp_program_get_kernel(p->opaque, 0));
+ if (ocl_version >= 200 && (err = get_program_global_data(p)) != CL_SUCCESS)
goto error;
p->is_built = 1;
@@ -784,7 +756,8 @@ cl_program_link(cl_context context,
copyed += sz;
}
- if ((err = get_program_global_data(p)) != CL_SUCCESS)
+ uint32_t ocl_version = interp_kernel_get_ocl_version(interp_program_get_kernel(p->opaque, 0));
+ if (ocl_version >= 200 && (err = get_program_global_data(p)) != CL_SUCCESS)
goto error;
done:
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index b8a1b52..3caf737 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -137,19 +137,28 @@ return 1;
static int
intel_driver_context_init(intel_driver_t *driver)
{
-driver->ctx = drm_intel_gem_context_create(driver->bufmgr);
-if (!driver->ctx)
- return 0;
-driver->null_bo = NULL;
+ driver->ctx = drm_intel_gem_context_create(driver->bufmgr);
+ if (!driver->ctx)
+ return 0;
+ driver->null_bo = NULL;
#ifdef HAS_BO_SET_SOFTPIN
-drm_intel_bo *bo = dri_bo_alloc(driver->bufmgr, "null_bo", 64*1024, 4096);
-drm_intel_bo_set_softpin_offset(bo, 0);
-// don't reuse it, that would make two bo trying to bind to same address,
-// which is un-reasonable.
-drm_intel_bo_disable_reuse(bo);
-driver->null_bo = bo;
+ drm_intel_bo *bo = dri_bo_alloc(driver->bufmgr, "null_bo", 64*1024, 4096);
+ drm_intel_bo_set_softpin_offset(bo, 0);
+ // don't reuse it, that would make two bo trying to bind to same address,
+ // which is un-reasonable.
+ drm_intel_bo_disable_reuse(bo);
+
+ drm_intel_bo_map(bo, 1);
+ *(uint32_t *)bo->virtual = MI_BATCH_BUFFER_END;
+ drm_intel_bo_unmap(bo);
+
+ if (drm_intel_gem_bo_context_exec(bo, driver->ctx, 0, 0) == 0) {
+ driver->null_bo = bo;
+ } else {
+ drm_intel_bo_unreference(bo);
+ }
#endif
-return 1;
+ return 1;
}
static void
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 283b07a..4f6989d 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -2529,6 +2529,8 @@ intel_set_gpgpu_callbacks(int device_id)
intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8;
intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; //SKL need not restore SLM, same as gen7
intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
+ if(IS_GEMINILAKE(device_id))
+ intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen9;
intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen9;
intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen8;
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 43cf7f3..300d87a 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -283,6 +283,7 @@ set (utests_sources
compiler_sub_group_shuffle_down.cpp
compiler_sub_group_shuffle_up.cpp
compiler_sub_group_shuffle_xor.cpp
+ compiler_reqd_sub_group_size.cpp
builtin_global_linear_id.cpp
builtin_local_linear_id.cpp
multi_queue_events.cpp
@@ -297,7 +298,9 @@ set (utests_sources
compiler_generic_pointer.cpp
runtime_pipe_query.cpp
compiler_pipe_builtin.cpp
- compiler_device_enqueue.cpp)
+ compiler_device_enqueue.cpp
+ compiler_sqrt_div.cpp
+ compiler_remove_negative_add.cpp)
if (LLVM_VERSION_NODOT VERSION_GREATER 34)
SET(utests_sources
diff --git a/utests/compiler_if_else.cpp b/utests/compiler_if_else.cpp
index e38b23f..d65ecd3 100644
--- a/utests/compiler_if_else.cpp
+++ b/utests/compiler_if_else.cpp
@@ -21,43 +21,38 @@ static void compiler_if_else(void)
OCL_NDRANGE(1);
// First control flow
- OCL_MAP_BUFFER(0);
OCL_MAP_BUFFER(1);
for (uint32_t i = 0; i < 16; ++i) {
OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
- OCL_ASSERT(((int32_t*)buf_data[0])[i] == 1);
}
+ OCL_UNMAP_BUFFER(1);
// Second control flow
+ OCL_MAP_BUFFER(0);
for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -1;
OCL_UNMAP_BUFFER(0);
- OCL_UNMAP_BUFFER(1);
OCL_NDRANGE(1);
- OCL_MAP_BUFFER(0);
OCL_MAP_BUFFER(1);
for (uint32_t i = 0; i < 16; ++i) {
OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
- OCL_ASSERT(((int32_t*)buf_data[0])[i] == 2);
}
+ OCL_UNMAP_BUFFER(1);
// Third control flow
+ OCL_MAP_BUFFER(0);
for (uint32_t i = 0; i < 4; ++i) ((int32_t*)buf_data[0])[i] = 2;
for (uint32_t i = 4; i < n; ++i) ((int32_t*)buf_data[0])[i] = -1;
OCL_UNMAP_BUFFER(0);
- OCL_UNMAP_BUFFER(1);
OCL_NDRANGE(1);
- OCL_MAP_BUFFER(0);
OCL_MAP_BUFFER(1);
for (uint32_t i = 0; i < 3; ++i) {
OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
- OCL_ASSERT(((int32_t*)buf_data[0])[i] == 1);
}
OCL_ASSERT(((int32_t*)buf_data[1])[3] == -1);
- OCL_ASSERT(((int32_t*)buf_data[0])[3] == 1);
for (uint32_t i = 4; i < 16; ++i) {
OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
- OCL_ASSERT(((int32_t*)buf_data[0])[i] == 2);
}
+ OCL_UNMAP_BUFFER(1);
}
MAKE_UTEST_FROM_FUNCTION(compiler_if_else);
diff --git a/utests/compiler_remove_negative_add.cpp b/utests/compiler_remove_negative_add.cpp
new file mode 100644
index 0000000..2b5df73
--- /dev/null
+++ b/utests/compiler_remove_negative_add.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+#include <cmath>
+
+void compiler_remove_negative_add(void) {
+ const int n = 1024;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_remove_negative_add");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ for (int j = 0; j < 1024; j++) {
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((float *)buf_data[0])[i] = (j * n + i + 1) * 0.001f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ float *dst = (float *)buf_data[1];
+ for (int i = 0; i < n; ++i) {
+ float cpu = exp2(-src[i]);
+ float gpu = dst[i];
+ if (fabsf(cpu - gpu) >= 1e-3) {
+ printf("%f %f %f", src[i], cpu, gpu);
+ OCL_ASSERT(0);
+ }
+ }
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_remove_negative_add);
diff --git a/utests/compiler_reqd_sub_group_size.cpp b/utests/compiler_reqd_sub_group_size.cpp
new file mode 100644
index 0000000..37d96fe
--- /dev/null
+++ b/utests/compiler_reqd_sub_group_size.cpp
@@ -0,0 +1,46 @@
+#include "utest_helper.hpp"
+#include<string>
+#include<sstream>
+#include<iostream>
+
+using namespace std;
+
+void compiler_reqd_sub_group_size(void)
+{
+ if (!cl_check_reqd_subgroup())
+ return;
+
+ size_t param_value_size;
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_SUB_GROUP_SIZES_INTEL,
+ 0, NULL, ¶m_value_size);
+
+ size_t* param_value = new size_t[param_value_size];
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_SUB_GROUP_SIZES_INTEL,
+ param_value_size, param_value, NULL);
+
+ const char* opt = "-D SIMD_SIZE=";
+ for( uint32_t i = 0; i < param_value_size / sizeof(size_t) ; ++i)
+ {
+ ostringstream ss;
+ uint32_t simd_size = param_value[i];
+ ss << opt << simd_size;
+ //cout << "options: " << ss.str() << endl;
+ OCL_CALL(cl_kernel_init, "compiler_reqd_sub_group_size.cl", "compiler_reqd_sub_group_size",
+ SOURCE, ss.str().c_str());
+ size_t SIMD_SIZE = 0;
+ OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device, CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL,0, NULL,sizeof(size_t),&SIMD_SIZE,NULL);
+ //cout << SIMD_SIZE << " with " << simd_size << endl;
+ OCL_ASSERT(SIMD_SIZE == simd_size);
+
+ cl_ulong SPILL_SIZE = 0xFFFFFFFF;
+ OCL_CALL(clGetKernelWorkGroupInfo, kernel, device, CL_KERNEL_SPILL_MEM_SIZE_INTEL, sizeof(cl_ulong), &SPILL_SIZE, NULL);
+ //cout << "spill size: " << SPILL_SIZE << endl;
+ OCL_ASSERT(SPILL_SIZE == 0);
+
+ clReleaseProgram(program);
+ program = NULL;
+ }
+ delete[] param_value;
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_reqd_sub_group_size);
diff --git a/utests/compiler_sqrt_div.cpp b/utests/compiler_sqrt_div.cpp
new file mode 100644
index 0000000..c22c5a3
--- /dev/null
+++ b/utests/compiler_sqrt_div.cpp
@@ -0,0 +1,61 @@
+#include "utest_helper.hpp"
+#include <cmath>
+
+void compiler_sqrt_div(void) {
+ const int n = 1024;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_sqrt_div");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * 4 * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ for (int j = 0; j < 1024; j++) {
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((float *)buf_data[0])[i] = (j * n + i + 1) * 0.001f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ float *dst = (float *)buf_data[1];
+ for (int i = 0; i < n; ++i) {
+ float cpu = 1.0f / sqrt(src[i]);
+ float gpu = dst[4 * i];
+ if (fabsf(cpu - gpu) >= 1e-3) {
+ printf("%f %f %f", src[i], cpu, gpu);
+ OCL_ASSERT(0);
+ }
+
+ cpu = i / sqrt(src[i]);
+ gpu = dst[4 * i + 1];
+ if (fabsf(cpu - gpu) >= 1e-3) {
+ printf("%f %f %f", src[i], cpu, gpu);
+ OCL_ASSERT(0);
+ }
+
+ cpu = 2.0f / sqrt(src[i]);
+ gpu = dst[4 * i + 2];
+ if (fabsf(cpu - gpu) >= 1e-3) {
+ printf("%f %f %f", src[i], cpu, gpu);
+ OCL_ASSERT(0);
+ }
+
+ cpu = 1.0f / sqrt(src[i]) + sqrt(src[i]);
+ gpu = dst[4 * i + 3];
+ if (fabsf(cpu - gpu) >= 1e-3) {
+ printf("%f %f %f", src[i], cpu, gpu);
+ OCL_ASSERT(0);
+ }
+ }
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_sqrt_div);
diff --git a/utests/enqueue_built_in_kernels.cpp b/utests/enqueue_built_in_kernels.cpp
index 52b8848..2afbabd 100644
--- a/utests/enqueue_built_in_kernels.cpp
+++ b/utests/enqueue_built_in_kernels.cpp
@@ -14,6 +14,7 @@ void enqueue_built_in_kernels(void)
OCL_ASSERT(ret_sz == built_in_kernels_size);
cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
OCL_ASSERT(built_in_prog != NULL);
+ clReleaseProgram(built_in_prog);
}
MAKE_UTEST_FROM_FUNCTION(enqueue_built_in_kernels);
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 7052a14..4dae20d 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -1119,3 +1119,23 @@ float as_float(uint32_t i)
_tmp._uint = i;
return _tmp._float;
}
+
+int cl_check_reqd_subgroup(void)
+{
+ if (!cl_check_subgroups())
+ return 0;
+ std::string extStr;
+ size_t param_value_size;
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, ¶m_value_size);
+ std::vector<char> param_value(param_value_size);
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+ param_value.empty() ? NULL : ¶m_value.front(), ¶m_value_size);
+ if (!param_value.empty())
+ extStr = std::string(¶m_value.front(), param_value_size-1);
+
+ if (std::strstr(extStr.c_str(), "cl_intel_required_subgroup_size") == NULL) {
+ printf("No cl_intel_required_subgroup_size, Skip!");
+ return 0;
+ }
+ return 1;
+}
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index e2a6a88..19ec69d 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -325,4 +325,7 @@ extern float as_float(uint32_t i);
extern uint32_t as_uint(float f);
/* Check is intel subgroups short enabled. */
extern int cl_check_subgroups_short(void);
+
+/* Check is intel_required_subgroup_size enabled. */
+extern int cl_check_reqd_subgroup(void);
#endif /* __UTEST_HELPER_HPP__ */
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git
More information about the Pkg-opencl-commits
mailing list