[arrayfire] 323/408: Templated options are now runtime compile options for opencl indexed min/max
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:22 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit bf69efc7ae56921caa4ac6044438e05ddf6fbd7e
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date: Mon Aug 24 06:17:24 2015 -0400
Templated options are now runtime compile options for opencl indexed min/max
---
src/backend/opencl/kernel/ireduce.hpp | 163 +++++++++++++++++-----------------
1 file changed, 80 insertions(+), 83 deletions(-)
diff --git a/src/backend/opencl/kernel/ireduce.hpp b/src/backend/opencl/kernel/ireduce.hpp
index 122e664..92f4be3 100644
--- a/src/backend/opencl/kernel/ireduce.hpp
+++ b/src/backend/opencl/kernel/ireduce.hpp
@@ -19,6 +19,7 @@
#include <traits.hpp>
#include <dispatch.hpp>
#include <Param.hpp>
+#include <cache.hpp>
#include <debug_opencl.hpp>
#include <type_util.hpp>
#include "names.hpp"
@@ -40,17 +41,31 @@ namespace opencl
namespace kernel
{
- template<typename T, af_op_t op, int dim, bool is_first, int threads_y>
+ template<typename T, af_op_t op>
void ireduce_dim_launcher(Param out, cl::Buffer *oidx,
Param in, cl::Buffer *iidx,
+ const int dim,
+ const int threads_y,
+ const bool is_first,
const uint groups_all[4])
{
- static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
- static std::map<int, Program*> ireduceProgs;
- static std::map<int, Kernel*> ireduceKerns;
-
- int device= getActiveDeviceId();
- std::call_once(compileFlags[device], [device] () {
+ std::string ref_name =
+ std::string("ireduce_") +
+ std::to_string(dim) +
+ std::string("_") +
+ std::string(dtype_traits<T>::getName()) +
+ std::string("_") +
+ std::to_string(op) +
+ std::string("_") +
+ std::to_string(is_first) +
+ std::string("_") +
+ std::to_string(threads_y);
+
+ int device = getActiveDeviceId();
+ kc_t::iterator idx = kernelCaches[device].find(ref_name);
+
+ kc_entry_t entry;
+ if (idx == kernelCaches[device].end()) {
Binary<T, op> ireduce;
ToNum<T> toNum;
@@ -74,10 +89,13 @@ namespace kernel
const int ker_lens[] = {iops_cl_len, ireduce_dim_cl_len};
Program prog;
buildProgram(prog, 2, ker_strs, ker_lens, options.str());
- ireduceProgs[device] = new Program(prog);
+ entry.prog = new Program(prog);
+ entry.ker = new Kernel(*entry.prog, "ireduce_dim_kernel");
- ireduceKerns[device] = new Kernel(*ireduceProgs[device], "ireduce_dim_kernel");
- });
+ kernelCaches[device][ref_name] = entry;
+ } else {
+ entry = idx->second;
+ }
NDRange local(THREADS_X, threads_y);
NDRange global(groups_all[0] * groups_all[2] * local[0],
@@ -85,7 +103,7 @@ namespace kernel
auto ireduceOp = make_kernel<Buffer, KParam, Buffer,
Buffer, KParam, Buffer,
- uint, uint, uint>(*ireduceKerns[device]);
+ uint, uint, uint>(*entry.ker);
ireduceOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info, *oidx,
@@ -97,20 +115,6 @@ namespace kernel
CL_DEBUG_FINISH(getQueue());
}
- template<typename T, af_op_t op, int dim, bool is_first>
- void ireduce_dim_fn(Param out, cl::Buffer *oidx, Param in, cl::Buffer *iidx,
- const uint threads_y, const uint groups_all[4])
- {
- switch(threads_y) {
- case 8: return ireduce_dim_launcher<T, op, dim, is_first, 8>(out, oidx, in, iidx, groups_all);
- case 4: return ireduce_dim_launcher<T, op, dim, is_first, 4>(out, oidx, in, iidx, groups_all);
- case 2: return ireduce_dim_launcher<T, op, dim, is_first, 2>(out, oidx, in, iidx, groups_all);
- case 1: return ireduce_dim_launcher<T, op, dim, is_first, 1>(out, oidx, in, iidx, groups_all);
- case 16: return ireduce_dim_launcher<T, op, dim, is_first, 16>(out, oidx, in, iidx, groups_all);
- case 32: return ireduce_dim_launcher<T, op, dim, is_first, 32>(out, oidx, in, iidx, groups_all);
- }
- }
-
template<typename T, af_op_t op, int dim>
void ireduce_dim(Param out, cl::Buffer *oidx, Param in)
{
@@ -139,56 +143,70 @@ namespace kernel
for (int k = dim + 1; k < 4; k++) tmp.info.strides[k] *= groups_all[dim];
}
- ireduce_dim_fn<T, op, dim, true>(tmp, tidx, in, tidx, threads_y, groups_all);
+ ireduce_dim_launcher<T, op>(tmp, tidx, in, tidx, dim, threads_y, true, groups_all);
if (groups_all[dim] > 1) {
groups_all[dim] = 1;
- ireduce_dim_fn<T, op, dim, false>(out, oidx, tmp, tidx, threads_y, groups_all);
+ ireduce_dim_launcher<T, op>(out, oidx, tmp, tidx, dim, threads_y, false, groups_all);
bufferFree(tmp.data);
bufferFree(tidx);
}
}
- template<typename T, af_op_t op, bool is_first, int threads_x>
+ template<typename T, af_op_t op>
void ireduce_first_launcher(Param out, cl::Buffer *oidx,
Param in, cl::Buffer *iidx,
+ const int threads_x,
+ const bool is_first,
const uint groups_x,
const uint groups_y)
{
- static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
- static std::map<int, Program*> ireduceProgs;
- static std::map<int, Kernel*> ireduceKerns;
-
- int device= getActiveDeviceId();
- std::call_once(compileFlags[device], [device] () {
-
- Binary<T, op> ireduce;
- ToNum<T> toNum;
-
- std::ostringstream options;
- options << " -D T=" << dtype_traits<T>::getName()
- << " -D DIMX=" << threads_x
- << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
- << " -D init=" << toNum(ireduce.init())
- << " -D " << binOpName<op>()
- << " -D CPLX=" << af::iscplx<T>()
- << " -D IS_FIRST=" << is_first;
-
- if (std::is_same<T, double>::value ||
- std::is_same<T, cdouble>::value) {
- options << " -D USE_DOUBLE";
- }
+ std::string ref_name =  // cache key: must cover every -D option that varies per call (T, op, IS_FIRST, DIMX)
+ std::string("ireduce_0_") +
+ std::string(dtype_traits<T>::getName()) +
+ std::string("_") +
+ std::to_string(op) +
+ std::string("_") +
+ std::to_string(is_first) +  // was ';' -- that ended the statement and dropped threads_x from the key
+ std::string("_") +
+ std::to_string(threads_x);
+
+ int device = getActiveDeviceId();
+ kc_t::iterator idx = kernelCaches[device].find(ref_name);
+
+ kc_entry_t entry;
+ if (idx == kernelCaches[device].end()) {
+
+ Binary<T, op> ireduce;
+ ToNum<T> toNum;
+
+ std::ostringstream options;
+ options << " -D T=" << dtype_traits<T>::getName()
+ << " -D DIMX=" << threads_x
+ << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
+ << " -D init=" << toNum(ireduce.init())
+ << " -D " << binOpName<op>()
+ << " -D CPLX=" << af::iscplx<T>()
+ << " -D IS_FIRST=" << is_first;
+
+ if (std::is_same<T, double>::value ||
+ std::is_same<T, cdouble>::value) {
+ options << " -D USE_DOUBLE";
+ }
- const char *ker_strs[] = {iops_cl, ireduce_first_cl};
- const int ker_lens[] = {iops_cl_len, ireduce_first_cl_len};
- Program prog;
- buildProgram(prog, 2, ker_strs, ker_lens, options.str());
- ireduceProgs[device] = new Program(prog);
+ const char *ker_strs[] = {iops_cl, ireduce_first_cl};
+ const int ker_lens[] = {iops_cl_len, ireduce_first_cl_len};
+ Program prog;
+ buildProgram(prog, 2, ker_strs, ker_lens, options.str());
+ entry.prog = new Program(prog);
+ entry.ker = new Kernel(*entry.prog, "ireduce_first_kernel");
- ireduceKerns[device] = new Kernel(*ireduceProgs[device], "ireduce_first_kernel");
- });
+ kernelCaches[device][ref_name] = entry;
+ } else {
+ entry = idx->second;
+ }
NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
NDRange global(groups_x * in.info.dims[2] * local[0],
@@ -198,7 +216,7 @@ namespace kernel
auto ireduceOp = make_kernel<Buffer, KParam, Buffer,
Buffer, KParam, Buffer,
- uint, uint, uint>(*ireduceKerns[device]);
+ uint, uint, uint>(*entry.ker);
ireduceOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info, *oidx,
@@ -208,27 +226,6 @@ namespace kernel
CL_DEBUG_FINISH(getQueue());
}
- template<typename T, af_op_t op, bool is_first>
- void ireduce_first_fn(Param out, cl::Buffer *oidx,
- Param in, cl::Buffer *iidx,
- const uint groups_x,
- const uint groups_y,
- const uint threads_x)
- {
- switch(threads_x) {
- case 32: return ireduce_first_launcher<T, op, is_first, 32>(out, oidx, in, iidx, groups_x,
- groups_y);
- case 64: return ireduce_first_launcher<T, op, is_first, 64>(out, oidx, in, iidx, groups_x,
- groups_y);
- case 128: return ireduce_first_launcher<T, op, is_first, 128>(out, oidx, in, iidx, groups_x,
- groups_y);
- case 256: return ireduce_first_launcher<T, op, is_first, 256>(out, oidx, in, iidx, groups_x,
- groups_y);
- case 512: return ireduce_first_launcher<T, op, is_first, 512>(out, oidx, in, iidx, groups_x,
- groups_y);
- }
- }
-
template<typename T, af_op_t op>
void ireduce_first(Param out, cl::Buffer *oidx, Param in)
{
@@ -261,10 +258,10 @@ namespace kernel
for (int k = 1; k < 4; k++) tmp.info.strides[k] *= groups_x;
}
- ireduce_first_fn<T, op, true>(tmp, tidx, in, tidx, groups_x, groups_y, threads_x);
+ ireduce_first_launcher<T, op>(tmp, tidx, in, tidx, threads_x, true, groups_x, groups_y);
if (groups_x > 1) {
- ireduce_first_fn<T, op, false>(out, oidx, tmp, tidx, 1, groups_y, threads_x);
+ ireduce_first_launcher<T, op>(out, oidx, tmp, tidx, threads_x, false, 1, groups_y);
bufferFree(tmp.data);
bufferFree(tidx);
@@ -376,7 +373,7 @@ namespace kernel
tmp.data = bufferAlloc(tmp_elements * sizeof(T));
cl::Buffer *tidx = bufferAlloc(tmp_elements * sizeof(uint));
- ireduce_first_fn<T, op, true>(tmp, tidx, in, tidx, groups_x, groups_y, threads_x);
+ ireduce_first_launcher<T, op>(tmp, tidx, in, tidx, threads_x, true, groups_x, groups_y);
unique_ptr<T> h_ptr(new T[tmp_elements]);
unique_ptr<uint> h_iptr(new uint[tmp_elements]);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list