[arrayfire] 321/408: Templated options are now runtime compile options for opencl reductions
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:22 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 5e0ceac384ac3ed41936e908cbff890c430559cb
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date: Mon Aug 24 05:10:27 2015 -0400
Templated options are now runtime compile options for opencl reductions
---
src/backend/opencl/kernel/reduce.hpp | 235 ++++++++++++++++-------------------
1 file changed, 109 insertions(+), 126 deletions(-)
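The gist of the change, shown in a minimal self-contained C++ sketch (not the ArrayFire code itself): values that previously had to be template parameters (dim, threads_y, the input/output types and the reduction op) are now folded into a runtime-built "-D ..." option string, and the kernel built from those options is cached per device under a string key. FakeKernel, getReduceDimKernel and the fixed array of 16 per-device caches below are illustrative stand-ins; only the ref_name format and the option names mirror the patch.

// Minimal sketch of the pattern introduced by this commit: kernel variants are
// selected by a runtime-built "-D ..." option string and cached per device under
// a string key, instead of being distinct C++ template instantiations.
// ArrayFire-specific pieces (Param, dtype_traits, the real cl::Program build)
// are replaced here by stand-ins for illustration.
#include <iostream>
#include <map>
#include <sstream>
#include <string>

struct FakeKernel { std::string options; };   // stand-in for a built cl::Kernel

using KernelCache = std::map<std::string, FakeKernel>;
static KernelCache kernelCaches[16];          // one cache per device id (stand-in)

FakeKernel &getReduceDimKernel(int device, const std::string &tiName,
                               const std::string &toName, int op,
                               int dim, unsigned threads_y)
{
    // Runtime cache key: every value that used to be a template parameter.
    std::string ref_name = "reduce_" + std::to_string(dim) + "_" + tiName +
                           "_" + toName + "_" + std::to_string(op) +
                           "_" + std::to_string(threads_y);

    KernelCache &cache = kernelCaches[device];
    auto it = cache.find(ref_name);
    if (it != cache.end()) return it->second;  // already built: reuse it

    // Former template parameters become -D compile options for the kernel source.
    std::ostringstream options;
    options << " -D To=" << toName
            << " -D Ti=" << tiName
            << " -D dim=" << dim
            << " -D DIMY=" << threads_y;

    // The real code would call buildProgram(...) here and store Program/Kernel pointers.
    FakeKernel built{options.str()};
    return cache.emplace(ref_name, built).first->second;
}

int main()
{
    FakeKernel &k = getReduceDimKernel(0, "float", "float", /*op=*/0,
                                       /*dim=*/1, /*threads_y=*/8);
    std::cout << "built with:" << k.options << "\n";
    // A second call with the same parameters hits the cache instead of rebuilding.
    getReduceDimKernel(0, "float", "float", 0, 1, 8);
    return 0;
}

In the actual patch the cache lives behind the new cache.hpp include (kernelCaches / kc_t / kc_entry_t), and the entry stores a heap-allocated cl::Program and cl::Kernel built via buildProgram, as the diff below shows.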
diff --git a/src/backend/opencl/kernel/reduce.hpp b/src/backend/opencl/kernel/reduce.hpp
index 094b42f..5ac55b5 100644
--- a/src/backend/opencl/kernel/reduce.hpp
+++ b/src/backend/opencl/kernel/reduce.hpp
@@ -20,6 +20,7 @@
#include <Param.hpp>
#include <debug_opencl.hpp>
#include <type_util.hpp>
+#include <cache.hpp>
#include "names.hpp"
#include "config.hpp"
#include <memory.hpp>
@@ -40,44 +41,61 @@ namespace opencl
namespace kernel
{
- template<typename Ti, typename To, af_op_t op, int dim, int threads_y>
+ template<typename Ti, typename To, af_op_t op>
void reduce_dim_launcher(Param out, Param in,
+ const int dim,
+ const uint threads_y,
const uint groups_all[4],
int change_nan, double nanval)
{
- static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
- static std::map<int, Program*> reduceProgs;
- static std::map<int, Kernel*> reduceKerns;
+ std::string ref_name =
+ std::string("reduce_") +
+ std::to_string(dim) +
+ std::string("_") +
+ std::string(dtype_traits<Ti>::getName()) +
+ std::string("_") +
+ std::string(dtype_traits<To>::getName()) +
+ std::string("_") +
+ std::to_string(op) +
+ std::string("_") +
+ std::to_string(threads_y);
+
+ int device = getActiveDeviceId();
+ kc_t::iterator idx = kernelCaches[device].find(ref_name);
+
+ kc_entry_t entry;
+ if (idx == kernelCaches[device].end()) {
+ Binary<To, op> reduce;
+ ToNum<To> toNum;
+
+ std::ostringstream options;
+ options << " -D To=" << dtype_traits<To>::getName()
+ << " -D Ti=" << dtype_traits<Ti>::getName()
+ << " -D T=To"
+ << " -D dim=" << dim
+ << " -D DIMY=" << threads_y
+ << " -D THREADS_X=" << THREADS_X
+ << " -D init=" << toNum(reduce.init())
+ << " -D " << binOpName<op>()
+ << " -D CPLX=" << af::iscplx<Ti>();
+ if (std::is_same<Ti, double>::value ||
+ std::is_same<Ti, cdouble>::value) {
+ options << " -D USE_DOUBLE";
- int device= getActiveDeviceId();
- std::call_once(compileFlags[device], [device] () {
+ }
- Binary<To, op> reduce;
- ToNum<To> toNum;
-
- std::ostringstream options;
- options << " -D To=" << dtype_traits<To>::getName()
- << " -D Ti=" << dtype_traits<Ti>::getName()
- << " -D T=To"
- << " -D dim=" << dim
- << " -D DIMY=" << threads_y
- << " -D THREADS_X=" << THREADS_X
- << " -D init=" << toNum(reduce.init())
- << " -D " << binOpName<op>()
- << " -D CPLX=" << af::iscplx<Ti>();
- if (std::is_same<Ti, double>::value ||
- std::is_same<Ti, cdouble>::value) {
- options << " -D USE_DOUBLE";
- }
+ const char *ker_strs[] = {ops_cl, reduce_dim_cl};
+ const int ker_lens[] = {ops_cl_len, reduce_dim_cl_len};
+ Program prog;
+ buildProgram(prog, 2, ker_strs, ker_lens, options.str());
- const char *ker_strs[] = {ops_cl, reduce_dim_cl};
- const int ker_lens[] = {ops_cl_len, reduce_dim_cl_len};
- Program prog;
- buildProgram(prog, 2, ker_strs, ker_lens, options.str());
- reduceProgs[device] = new Program(prog);
+ entry.prog = new Program(prog);
+ entry.ker = new Kernel(*entry.prog, "reduce_dim_kernel");
- reduceKerns[device] = new Kernel(*reduceProgs[device], "reduce_dim_kernel");
- });
+ kernelCaches[device][ref_name] = entry;
+ } else {
+ entry = idx->second;
+ }
NDRange local(THREADS_X, threads_y);
NDRange global(groups_all[0] * groups_all[2] * local[0],
@@ -86,7 +104,7 @@ namespace kernel
auto reduceOp = make_kernel<Buffer, KParam,
Buffer, KParam,
uint, uint, uint,
- int, To>(*reduceKerns[device]);
+ int, To>(*entry.ker);
reduceOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info,
@@ -100,34 +118,8 @@ namespace kernel
CL_DEBUG_FINISH(getQueue());
}
- template<typename Ti, typename To, af_op_t op, int dim>
- void reduce_dim_fn(Param out, Param in,
- const uint threads_y, const uint groups_all[4],
- int change_nan, double nanval)
- {
- switch(threads_y) {
- case 8: return reduce_dim_launcher<Ti, To, op, dim, 8>(out, in, groups_all,
- change_nan, nanval);
-
- case 4: return reduce_dim_launcher<Ti, To, op, dim, 4>(out, in, groups_all,
- change_nan, nanval);
-
- case 2: return reduce_dim_launcher<Ti, To, op, dim, 2>(out, in, groups_all,
- change_nan, nanval);
-
- case 1: return reduce_dim_launcher<Ti, To, op, dim, 1>(out, in, groups_all,
- change_nan, nanval);
-
- case 16: return reduce_dim_launcher<Ti, To, op, dim, 16>(out, in, groups_all,
- change_nan, nanval);
-
- case 32: return reduce_dim_launcher<Ti, To, op, dim, 32>(out, in, groups_all,
- change_nan, nanval);
- }
- }
-
- template<typename Ti, typename To, af_op_t op, int dim>
- void reduce_dim(Param out, Param in, int change_nan, double nanval)
+ template<typename Ti, typename To, af_op_t op>
+ void reduce_dim(Param out, Param in, int change_nan, double nanval, int dim)
{
uint threads_y = std::min(THREADS_Y, nextpow2(in.info.dims[dim]));
uint threads_x = THREADS_X;
@@ -152,61 +144,75 @@ namespace kernel
for (int k = dim + 1; k < 4; k++) tmp.info.strides[k] *= groups_all[dim];
}
- reduce_dim_fn<Ti, To, op, dim>(tmp, in, threads_y, groups_all, change_nan, nanval);
+ reduce_dim_launcher<Ti, To, op>(tmp, in, dim, threads_y, groups_all, change_nan, nanval);
if (groups_all[dim] > 1) {
groups_all[dim] = 1;
if (op == af_notzero_t) {
- reduce_dim_fn<To, To, af_add_t, dim>(out, tmp, threads_y, groups_all,
- change_nan, nanval);
+ reduce_dim_launcher<To, To, af_add_t>(out, tmp, dim, threads_y, groups_all,
+ change_nan, nanval);
} else {
- reduce_dim_fn<To, To, op, dim>(out, tmp, threads_y, groups_all,
- change_nan, nanval);
+ reduce_dim_launcher<To, To, op>(out, tmp, dim, threads_y, groups_all,
+ change_nan, nanval);
}
bufferFree(tmp.data);
}
}
- template<typename Ti, typename To, af_op_t op, int threads_x>
+ template<typename Ti, typename To, af_op_t op>
void reduce_first_launcher(Param out, Param in,
const uint groups_x,
const uint groups_y,
+ const uint threads_x,
int change_nan, double nanval)
{
- static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
- static std::map<int, Program*> reduceProgs;
- static std::map<int, Kernel*> reduceKerns;
+ std::string ref_name =
+ std::string("reduce_0_") +
+ std::string(dtype_traits<Ti>::getName()) +
+ std::string("_") +
+ std::string(dtype_traits<To>::getName()) +
+ std::string("_") +
+ std::to_string(op) +
+ std::string("_") +
+ std::to_string(threads_x);
+
+ int device = getActiveDeviceId();
+ kc_t::iterator idx = kernelCaches[device].find(ref_name);
+
+ kc_entry_t entry;
+ if (idx == kernelCaches[device].end()) {
+
+ Binary<To, op> reduce;
+ ToNum<To> toNum;
+
+ std::ostringstream options;
+ options << " -D To=" << dtype_traits<To>::getName()
+ << " -D Ti=" << dtype_traits<Ti>::getName()
+ << " -D T=To"
+ << " -D DIMX=" << threads_x
+ << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
+ << " -D init=" << toNum(reduce.init())
+ << " -D " << binOpName<op>()
+ << " -D CPLX=" << af::iscplx<Ti>();
+ if (std::is_same<Ti, double>::value ||
+ std::is_same<Ti, cdouble>::value) {
+ options << " -D USE_DOUBLE";
+ }
- int device= getActiveDeviceId();
- std::call_once(compileFlags[device], [device] () {
+ const char *ker_strs[] = {ops_cl, reduce_first_cl};
+ const int ker_lens[] = {ops_cl_len, reduce_first_cl_len};
+ Program prog;
+ buildProgram(prog, 2, ker_strs, ker_lens, options.str());
- Binary<To, op> reduce;
- ToNum<To> toNum;
-
- std::ostringstream options;
- options << " -D To=" << dtype_traits<To>::getName()
- << " -D Ti=" << dtype_traits<Ti>::getName()
- << " -D T=To"
- << " -D DIMX=" << threads_x
- << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
- << " -D init=" << toNum(reduce.init())
- << " -D " << binOpName<op>()
- << " -D CPLX=" << af::iscplx<Ti>();
- if (std::is_same<Ti, double>::value ||
- std::is_same<Ti, cdouble>::value) {
- options << " -D USE_DOUBLE";
- }
-
- const char *ker_strs[] = {ops_cl, reduce_first_cl};
- const int ker_lens[] = {ops_cl_len, reduce_first_cl_len};
- Program prog;
- buildProgram(prog, 2, ker_strs, ker_lens, options.str());
- reduceProgs[device] = new Program(prog);
+ entry.prog = new Program(prog);
+ entry.ker = new Kernel(*entry.prog, "reduce_first_kernel");
- reduceKerns[device] = new Kernel(*reduceProgs[device], "reduce_first_kernel");
- });
+ kernelCaches[device][ref_name] = entry;
+ } else {
+ entry = idx->second;
+ }
NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
NDRange global(groups_x * in.info.dims[2] * local[0],
@@ -217,7 +223,7 @@ namespace kernel
auto reduceOp = make_kernel<Buffer, KParam,
Buffer, KParam,
uint, uint, uint,
- int, To>(*reduceKerns[device]);
+ int, To>(*entry.ker);
reduceOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info,
@@ -227,27 +233,6 @@ namespace kernel
}
template<typename Ti, typename To, af_op_t op>
- void reduce_first_fn(Param out, Param in,
- const uint groups_x,
- const uint groups_y,
- const uint threads_x,
- int change_nan, double nanval)
- {
- switch(threads_x) {
- case 32: return reduce_first_launcher<Ti, To, op, 32>(out, in, groups_x,
- groups_y, change_nan, nanval);
- case 64: return reduce_first_launcher<Ti, To, op, 64>(out, in, groups_x,
- groups_y, change_nan, nanval);
- case 128: return reduce_first_launcher<Ti, To, op, 128>(out, in, groups_x,
- groups_y, change_nan, nanval);
- case 256: return reduce_first_launcher<Ti, To, op, 256>(out, in, groups_x,
- groups_y, change_nan, nanval);
- case 512: return reduce_first_launcher<Ti, To, op, 512>(out, in, groups_x,
- groups_y, change_nan, nanval);
- }
- }
-
- template<typename Ti, typename To, af_op_t op>
void reduce_first(Param out, Param in, int change_nan, double nanval)
{
uint threads_x = nextpow2(std::max(32u, (uint)in.info.dims[0]));
@@ -270,15 +255,15 @@ namespace kernel
for (int k = 1; k < 4; k++) tmp.info.strides[k] *= groups_x;
}
- reduce_first_fn<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x, change_nan, nanval);
+ reduce_first_launcher<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x, change_nan, nanval);
if (groups_x > 1) {
//FIXME: Is there an alternative to the if condition ?
if (op == af_notzero_t) {
- reduce_first_fn<To, To, af_add_t>(out, tmp, 1, groups_y, threads_x, change_nan, nanval);
+ reduce_first_launcher<To, To, af_add_t>(out, tmp, 1, groups_y, threads_x, change_nan, nanval);
} else {
- reduce_first_fn<To, To, op>(out, tmp, 1, groups_y, threads_x, change_nan, nanval);
+ reduce_first_launcher<To, To, op>(out, tmp, 1, groups_y, threads_x, change_nan, nanval);
}
bufferFree(tmp.data);
@@ -289,12 +274,10 @@ namespace kernel
void reduce(Param out, Param in, int dim, int change_nan, double nanval)
{
try {
- switch (dim) {
- case 0: return reduce_first<Ti, To, op >(out, in, change_nan, nanval);
- case 1: return reduce_dim <Ti, To, op, 1>(out, in, change_nan, nanval);
- case 2: return reduce_dim <Ti, To, op, 2>(out, in, change_nan, nanval);
- case 3: return reduce_dim <Ti, To, op, 3>(out, in, change_nan, nanval);
- }
+ if (dim == 0)
+ return reduce_first<Ti, To, op>(out, in, change_nan, nanval);
+ else
+ return reduce_dim <Ti, To, op>(out, in, change_nan, nanval, dim);
} catch(cl::Error ex) {
CL_TO_AF_ERROR(ex);
}
@@ -342,7 +325,7 @@ namespace kernel
int tmp_elements = tmp.info.strides[3] * tmp.info.dims[3];
tmp.data = bufferAlloc(tmp_elements * sizeof(To));
- reduce_first_fn<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x, change_nan, nanval);
+ reduce_first_launcher<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x, change_nan, nanval);
unique_ptr<To> h_ptr(new To[tmp_elements]);
getQueue().enqueueReadBuffer(*tmp.data, CL_TRUE, 0, sizeof(To) * tmp_elements, h_ptr.get());
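Net effect of the patch: the reduce_dim_fn and reduce_first_fn helpers, which used a switch over threads_y / threads_x to pick one of several template instantiations, are removed, and dim becomes a runtime argument instead of a template parameter, so a single launcher covers every case. Each unique combination of types, op, dim and thread count is still compiled only once per device: the first call builds the OpenCL program with the matching -D options and stores it in the per-device cache keyed by ref_name; subsequent calls reuse the cached kernel.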
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git