[arrayfire] 324/408: Templated options are now runtime compile options for opencl scan
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:22 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 5c0da49f748b94a263a2dd7d2f7c31f71881b2c2
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date: Mon Aug 24 06:46:54 2015 -0400
Templated options are now runtime compile options for opencl scan
---
src/backend/opencl/kernel/scan_dim.hpp | 207 +++++++++++++------------------
src/backend/opencl/kernel/scan_first.hpp | 196 +++++++++++++----------------
src/backend/opencl/kernel/where.hpp | 7 +-
src/backend/opencl/scan.cpp | 12 +-
4 files changed, 185 insertions(+), 237 deletions(-)
diff --git a/src/backend/opencl/kernel/scan_dim.hpp b/src/backend/opencl/kernel/scan_dim.hpp
index 6bb8cdd..84cc722 100644
--- a/src/backend/opencl/kernel/scan_dim.hpp
+++ b/src/backend/opencl/kernel/scan_dim.hpp
@@ -19,6 +19,7 @@
#include <Param.hpp>
#include <debug_opencl.hpp>
#include <type_util.hpp>
+#include <cache.hpp>
#include "names.hpp"
#include "config.hpp"
@@ -34,64 +35,77 @@ namespace opencl
{
namespace kernel
{
- template<typename Ti, typename To, af_op_t op, int dim, bool isFinalPass, uint threads_y>
- static Kernel* get_scan_dim_kernels(int kerIdx)
+ template<typename Ti, typename To, af_op_t op>
+ static Kernel get_scan_dim_kernels(int kerIdx, int dim, bool isFinalPass, uint threads_y)
{
- try {
- static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
- static std::map<int, Program*> scanProgs;
- static std::map<int, Kernel*> scanKerns;
- static std::map<int, Kernel*> bcastKerns;
-
- int device= getActiveDeviceId();
-
- std::call_once(compileFlags[device], [device] () {
-
- Binary<To, op> scan;
- ToNum<To> toNum;
-
- std::ostringstream options;
- options << " -D To=" << dtype_traits<To>::getName()
- << " -D Ti=" << dtype_traits<Ti>::getName()
- << " -D T=To"
- << " -D dim=" << dim
- << " -D DIMY=" << threads_y
- << " -D THREADS_X=" << THREADS_X
- << " -D init=" << toNum(scan.init())
- << " -D " << binOpName<op>()
- << " -D CPLX=" << af::iscplx<Ti>()
- << " -D isFinalPass=" << (int)(isFinalPass);
- if (std::is_same<Ti, double>::value ||
- std::is_same<Ti, cdouble>::value) {
- options << " -D USE_DOUBLE";
- }
-
- const char *ker_strs[] = {ops_cl, scan_dim_cl};
- const int ker_lens[] = {ops_cl_len, scan_dim_cl_len};
- cl::Program prog;
- buildProgram(prog, 2, ker_strs, ker_lens, options.str());
- scanProgs[device] = new Program(prog);
-
- scanKerns[device] = new Kernel(*scanProgs[device], "scan_dim_kernel");
- bcastKerns[device] = new Kernel(*scanProgs[device], "bcast_dim_kernel");
-
- });
-
- return (kerIdx == 0) ? scanKerns[device] : bcastKerns[device];
- } catch (cl::Error err) {
- CL_TO_AF_ERROR(err);
- throw;
+ std::string ref_name =
+ std::string("scan_") +
+ std::to_string(dim) +
+ std::string("_") +
+ std::to_string(isFinalPass) +
+ std::string("_") +
+ std::string(dtype_traits<Ti>::getName()) +
+ std::string("_") +
+ std::string(dtype_traits<To>::getName()) +
+ std::string("_") +
+ std::to_string(op) +
+ std::string("_") +
+ std::to_string(threads_y);
+
+ int device = getActiveDeviceId();
+ kc_t::iterator idx = kernelCaches[device].find(ref_name);
+
+ kc_entry_t entry;
+ if (idx == kernelCaches[device].end()) {
+
+ Binary<To, op> scan;
+ ToNum<To> toNum;
+
+ std::ostringstream options;
+ options << " -D To=" << dtype_traits<To>::getName()
+ << " -D Ti=" << dtype_traits<Ti>::getName()
+ << " -D T=To"
+ << " -D dim=" << dim
+ << " -D DIMY=" << threads_y
+ << " -D THREADS_X=" << THREADS_X
+ << " -D init=" << toNum(scan.init())
+ << " -D " << binOpName<op>()
+ << " -D CPLX=" << af::iscplx<Ti>()
+ << " -D isFinalPass=" << (int)(isFinalPass);
+ if (std::is_same<Ti, double>::value ||
+ std::is_same<Ti, cdouble>::value) {
+ options << " -D USE_DOUBLE";
+ }
+
+ const char *ker_strs[] = {ops_cl, scan_dim_cl};
+ const int ker_lens[] = {ops_cl_len, scan_dim_cl_len};
+ cl::Program prog;
+ buildProgram(prog, 2, ker_strs, ker_lens, options.str());
+
+ entry.prog = new Program(prog);
+ entry.ker = new Kernel[2];
+
+ entry.ker[0] = Kernel(*entry.prog, "scan_dim_kernel");
+ entry.ker[1] = Kernel(*entry.prog, "bcast_dim_kernel");
+
+ kernelCaches[device][ref_name] = entry;
+
+ } else {
+ entry = idx->second;
}
+
+ return entry.ker[kerIdx];
}
- template<typename Ti, typename To, af_op_t op, int dim, bool isFinalPass, uint threads_y>
+ template<typename Ti, typename To, af_op_t op>
static void scan_dim_launcher(Param &out,
Param &tmp,
const Param &in,
+ int dim, bool isFinalPass, uint threads_y,
const uint groups_all[4])
{
try {
- Kernel* ker = get_scan_dim_kernels<Ti, To, op, dim, isFinalPass, threads_y>(0);
+ Kernel ker = get_scan_dim_kernels<Ti, To, op>(0, dim, isFinalPass, threads_y);
NDRange local(THREADS_X, threads_y);
NDRange global(groups_all[0] * groups_all[2] * local[0],
@@ -103,7 +117,7 @@ namespace kernel
Buffer, KParam,
Buffer, KParam,
uint, uint,
- uint, uint>(*ker);
+ uint, uint>(ker);
scanOp(EnqueueArgs(getQueue(), global, local),
@@ -117,13 +131,14 @@ namespace kernel
}
}
- template<typename Ti, typename To, af_op_t op, int dim, bool isFinalPass, uint threads_y>
+ template<typename Ti, typename To, af_op_t op>
static void bcast_dim_launcher(Param &out,
Param &tmp,
+ int dim, bool isFinalPass, uint threads_y,
const uint groups_all[4])
{
try {
- Kernel* ker = get_scan_dim_kernels<Ti, To, op, dim, isFinalPass, threads_y>(1);
+ Kernel ker = get_scan_dim_kernels<Ti, To, op>(1, dim, isFinalPass, threads_y);
NDRange local(THREADS_X, threads_y);
NDRange global(groups_all[0] * groups_all[2] * local[0],
@@ -134,7 +149,7 @@ namespace kernel
auto bcastOp = make_kernel<Buffer, KParam,
Buffer, KParam,
uint, uint,
- uint, uint>(*ker);
+ uint, uint>(ker);
bcastOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info, *tmp.data, tmp.info,
@@ -147,57 +162,8 @@ namespace kernel
}
}
-
- template<typename Ti, typename To, af_op_t op, int dim, bool isFinalPass>
- static void scan_dim_fn(Param &out,
- Param &tmp,
- const Param &in,
- const uint threads_y,
- const uint groups_all[4])
- {
-
- switch (threads_y) {
- case 8:
- (scan_dim_launcher<Ti, To, op, dim, isFinalPass, 8>)(
- out, tmp, in, groups_all); break;
- case 4:
- (scan_dim_launcher<Ti, To, op, dim, isFinalPass, 4>)(
- out, tmp, in, groups_all); break;
- case 2:
- (scan_dim_launcher<Ti, To, op, dim, isFinalPass, 2>)(
- out, tmp, in, groups_all); break;
- case 1:
- (scan_dim_launcher<Ti, To, op, dim, isFinalPass, 1>)(
- out, tmp, in, groups_all); break;
- }
-
- }
-
- template<typename Ti, typename To, af_op_t op, int dim, bool isFinalPass>
- static void bcast_dim_fn(Param &out,
- Param &tmp,
- const uint threads_y,
- const uint groups_all[4])
- {
-
- switch (threads_y) {
- case 8:
- (bcast_dim_launcher<Ti, To, op, dim, isFinalPass, 8>)(
- out, tmp, groups_all); break;
- case 4:
- (bcast_dim_launcher<Ti, To, op, dim, isFinalPass, 4>)(
- out, tmp, groups_all); break;
- case 2:
- (bcast_dim_launcher<Ti, To, op, dim, isFinalPass, 2>)(
- out, tmp, groups_all); break;
- case 1:
- (bcast_dim_launcher<Ti, To, op, dim, isFinalPass, 1>)(
- out, tmp, groups_all); break;
- }
- }
-
- template<typename Ti, typename To, af_op_t op, int dim>
- static void scan_dim(Param &out, const Param &in)
+ template<typename Ti, typename To, af_op_t op>
+ static void scan_dim(Param &out, const Param &in, int dim)
{
try {
uint threads_y = std::min(THREADS_Y, nextpow2(out.info.dims[dim]));
@@ -212,9 +178,10 @@ namespace kernel
if (groups_all[dim] == 1) {
- scan_dim_fn<Ti, To, op, dim, true>(out, out, in,
- threads_y,
- groups_all);
+ scan_dim_launcher<Ti, To, op>(out, out, in,
+ dim, true,
+ threads_y,
+ groups_all);
} else {
Param tmp = out;
@@ -229,27 +196,31 @@ namespace kernel
// FIXME: Do I need to free this ?
tmp.data = bufferAlloc(tmp_elements * sizeof(To));
- scan_dim_fn<Ti, To, op, dim, false>(out, tmp, in,
- threads_y,
- groups_all);
+ scan_dim_launcher<Ti, To, op>(out, tmp, in,
+ dim, false,
+ threads_y,
+ groups_all);
int gdim = groups_all[dim];
groups_all[dim] = 1;
if (op == af_notzero_t) {
- scan_dim_fn<To, To, af_add_t, dim, true>(tmp, tmp, tmp,
- threads_y,
- groups_all);
+ scan_dim_launcher<To, To, af_add_t>(tmp, tmp, tmp,
+ dim, true,
+ threads_y,
+ groups_all);
} else {
- scan_dim_fn<To, To, op, dim, true>(tmp, tmp, tmp,
- threads_y,
- groups_all);
+ scan_dim_launcher<To, To, op>(tmp, tmp, tmp,
+ dim, true,
+ threads_y,
+ groups_all);
}
groups_all[dim] = gdim;
- bcast_dim_fn<To, To, op, dim, true>(out, tmp,
- threads_y,
- groups_all);
+ bcast_dim_launcher<To, To, op>(out, tmp,
+ dim, true,
+ threads_y,
+ groups_all);
bufferFree(tmp.data);
}
} catch (cl::Error err) {
diff --git a/src/backend/opencl/kernel/scan_first.hpp b/src/backend/opencl/kernel/scan_first.hpp
index b9521fd..d7a284d 100644
--- a/src/backend/opencl/kernel/scan_first.hpp
+++ b/src/backend/opencl/kernel/scan_first.hpp
@@ -19,6 +19,7 @@
#include <Param.hpp>
#include <debug_opencl.hpp>
#include <type_util.hpp>
+#include <cache.hpp>
#include "names.hpp"
#include "config.hpp"
#include <memory.hpp>
@@ -36,62 +37,80 @@ namespace opencl
namespace kernel
{
- template<typename Ti, typename To, af_op_t op, bool isFinalPass, uint threads_x>
- static Kernel* get_scan_first_kernels(int kerIdx)
+ template<typename Ti, typename To, af_op_t op>
+ static Kernel get_scan_first_kernels(int kerIdx, bool isFinalPass, uint threads_x)
{
- static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
- static std::map<int, Program*> scanProgs;
- static std::map<int, Kernel* > scanKerns;
- static std::map<int, Kernel* > bcastKerns;
-
- int device= getActiveDeviceId();
-
- std::call_once(compileFlags[device], [device] () {
-
- const uint threads_y = THREADS_PER_GROUP / threads_x;
- const uint SHARED_MEM_SIZE = THREADS_PER_GROUP;
-
- Binary<To, op> scan;
- ToNum<To> toNum;
-
- std::ostringstream options;
- options << " -D To=" << dtype_traits<To>::getName()
- << " -D Ti=" << dtype_traits<Ti>::getName()
- << " -D T=To"
- << " -D DIMX=" << threads_x
- << " -D DIMY=" << threads_y
- << " -D SHARED_MEM_SIZE=" << SHARED_MEM_SIZE
- << " -D init=" << toNum(scan.init())
- << " -D " << binOpName<op>()
- << " -D CPLX=" << af::iscplx<Ti>()
- << " -D isFinalPass=" << (int)(isFinalPass);
- if (std::is_same<Ti, double>::value ||
- std::is_same<Ti, cdouble>::value) {
- options << " -D USE_DOUBLE";
- }
-
- const char *ker_strs[] = {ops_cl, scan_first_cl};
- const int ker_lens[] = {ops_cl_len, scan_first_cl_len};
- cl::Program prog;
- buildProgram(prog, 2, ker_strs, ker_lens, options.str());
- scanProgs[device] = new Program(prog);
-
- scanKerns[device] = new Kernel(*scanProgs[device], "scan_first_kernel");
- bcastKerns[device] = new Kernel(*scanProgs[device], "bcast_first_kernel");
-
- });
-
- return (kerIdx == 0) ? scanKerns[device] : bcastKerns[device];
+ std::string ref_name =
+ std::string("scan_0_") +
+ std::string("_") +
+ std::to_string(isFinalPass) +
+ std::string("_") +
+ std::string(dtype_traits<Ti>::getName()) +
+ std::string("_") +
+ std::string(dtype_traits<To>::getName()) +
+ std::string("_") +
+ std::to_string(op) +
+ std::string("_") +
+ std::to_string(threads_x);
+
+ int device = getActiveDeviceId();
+ kc_t::iterator idx = kernelCaches[device].find(ref_name);
+
+ kc_entry_t entry;
+ if (idx == kernelCaches[device].end()) {
+
+ const uint threads_y = THREADS_PER_GROUP / threads_x;
+ const uint SHARED_MEM_SIZE = THREADS_PER_GROUP;
+
+ Binary<To, op> scan;
+ ToNum<To> toNum;
+
+ std::ostringstream options;
+ options << " -D To=" << dtype_traits<To>::getName()
+ << " -D Ti=" << dtype_traits<Ti>::getName()
+ << " -D T=To"
+ << " -D DIMX=" << threads_x
+ << " -D DIMY=" << threads_y
+ << " -D SHARED_MEM_SIZE=" << SHARED_MEM_SIZE
+ << " -D init=" << toNum(scan.init())
+ << " -D " << binOpName<op>()
+ << " -D CPLX=" << af::iscplx<Ti>()
+ << " -D isFinalPass=" << (int)(isFinalPass);
+ if (std::is_same<Ti, double>::value ||
+ std::is_same<Ti, cdouble>::value) {
+ options << " -D USE_DOUBLE";
+ }
+
+ const char *ker_strs[] = {ops_cl, scan_first_cl};
+ const int ker_lens[] = {ops_cl_len, scan_first_cl_len};
+ cl::Program prog;
+ buildProgram(prog, 2, ker_strs, ker_lens, options.str());
+
+ entry.prog = new Program(prog);
+ entry.ker = new Kernel[2];
+
+ entry.ker[0] = Kernel(*entry.prog, "scan_first_kernel");
+ entry.ker[1] = Kernel(*entry.prog, "bcast_first_kernel");
+
+ kernelCaches[device][ref_name] = entry;
+
+ } else {
+ entry = idx->second;
+ }
+
+ return entry.ker[kerIdx];
}
- template<typename Ti, typename To, af_op_t op, bool isFinalPass, uint threads_x>
+ template<typename Ti, typename To, af_op_t op>
static void scan_first_launcher(Param &out,
Param &tmp,
const Param &in,
+ const bool isFinalPass,
const uint groups_x,
- const uint groups_y)
+ const uint groups_y,
+ const uint threads_x)
{
- Kernel* ker = get_scan_first_kernels<Ti, To, op, isFinalPass, threads_x>(0);
+ Kernel ker = get_scan_first_kernels<Ti, To, op>(0, isFinalPass, threads_x);
NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
NDRange global(groups_x * out.info.dims[2] * local[0],
@@ -102,7 +121,7 @@ namespace kernel
auto scanOp = make_kernel<Buffer, KParam,
Buffer, KParam,
Buffer, KParam,
- uint, uint, uint>(*ker);
+ uint, uint, uint>(ker);
scanOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info, *tmp.data, tmp.info, *in.data, in.info,
@@ -111,14 +130,16 @@ namespace kernel
CL_DEBUG_FINISH(getQueue());
}
- template<typename Ti, typename To, af_op_t op, bool isFinalPass, uint threads_x>
+ template<typename Ti, typename To, af_op_t op>
static void bcast_first_launcher(Param &out,
Param &tmp,
+ const bool isFinalPass,
const uint groups_x,
- const uint groups_y)
+ const uint groups_y,
+ const uint threads_x)
{
- Kernel* ker = get_scan_first_kernels<Ti, To, op, isFinalPass, threads_x>(1);
+ Kernel ker = get_scan_first_kernels<Ti, To, op>(1, isFinalPass, threads_x);
NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
NDRange global(groups_x * out.info.dims[2] * local[0],
@@ -128,7 +149,7 @@ namespace kernel
auto bcastOp = make_kernel<Buffer, KParam,
Buffer, KParam,
- uint, uint, uint>(*ker);
+ uint, uint, uint>(ker);
bcastOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info, *tmp.data, tmp.info,
@@ -138,56 +159,6 @@ namespace kernel
}
- template<typename Ti, typename To, af_op_t op, bool isFinalPass>
- static void scan_first_fn(Param &out,
- Param &tmp,
- const Param &in,
- const uint groups_x,
- const uint groups_y,
- const uint threads_x)
- {
-
- switch (threads_x) {
- case 32:
- (scan_first_launcher<Ti, To, op, isFinalPass, 32>)(
- out, tmp, in, groups_x, groups_y); break;
- case 64:
- (scan_first_launcher<Ti, To, op, isFinalPass, 64>)(
- out, tmp, in, groups_x, groups_y); break;
- case 128:
- (scan_first_launcher<Ti, To, op, isFinalPass, 128>)(
- out, tmp, in, groups_x, groups_y); break;
- case 256:
- (scan_first_launcher<Ti, To, op, isFinalPass, 256>)(
- out, tmp, in, groups_x, groups_y); break;
- }
-
- }
-
- template<typename Ti, typename To, af_op_t op, bool isFinalPass>
- static void bcast_first_fn(Param &out,
- Param &tmp,
- const uint groups_x,
- const uint groups_y,
- const uint threads_x)
- {
-
- switch (threads_x) {
- case 32:
- (bcast_first_launcher<Ti, To, op, isFinalPass, 32>)(
- out, tmp, groups_x, groups_y); break;
- case 64:
- (bcast_first_launcher<Ti, To, op, isFinalPass, 64>)(
- out, tmp, groups_x, groups_y); break;
- case 128:
- (bcast_first_launcher<Ti, To, op, isFinalPass, 128>)(
- out, tmp, groups_x, groups_y); break;
- case 256:
- (bcast_first_launcher<Ti, To, op, isFinalPass, 256>)(
- out, tmp, groups_x, groups_y); break;
- }
- }
-
template<typename Ti, typename To, af_op_t op>
static void scan_first(Param &out, const Param &in)
{
@@ -199,7 +170,8 @@ namespace kernel
uint groups_y = divup(out.info.dims[1], threads_y);
if (groups_x == 1) {
- scan_first_fn<Ti, To, op, true>(out, out, in,
+ scan_first_launcher<Ti, To, op>(out, out, in,
+ true,
groups_x, groups_y,
threads_x);
@@ -216,21 +188,25 @@ namespace kernel
tmp.data = bufferAlloc(tmp_elements * sizeof(To));
- scan_first_fn<Ti, To, op, false>(out, tmp, in,
- groups_x, groups_y,
- threads_x);
+ scan_first_launcher<Ti, To, op>(out, tmp, in,
+ false,
+ groups_x, groups_y,
+ threads_x);
if (op == af_notzero_t) {
- scan_first_fn<To, To, af_add_t, true>(tmp, tmp, tmp,
+ scan_first_launcher<To, To, af_add_t>(tmp, tmp, tmp,
+ true,
1, groups_y,
threads_x);
} else {
- scan_first_fn<To, To, op, true>(tmp, tmp, tmp,
+ scan_first_launcher<To, To, op>(tmp, tmp, tmp,
+ true,
1, groups_y,
threads_x);
}
- bcast_first_fn<To, To, op, true>(out, tmp,
+ bcast_first_launcher<To, To, op>(out, tmp,
+ true,
groups_x,
groups_y,
threads_x);
diff --git a/src/backend/opencl/kernel/where.hpp b/src/backend/opencl/kernel/where.hpp
index 4b6d7e7..2cbf8c1 100644
--- a/src/backend/opencl/kernel/where.hpp
+++ b/src/backend/opencl/kernel/where.hpp
@@ -126,9 +126,10 @@ namespace kernel
int otmp_elements = otmp.info.strides[3] * otmp.info.dims[3];
otmp.data = bufferAlloc(otmp_elements * sizeof(uint));
- scan_first_fn<T, uint, af_notzero_t, false>(otmp, rtmp, in,
- groups_x, groups_y,
- threads_x);
+ scan_first_launcher<T, uint, af_notzero_t>(otmp, rtmp, in,
+ false,
+ groups_x, groups_y,
+ threads_x);
// Linearize the dimensions and perform scan
Param ltmp = rtmp;
diff --git a/src/backend/opencl/scan.cpp b/src/backend/opencl/scan.cpp
index ff657a4..74375da 100644
--- a/src/backend/opencl/scan.cpp
+++ b/src/backend/opencl/scan.cpp
@@ -28,12 +28,12 @@ namespace opencl
try {
Param Out = out;
Param In = in;
- switch (dim) {
- case 0: kernel::scan_first<Ti, To, op >(Out, In); break;
- case 1: kernel::scan_dim <Ti, To, op, 1>(Out, In); break;
- case 2: kernel::scan_dim <Ti, To, op, 2>(Out, In); break;
- case 3: kernel::scan_dim <Ti, To, op, 3>(Out, In); break;
- }
+
+ if (dim == 0)
+ kernel::scan_first<Ti, To, op>(Out, In);
+ else
+ kernel::scan_dim <Ti, To, op>(Out, In, dim);
+
} catch (cl::Error &ex) {
CL_TO_AF_ERROR(ex);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list