[arrayfire] 323/408: Templated options are now runtime compile options for opencl indexed min/max

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:22 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit bf69efc7ae56921caa4ac6044438e05ddf6fbd7e
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date:   Mon Aug 24 06:17:24 2015 -0400

    Templated options are now runtime compile options for opencl indexed min/max
---
 src/backend/opencl/kernel/ireduce.hpp | 163 +++++++++++++++++-----------------
 1 file changed, 80 insertions(+), 83 deletions(-)

diff --git a/src/backend/opencl/kernel/ireduce.hpp b/src/backend/opencl/kernel/ireduce.hpp
index 122e664..92f4be3 100644
--- a/src/backend/opencl/kernel/ireduce.hpp
+++ b/src/backend/opencl/kernel/ireduce.hpp
@@ -19,6 +19,7 @@
 #include <traits.hpp>
 #include <dispatch.hpp>
 #include <Param.hpp>
+#include <cache.hpp>
 #include <debug_opencl.hpp>
 #include <type_util.hpp>
 #include "names.hpp"
@@ -40,17 +41,31 @@ namespace opencl
 namespace kernel
 {
 
-    template<typename T, af_op_t op, int dim, bool is_first, int threads_y>
+    template<typename T, af_op_t op>
     void ireduce_dim_launcher(Param out, cl::Buffer *oidx,
                               Param in, cl::Buffer *iidx,
+                              const int dim,
+                              const int threads_y,
+                              const bool is_first,
                               const uint groups_all[4])
     {
-        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
-        static std::map<int, Program*> ireduceProgs;
-        static std::map<int, Kernel*> ireduceKerns;
-
-        int device= getActiveDeviceId();
-        std::call_once(compileFlags[device], [device] () {
+        std::string ref_name =
+            std::string("ireduce_") +
+            std::to_string(dim) +
+            std::string("_") +
+            std::string(dtype_traits<T>::getName()) +
+            std::string("_") +
+            std::to_string(op) +
+            std::string("_") +
+            std::to_string(is_first) +
+            std::string("_") +
+            std::to_string(threads_y);
+
+        int device = getActiveDeviceId();
+        kc_t::iterator idx = kernelCaches[device].find(ref_name);
+
+        kc_entry_t entry;
+        if (idx == kernelCaches[device].end()) {
 
                 Binary<T, op> ireduce;
                 ToNum<T> toNum;
@@ -74,10 +89,13 @@ namespace kernel
                 const int   ker_lens[] = {iops_cl_len, ireduce_dim_cl_len};
                 Program prog;
                 buildProgram(prog, 2, ker_strs, ker_lens, options.str());
-                ireduceProgs[device] = new Program(prog);
+                entry.prog = new Program(prog);
+                entry.ker = new Kernel(*entry.prog, "ireduce_dim_kernel");
 
-                ireduceKerns[device] = new Kernel(*ireduceProgs[device], "ireduce_dim_kernel");
-            });
+                kernelCaches[device][ref_name] = entry;
+        } else {
+            entry = idx->second;
+        }
 
         NDRange local(THREADS_X, threads_y);
         NDRange global(groups_all[0] * groups_all[2] * local[0],
@@ -85,7 +103,7 @@ namespace kernel
 
         auto ireduceOp = make_kernel<Buffer, KParam, Buffer,
                                      Buffer, KParam, Buffer,
-                                     uint, uint, uint>(*ireduceKerns[device]);
+                                     uint, uint, uint>(*entry.ker);
 
         ireduceOp(EnqueueArgs(getQueue(), global, local),
                   *out.data, out.info, *oidx,
@@ -97,20 +115,6 @@ namespace kernel
         CL_DEBUG_FINISH(getQueue());
     }
 
-    template<typename T, af_op_t op, int dim, bool is_first>
-    void ireduce_dim_fn(Param out, cl::Buffer *oidx, Param in, cl::Buffer *iidx,
-                        const uint threads_y, const uint groups_all[4])
-    {
-        switch(threads_y) {
-        case  8: return ireduce_dim_launcher<T, op, dim, is_first,  8>(out, oidx, in, iidx, groups_all);
-        case  4: return ireduce_dim_launcher<T, op, dim, is_first,  4>(out, oidx, in, iidx, groups_all);
-        case  2: return ireduce_dim_launcher<T, op, dim, is_first,  2>(out, oidx, in, iidx, groups_all);
-        case  1: return ireduce_dim_launcher<T, op, dim, is_first,  1>(out, oidx, in, iidx, groups_all);
-        case 16: return ireduce_dim_launcher<T, op, dim, is_first, 16>(out, oidx, in, iidx, groups_all);
-        case 32: return ireduce_dim_launcher<T, op, dim, is_first, 32>(out, oidx, in, iidx, groups_all);
-        }
-    }
-
     template<typename T, af_op_t op, int dim>
     void ireduce_dim(Param out, cl::Buffer *oidx, Param in)
     {
@@ -139,56 +143,70 @@ namespace kernel
             for (int k = dim + 1; k < 4; k++) tmp.info.strides[k] *= groups_all[dim];
         }
 
-        ireduce_dim_fn<T, op, dim, true>(tmp, tidx, in, tidx, threads_y, groups_all);
+        ireduce_dim_launcher<T, op>(tmp, tidx, in, tidx, dim, threads_y, true, groups_all);
 
         if (groups_all[dim] > 1) {
             groups_all[dim] = 1;
 
-            ireduce_dim_fn<T, op, dim, false>(out, oidx, tmp, tidx, threads_y, groups_all);
+            ireduce_dim_launcher<T, op>(out, oidx, tmp, tidx, dim, threads_y, false, groups_all);
             bufferFree(tmp.data);
             bufferFree(tidx);
         }
 
     }
 
-    template<typename T, af_op_t op, bool is_first, int threads_x>
+    template<typename T, af_op_t op>
     void ireduce_first_launcher(Param out, cl::Buffer *oidx,
                                 Param in, cl::Buffer *iidx,
+                                const int threads_x,
+                                const bool is_first,
                                 const uint groups_x,
                                 const uint groups_y)
     {
-        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
-        static std::map<int, Program*> ireduceProgs;
-        static std::map<int, Kernel*>  ireduceKerns;
-
-        int device= getActiveDeviceId();
-        std::call_once(compileFlags[device], [device] () {
-
-                Binary<T, op> ireduce;
-                ToNum<T> toNum;
-
-                std::ostringstream options;
-                options << " -D T=" << dtype_traits<T>::getName()
-                        << " -D DIMX=" << threads_x
-                        << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
-                        << " -D init=" << toNum(ireduce.init())
-                        << " -D " << binOpName<op>()
-                        << " -D CPLX=" << af::iscplx<T>()
-                        << " -D IS_FIRST=" << is_first;
-
-                if (std::is_same<T, double>::value ||
-                    std::is_same<T, cdouble>::value) {
-                    options << " -D USE_DOUBLE";
-                }
+        std::string ref_name =
+            std::string("ireduce_0_") +
+            std::string(dtype_traits<T>::getName()) +
+            std::string("_") +
+            std::to_string(op) +
+            std::string("_") +
+            std::to_string(is_first) +
+            std::string("_") +
+            std::to_string(threads_x);
+
+        int device = getActiveDeviceId();
+        kc_t::iterator idx = kernelCaches[device].find(ref_name);
+
+        kc_entry_t entry;
+        if (idx == kernelCaches[device].end()) {
+
+            Binary<T, op> ireduce;
+            ToNum<T> toNum;
+
+            std::ostringstream options;
+            options << " -D T=" << dtype_traits<T>::getName()
+                    << " -D DIMX=" << threads_x
+                    << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
+                    << " -D init=" << toNum(ireduce.init())
+                    << " -D " << binOpName<op>()
+                    << " -D CPLX=" << af::iscplx<T>()
+                    << " -D IS_FIRST=" << is_first;
+
+            if (std::is_same<T, double>::value ||
+                std::is_same<T, cdouble>::value) {
+                options << " -D USE_DOUBLE";
+            }
 
-                const char *ker_strs[] = {iops_cl, ireduce_first_cl};
-                const int   ker_lens[] = {iops_cl_len, ireduce_first_cl_len};
-                Program prog;
-                buildProgram(prog, 2, ker_strs, ker_lens, options.str());
-                ireduceProgs[device] = new Program(prog);
+            const char *ker_strs[] = {iops_cl, ireduce_first_cl};
+            const int   ker_lens[] = {iops_cl_len, ireduce_first_cl_len};
+            Program prog;
+            buildProgram(prog, 2, ker_strs, ker_lens, options.str());
+            entry.prog = new Program(prog);
+            entry.ker = new Kernel(*entry.prog, "ireduce_first_kernel");
 
-                ireduceKerns[device] = new Kernel(*ireduceProgs[device], "ireduce_first_kernel");
-            });
+            kernelCaches[device][ref_name] = entry;
+        } else {
+            entry = idx->second;
+        }
 
         NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
         NDRange global(groups_x * in.info.dims[2] * local[0],
@@ -198,7 +216,7 @@ namespace kernel
 
         auto ireduceOp = make_kernel<Buffer, KParam, Buffer,
                                      Buffer, KParam, Buffer,
-                                     uint, uint, uint>(*ireduceKerns[device]);
+                                     uint, uint, uint>(*entry.ker);
 
         ireduceOp(EnqueueArgs(getQueue(), global, local),
                   *out.data, out.info, *oidx,
@@ -208,27 +226,6 @@ namespace kernel
         CL_DEBUG_FINISH(getQueue());
     }
 
-    template<typename T, af_op_t op, bool is_first>
-    void ireduce_first_fn(Param out, cl::Buffer *oidx,
-                          Param in, cl::Buffer *iidx,
-                          const uint groups_x,
-                          const uint groups_y,
-                          const uint threads_x)
-    {
-        switch(threads_x) {
-        case  32: return ireduce_first_launcher<T, op, is_first,  32>(out, oidx, in, iidx, groups_x,
-                                                            groups_y);
-        case  64: return ireduce_first_launcher<T, op, is_first,  64>(out, oidx, in, iidx, groups_x,
-                                                            groups_y);
-        case 128: return ireduce_first_launcher<T, op, is_first, 128>(out, oidx, in, iidx, groups_x,
-                                                            groups_y);
-        case 256: return ireduce_first_launcher<T, op, is_first, 256>(out, oidx, in, iidx, groups_x,
-                                                            groups_y);
-        case 512: return ireduce_first_launcher<T, op, is_first, 512>(out, oidx, in, iidx, groups_x,
-                                                                      groups_y);
-        }
-    }
-
     template<typename T, af_op_t op>
     void ireduce_first(Param out, cl::Buffer *oidx, Param in)
     {
@@ -261,10 +258,10 @@ namespace kernel
             for (int k = 1; k < 4; k++) tmp.info.strides[k] *= groups_x;
         }
 
-        ireduce_first_fn<T, op, true>(tmp, tidx, in, tidx, groups_x, groups_y, threads_x);
+        ireduce_first_launcher<T, op>(tmp, tidx, in, tidx, threads_x, true, groups_x, groups_y);
 
         if (groups_x > 1) {
-            ireduce_first_fn<T, op, false>(out, oidx, tmp, tidx, 1, groups_y, threads_x);
+            ireduce_first_launcher<T, op>(out, oidx, tmp, tidx, threads_x, false, 1, groups_y);
 
             bufferFree(tmp.data);
             bufferFree(tidx);
@@ -376,7 +373,7 @@ namespace kernel
                 tmp.data = bufferAlloc(tmp_elements * sizeof(T));
                 cl::Buffer *tidx = bufferAlloc(tmp_elements * sizeof(uint));
 
-                ireduce_first_fn<T, op, true>(tmp, tidx, in, tidx, groups_x, groups_y, threads_x);
+                ireduce_first_launcher<T, op>(tmp, tidx, in, tidx, threads_x, true, groups_x, groups_y);
 
                 unique_ptr<T> h_ptr(new T[tmp_elements]);
                 unique_ptr<uint> h_iptr(new uint[tmp_elements]);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git



More information about the debian-science-commits mailing list