[arrayfire] 321/408: Templated options are now runtime compile options for OpenCL reductions

Ghislain Vaillant <ghisvail-guest@moszumanska.debian.org>
Mon Sep 21 19:12:22 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit 5e0ceac384ac3ed41936e908cbff890c430559cb
Author: Pavan Yalamanchili <pavan@arrayfire.com>
Date:   Mon Aug 24 05:10:27 2015 -0400

    Templated options are now runtime compile options for OpenCL reductions
---
 src/backend/opencl/kernel/reduce.hpp | 235 ++++++++++++++++-------------------
 1 file changed, 109 insertions(+), 126 deletions(-)
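
The gist of the change: the old code baked dim and threads_y into C++ template parameters, so each combination got its own instantiation with its own std::call_once flag and static Program/Kernel maps. The new code builds a string key from those runtime values, looks it up in a per-device kernel cache, and on a miss forwards the same values to the OpenCL compiler as -D options, so a single instantiation covers every configuration. A minimal compilable sketch of the pattern follows; kc_t, kc_entry_t, kernelCaches, MAX_DEVICES, and the Program/Kernel types are simplified stand-ins for what cache.hpp and the cl.hpp wrappers actually provide, not ArrayFire's real definitions:

    #include <map>
    #include <sstream>
    #include <string>

    struct Program {};                       // stand-in for cl::Program
    struct Kernel  {};                       // stand-in for cl::Kernel

    struct kc_entry_t { Program *prog; Kernel *ker; };
    typedef std::map<std::string, kc_entry_t> kc_t;

    static const int MAX_DEVICES = 16;       // assumed device bound
    static kc_t kernelCaches[MAX_DEVICES];

    kc_entry_t getReduceKernel(int device, int dim, unsigned threads_y)
    {
        // Values that used to be template parameters now form the cache key.
        std::ostringstream key;
        key << "reduce_" << dim << "_" << threads_y;

        kc_t::iterator idx = kernelCaches[device].find(key.str());
        if (idx != kernelCaches[device].end()) return idx->second;

        // Cache miss: the same values become -D defines for the OpenCL
        // compiler, then the built program and kernel are memoized.
        std::ostringstream options;
        options << " -D dim=" << dim << " -D DIMY=" << threads_y;

        kc_entry_t entry;
        entry.prog = new Program();  // i.e. buildProgram(..., options.str())
        entry.ker  = new Kernel();   // i.e. Kernel(*entry.prog, "reduce_dim_kernel")
        kernelCaches[device][key.str()] = entry;
        return entry;
    }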

diff --git a/src/backend/opencl/kernel/reduce.hpp b/src/backend/opencl/kernel/reduce.hpp
index 094b42f..5ac55b5 100644
--- a/src/backend/opencl/kernel/reduce.hpp
+++ b/src/backend/opencl/kernel/reduce.hpp
@@ -20,6 +20,7 @@
 #include <Param.hpp>
 #include <debug_opencl.hpp>
 #include <type_util.hpp>
+#include <cache.hpp>
 #include "names.hpp"
 #include "config.hpp"
 #include <memory.hpp>
@@ -40,44 +41,61 @@ namespace opencl
 namespace kernel
 {
 
-    template<typename Ti, typename To, af_op_t op, int dim, int threads_y>
+    template<typename Ti, typename To, af_op_t op>
     void reduce_dim_launcher(Param out, Param in,
+                             const int dim,
+                             const uint threads_y,
                              const uint groups_all[4],
                              int change_nan, double nanval)
     {
-        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
-        static std::map<int, Program*> reduceProgs;
-        static std::map<int, Kernel*> reduceKerns;
+        std::string ref_name =
+            std::string("reduce_") +
+            std::to_string(dim) +
+            std::string("_") +
+            std::string(dtype_traits<Ti>::getName()) +
+            std::string("_") +
+            std::string(dtype_traits<To>::getName()) +
+            std::string("_") +
+            std::to_string(op) +
+            std::string("_") +
+            std::to_string(threads_y);
+
+        int device = getActiveDeviceId();
+        kc_t::iterator idx = kernelCaches[device].find(ref_name);
+
+        kc_entry_t entry;
+        if (idx == kernelCaches[device].end()) {
+            Binary<To, op> reduce;
+            ToNum<To> toNum;
+
+            std::ostringstream options;
+            options << " -D To=" << dtype_traits<To>::getName()
+                    << " -D Ti=" << dtype_traits<Ti>::getName()
+                    << " -D T=To"
+                    << " -D dim=" << dim
+                    << " -D DIMY=" << threads_y
+                    << " -D THREADS_X=" << THREADS_X
+                    << " -D init=" << toNum(reduce.init())
+                    << " -D " << binOpName<op>()
+                    << " -D CPLX=" << af::iscplx<Ti>();
+            if (std::is_same<Ti, double>::value ||
+                std::is_same<Ti, cdouble>::value) {
+                options << " -D USE_DOUBLE";
 
-        int device= getActiveDeviceId();
-        std::call_once(compileFlags[device], [device] () {
+            }
 
-                Binary<To, op> reduce;
-                ToNum<To> toNum;
-
-                std::ostringstream options;
-                options << " -D To=" << dtype_traits<To>::getName()
-                        << " -D Ti=" << dtype_traits<Ti>::getName()
-                        << " -D T=To"
-                        << " -D dim=" << dim
-                        << " -D DIMY=" << threads_y
-                        << " -D THREADS_X=" << THREADS_X
-                        << " -D init=" << toNum(reduce.init())
-                        << " -D " << binOpName<op>()
-                        << " -D CPLX=" << af::iscplx<Ti>();
-                if (std::is_same<Ti, double>::value ||
-                    std::is_same<Ti, cdouble>::value) {
-                    options << " -D USE_DOUBLE";
-                }
+            const char *ker_strs[] = {ops_cl, reduce_dim_cl};
+            const int   ker_lens[] = {ops_cl_len, reduce_dim_cl_len};
+            Program prog;
+            buildProgram(prog, 2, ker_strs, ker_lens, options.str());
 
-                const char *ker_strs[] = {ops_cl, reduce_dim_cl};
-                const int   ker_lens[] = {ops_cl_len, reduce_dim_cl_len};
-                Program prog;
-                buildProgram(prog, 2, ker_strs, ker_lens, options.str());
-                reduceProgs[device] = new Program(prog);
+            entry.prog = new Program(prog);
+            entry.ker = new Kernel(*entry.prog, "reduce_dim_kernel");
 
-                reduceKerns[device] = new Kernel(*reduceProgs[device], "reduce_dim_kernel");
-            });
+            kernelCaches[device][ref_name] = entry;
+        } else {
+            entry = idx->second;
+        }
 
         NDRange local(THREADS_X, threads_y);
         NDRange global(groups_all[0] * groups_all[2] * local[0],
@@ -86,7 +104,7 @@ namespace kernel
         auto reduceOp = make_kernel<Buffer, KParam,
                                     Buffer, KParam,
                                     uint, uint, uint,
-                                    int, To>(*reduceKerns[device]);
+                                    int, To>(*entry.ker);
 
         reduceOp(EnqueueArgs(getQueue(), global, local),
                  *out.data, out.info,
@@ -100,34 +118,8 @@ namespace kernel
         CL_DEBUG_FINISH(getQueue());
     }
 
-    template<typename Ti, typename To, af_op_t op, int dim>
-    void reduce_dim_fn(Param out, Param in,
-                       const uint threads_y, const uint groups_all[4],
-                       int change_nan, double nanval)
-    {
-        switch(threads_y) {
-        case  8: return reduce_dim_launcher<Ti, To, op, dim,  8>(out, in, groups_all,
-                                                                change_nan, nanval);
-
-        case  4: return reduce_dim_launcher<Ti, To, op, dim,  4>(out, in, groups_all,
-                                                                change_nan, nanval);
-
-        case  2: return reduce_dim_launcher<Ti, To, op, dim,  2>(out, in, groups_all,
-                                                                change_nan, nanval);
-
-        case  1: return reduce_dim_launcher<Ti, To, op, dim,  1>(out, in, groups_all,
-                                                                change_nan, nanval);
-
-        case 16: return reduce_dim_launcher<Ti, To, op, dim, 16>(out, in, groups_all,
-                                                                change_nan, nanval);
-
-        case 32: return reduce_dim_launcher<Ti, To, op, dim, 32>(out, in, groups_all,
-                                                                change_nan, nanval);
-        }
-    }
-
-    template<typename Ti, typename To, af_op_t op, int dim>
-    void reduce_dim(Param out, Param in, int change_nan, double nanval)
+    template<typename Ti, typename To, af_op_t op>
+    void reduce_dim(Param out, Param in, int change_nan, double nanval, int dim)
     {
         uint threads_y = std::min(THREADS_Y, nextpow2(in.info.dims[dim]));
         uint threads_x = THREADS_X;
@@ -152,61 +144,75 @@ namespace kernel
             for (int k = dim + 1; k < 4; k++) tmp.info.strides[k] *= groups_all[dim];
         }
 
-        reduce_dim_fn<Ti, To, op, dim>(tmp, in, threads_y, groups_all, change_nan, nanval);
+        reduce_dim_launcher<Ti, To, op>(tmp, in, dim, threads_y, groups_all, change_nan, nanval);
 
         if (groups_all[dim] > 1) {
             groups_all[dim] = 1;
 
             if (op == af_notzero_t) {
-                reduce_dim_fn<To, To, af_add_t, dim>(out, tmp, threads_y, groups_all,
-                                                     change_nan, nanval);
+                reduce_dim_launcher<To, To, af_add_t>(out, tmp, dim, threads_y, groups_all,
+                                                      change_nan, nanval);
             } else {
-                reduce_dim_fn<To, To,       op, dim>(out, tmp, threads_y, groups_all,
-                                                     change_nan, nanval);
+                reduce_dim_launcher<To, To,       op>(out, tmp, dim, threads_y, groups_all,
+                                                      change_nan, nanval);
             }
             bufferFree(tmp.data);
         }
 
     }
 
-    template<typename Ti, typename To, af_op_t op, int threads_x>
+    template<typename Ti, typename To, af_op_t op>
     void reduce_first_launcher(Param out, Param in,
                                const uint groups_x,
                                const uint groups_y,
+                               const uint threads_x,
                                int change_nan, double nanval)
     {
-        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
-        static std::map<int, Program*> reduceProgs;
-        static std::map<int, Kernel*>  reduceKerns;
+        std::string ref_name =
+            std::string("reduce_0_") +
+            std::string(dtype_traits<Ti>::getName()) +
+            std::string("_") +
+            std::string(dtype_traits<To>::getName()) +
+            std::string("_") +
+            std::to_string(op) +
+            std::string("_") +
+            std::to_string(threads_x);
+
+        int device = getActiveDeviceId();
+        kc_t::iterator idx = kernelCaches[device].find(ref_name);
+
+        kc_entry_t entry;
+        if (idx == kernelCaches[device].end()) {
+
+            Binary<To, op> reduce;
+            ToNum<To> toNum;
+
+            std::ostringstream options;
+            options << " -D To=" << dtype_traits<To>::getName()
+                    << " -D Ti=" << dtype_traits<Ti>::getName()
+                    << " -D T=To"
+                    << " -D DIMX=" << threads_x
+                    << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
+                    << " -D init=" << toNum(reduce.init())
+                    << " -D " << binOpName<op>()
+                    << " -D CPLX=" << af::iscplx<Ti>();
+            if (std::is_same<Ti, double>::value ||
+                std::is_same<Ti, cdouble>::value) {
+                options << " -D USE_DOUBLE";
+            }
 
-        int device= getActiveDeviceId();
-        std::call_once(compileFlags[device], [device] () {
+            const char *ker_strs[] = {ops_cl, reduce_first_cl};
+            const int   ker_lens[] = {ops_cl_len, reduce_first_cl_len};
+            Program prog;
+            buildProgram(prog, 2, ker_strs, ker_lens, options.str());
 
-                Binary<To, op> reduce;
-                ToNum<To> toNum;
-
-                std::ostringstream options;
-                options << " -D To=" << dtype_traits<To>::getName()
-                        << " -D Ti=" << dtype_traits<Ti>::getName()
-                        << " -D T=To"
-                        << " -D DIMX=" << threads_x
-                        << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
-                        << " -D init=" << toNum(reduce.init())
-                        << " -D " << binOpName<op>()
-                        << " -D CPLX=" << af::iscplx<Ti>();
-                if (std::is_same<Ti, double>::value ||
-                    std::is_same<Ti, cdouble>::value) {
-                    options << " -D USE_DOUBLE";
-                }
-
-                const char *ker_strs[] = {ops_cl, reduce_first_cl};
-                const int   ker_lens[] = {ops_cl_len, reduce_first_cl_len};
-                Program prog;
-                buildProgram(prog, 2, ker_strs, ker_lens, options.str());
-                reduceProgs[device] = new Program(prog);
+            entry.prog = new Program(prog);
+            entry.ker = new Kernel(*entry.prog, "reduce_first_kernel");
 
-                reduceKerns[device] = new Kernel(*reduceProgs[device], "reduce_first_kernel");
-            });
+            kernelCaches[device][ref_name] = entry;
+        } else {
+            entry = idx->second;
+        }
 
         NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
         NDRange global(groups_x * in.info.dims[2] * local[0],
@@ -217,7 +223,7 @@ namespace kernel
         auto reduceOp = make_kernel<Buffer, KParam,
                                     Buffer, KParam,
                                     uint, uint, uint,
-                                    int, To>(*reduceKerns[device]);
+                                    int, To>(*entry.ker);
 
         reduceOp(EnqueueArgs(getQueue(), global, local),
                  *out.data, out.info,
@@ -227,27 +233,6 @@ namespace kernel
     }
 
     template<typename Ti, typename To, af_op_t op>
-    void reduce_first_fn(Param out, Param in,
-                         const uint groups_x,
-                         const uint groups_y,
-                         const uint threads_x,
-                         int change_nan, double nanval)
-    {
-        switch(threads_x) {
-        case  32: return reduce_first_launcher<Ti, To, op,  32>(out, in, groups_x,
-                                                                groups_y, change_nan, nanval);
-        case  64: return reduce_first_launcher<Ti, To, op,  64>(out, in, groups_x,
-                                                                groups_y, change_nan, nanval);
-        case 128: return reduce_first_launcher<Ti, To, op, 128>(out, in, groups_x,
-                                                                groups_y, change_nan, nanval);
-        case 256: return reduce_first_launcher<Ti, To, op, 256>(out, in, groups_x,
-                                                                groups_y, change_nan, nanval);
-        case 512: return reduce_first_launcher<Ti, To, op, 512>(out, in, groups_x,
-                                                                groups_y, change_nan, nanval);
-        }
-    }
-
-    template<typename Ti, typename To, af_op_t op>
     void reduce_first(Param out, Param in, int change_nan, double nanval)
     {
         uint threads_x = nextpow2(std::max(32u, (uint)in.info.dims[0]));
@@ -270,15 +255,15 @@ namespace kernel
             for (int k = 1; k < 4; k++) tmp.info.strides[k] *= groups_x;
         }
 
-        reduce_first_fn<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x, change_nan, nanval);
+        reduce_first_launcher<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x, change_nan, nanval);
 
         if (groups_x > 1) {
 
             //FIXME: Is there an alternative to the if condition ?
             if (op == af_notzero_t) {
-                reduce_first_fn<To, To, af_add_t>(out, tmp, 1, groups_y, threads_x, change_nan, nanval);
+                reduce_first_launcher<To, To, af_add_t>(out, tmp, 1, groups_y, threads_x, change_nan, nanval);
             } else {
-                reduce_first_fn<To, To,       op>(out, tmp, 1, groups_y, threads_x, change_nan, nanval);
+                reduce_first_launcher<To, To,       op>(out, tmp, 1, groups_y, threads_x, change_nan, nanval);
             }
 
             bufferFree(tmp.data);
@@ -289,12 +274,10 @@ namespace kernel
     void reduce(Param out, Param in, int dim, int change_nan, double nanval)
     {
         try {
-            switch (dim) {
-            case 0: return reduce_first<Ti, To, op   >(out, in, change_nan, nanval);
-            case 1: return reduce_dim  <Ti, To, op, 1>(out, in, change_nan, nanval);
-            case 2: return reduce_dim  <Ti, To, op, 2>(out, in, change_nan, nanval);
-            case 3: return reduce_dim  <Ti, To, op, 3>(out, in, change_nan, nanval);
-            }
+            if (dim == 0)
+                return reduce_first<Ti, To, op>(out, in, change_nan, nanval);
+            else
+                return reduce_dim  <Ti, To, op>(out, in, change_nan, nanval, dim);
         } catch(cl::Error ex) {
             CL_TO_AF_ERROR(ex);
         }
@@ -342,7 +325,7 @@ namespace kernel
                 int tmp_elements = tmp.info.strides[3] * tmp.info.dims[3];
                 tmp.data = bufferAlloc(tmp_elements * sizeof(To));
 
-                reduce_first_fn<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x, change_nan, nanval);
+                reduce_first_launcher<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x, change_nan, nanval);
 
                 unique_ptr<To> h_ptr(new To[tmp_elements]);
                 getQueue().enqueueReadBuffer(*tmp.data, CL_TRUE, 0, sizeof(To) * tmp_elements, h_ptr.get());
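
With threads_x, threads_y, and dim demoted to runtime arguments, the reduce_dim_fn/reduce_first_fn switch ladders disappear, and the public reduce() entry point collapses from a four-way switch over dim to a plain branch. A toy, self-contained contrast of the two dispatch styles (launch_ct, dispatch_ct, and launch_rt are illustrative names, not ArrayFire functions):

    #include <cstdio>

    // Before (schematic): compile-time fan-out, one symbol per legal
    // value, all of them emitted whether or not they are ever used.
    template<unsigned THREADS>
    void launch_ct() { std::printf("THREADS=%u\n", THREADS); }

    void dispatch_ct(unsigned t)
    {
        switch (t) {
        case  8: return launch_ct< 8>();
        case 16: return launch_ct<16>();
        case 32: return launch_ct<32>();
        }
    }

    // After (schematic): one function; the value travels as an argument
    // and reaches the device code as a "-D DIMY=<t>" compile option.
    void launch_rt(unsigned t) { std::printf("DIMY=%u\n", t); }

    int main()
    {
        dispatch_ct(16);  // prints THREADS=16
        launch_rt(16);    // prints DIMY=16
        return 0;
    }

The trade-off: fewer C++ instantiations and a smaller host binary, in exchange for a map lookup per launch and a one-time OpenCL JIT compile the first time each configuration is requested.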

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git