[arrayfire] 213/408: FEAT: Select added for CUDA backend
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:00 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 97c916af2c0b985126638bce2af9de553ca8c328
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date: Sat Aug 8 20:50:22 2015 -0400
FEAT: Select added for CUDA backend
---
src/backend/cuda/kernel/memcopy.hpp | 1 +
src/backend/cuda/kernel/select.hpp | 155 ++++++++++++++++++++++++++++++++++++
src/backend/cuda/select.cu | 5 +-
3 files changed, 159 insertions(+), 2 deletions(-)
diff --git a/src/backend/cuda/kernel/memcopy.hpp b/src/backend/cuda/kernel/memcopy.hpp
index 2246419..4d5d192 100644
--- a/src/backend/cuda/kernel/memcopy.hpp
+++ b/src/backend/cuda/kernel/memcopy.hpp
@@ -187,6 +187,7 @@ namespace kernel
dim3 threads(DIMX, DIMY);
size_t local_size[] = {DIMX, DIMY};
+ //FIXME: Why isn't threads being updated??
local_size[0] *= local_size[1];
if (ndims == 1) {
local_size[1] = 1;
diff --git a/src/backend/cuda/kernel/select.hpp b/src/backend/cuda/kernel/select.hpp
new file mode 100644
index 0000000..ab5bf2d
--- /dev/null
+++ b/src/backend/cuda/kernel/select.hpp
@@ -0,0 +1,155 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <dispatch.hpp>
+#include <Param.hpp>
+#include <err_cuda.hpp>
+#include <debug_cuda.hpp>
+#include <math.hpp>
+
+namespace cuda
+{
+ namespace kernel
+ {
+
+ static const uint DIMX = 32;
+ static const uint DIMY = 8;
+
+    // Computes the linear element offset into an array that may be broadcast
+    // along some dimensions, relative to the reference (output) shape.
+    // For each dimension d, index id_d contributes id_d * strides[d] only when
+    // the array actually spans that dimension (dims[d] == refdims[d]); a
+    // broadcast dimension contributes nothing.  Dimension 0 is assumed to be
+    // unit-stride (idx is not scaled by strides[0]), matching the kernels below.
+    // FIX: previously the per-thread indices were ignored entirely, so every
+    // thread of a broadcast launch read the very same element.  The indices
+    // default to 0 to keep the old call signature compiling.
+    __device__ __host__
+    int getOffset(dim_t *dims, dim_t *strides, dim_t *refdims,
+                  int idx = 0, int idy = 0, int idz = 0, int idw = 0)
+    {
+        int off = 0;
+        off += idw * (dims[3] == refdims[3]) * strides[3];
+        off += idz * (dims[2] == refdims[2]) * strides[2];
+        off += idy * (dims[1] == refdims[1]) * strides[1];
+        off += idx * (dims[0] == refdims[0]);
+        return off;
+    }
+
+    // Element-wise select: out[i] = cond[i] ? a[i] : b[i].
+    // Grid layout: gridDim.x = blk_x * out.dims[2], gridDim.y = blk_y * out.dims[3];
+    // each blk_x-by-blk_y tile of blocks covers one (dims[0], dims[1]) slice.
+    // When is_same is true, a, b and cond are assumed to share out's shape and
+    // strides and reuse out's linear offset; otherwise per-array broadcast
+    // offsets are computed via getOffset().
+    template<typename T, bool is_same>
+    __global__
+    void select_kernel(Param<T> out, CParam<char> cond,
+                       CParam<T> a, CParam<T> b, int blk_x, int blk_y)
+    {
+        const int idz = blockIdx.x / blk_x;
+        const int idw = blockIdx.y / blk_y;
+
+        const int blockIdx_x = blockIdx.x - idz * blk_x;
+        // FIX: was "blockIdx.y - idz * blk_y"; the y grid is tiled by idw.
+        const int blockIdx_y = blockIdx.y - idw * blk_y;
+
+        const int idx = blockIdx_x * blockDim.x + threadIdx.x;
+        const int idy = blockIdx_y * blockDim.y + threadIdx.y;
+
+        const int off = idw * out.strides[3] + idz * out.strides[2] +
+                        idy * out.strides[1] + idx;
+
+        T *optr = out.ptr + off;
+
+        const T *aptr = a.ptr;
+        const T *bptr = b.ptr;
+        const char *cptr = cond.ptr;
+
+        if (is_same) {
+            // Fast path: one offset serves all four arrays.
+            aptr += off;
+            bptr += off;
+            cptr += off;
+        } else {
+            // FIX: pass the per-thread indices so each thread gets its own
+            // broadcast-aware offset (previously constant across threads).
+            aptr += getOffset(a.dims,    a.strides,    out.dims, idx, idy, idz, idw);
+            bptr += getOffset(b.dims,    b.strides,    out.dims, idx, idy, idz, idw);
+            cptr += getOffset(cond.dims, cond.strides, out.dims, idx, idy, idz, idw);
+        }
+
+        if (idx < out.dims[0] && idy < out.dims[1] && idz < out.dims[2] && idw < out.dims[3]) {
+            *optr = (*cptr) ? *aptr : *bptr;
+        }
+    }
+
+    // Host wrapper for select_kernel.
+    // Builds a 2D grid tiled over out.dims[2] (x) and out.dims[3] (y) and
+    // dispatches the fast path when a and b have identical shapes.
+    // ndims == 1 collapses the block to a single row of DIMX*DIMY threads.
+    template<typename T>
+    void select(Param<T> out, CParam<char> cond, CParam<T> a, CParam<T> b, int ndims)
+    {
+        bool same_dims = true;
+        for (int d = 0; d < 4; d++) {
+            same_dims = same_dims && (a.dims[d] == b.dims[d]);
+        }
+
+        dim3 threads(DIMX, DIMY);
+        if (ndims == 1) {
+            threads.x *= threads.y;
+            threads.y  = 1;
+        }
+
+        const int blk_x = divup(out.dims[0], threads.x);
+        const int blk_y = divup(out.dims[1], threads.y);
+
+        const dim3 blocks(blk_x * out.dims[2], blk_y * out.dims[3]);
+
+        if (same_dims) {
+            CUDA_LAUNCH((select_kernel<T, true >), blocks, threads,
+                        out, cond, a, b, blk_x, blk_y);
+        } else {
+            CUDA_LAUNCH((select_kernel<T, false>), blocks, threads,
+                        out, cond, a, b, blk_x, blk_y);
+        }
+    }
+
+    // Element-wise select against a scalar:
+    //   out[i] = cond[i] ? a[i] : b      (flip == false)
+    //   out[i] = cond[i] ? b    : a[i]   (flip == true, via the XOR below)
+    // Same grid layout as select_kernel (gridDim tiled by out.dims[2]/[3]).
+    // a and cond are assumed to share out's shape and strides: both reuse
+    // out's linear offset.
+    template<typename T, bool flip>
+    __global__
+    void select_scalar_kernel(Param<T> out, CParam<char> cond,
+                              CParam<T> a, T b, int blk_x, int blk_y)
+    {
+        const int idz = blockIdx.x / blk_x;
+        const int idw = blockIdx.y / blk_y;
+
+        const int blockIdx_x = blockIdx.x - idz * blk_x;
+        // FIX: was "blockIdx.y - idz * blk_y"; the y grid is tiled by idw.
+        const int blockIdx_y = blockIdx.y - idw * blk_y;
+
+        const int idx = blockIdx_x * blockDim.x + threadIdx.x;
+        const int idy = blockIdx_y * blockDim.y + threadIdx.y;
+
+        const int off = idw * out.strides[3] + idz * out.strides[2] + idy * out.strides[1] + idx;
+
+        T *optr = out.ptr + off;
+
+        const T *aptr = a.ptr + off;
+        const char *cptr = cond.ptr + off;
+
+        if (idx < out.dims[0] && idy < out.dims[1] && idz < out.dims[2] && idw < out.dims[3]) {
+            // NOTE(review): "^ flip" is only a logical negation if cond holds
+            // strictly 0/1 values -- confirm that upstream guarantees this.
+            *optr = ((*cptr) ^ flip) ? *aptr : b;
+        }
+    }
+
+    // Host wrapper for select_scalar_kernel.
+    // b is converted to T via scalar<T>() at launch time.
+    // ndims == 1 collapses the block to a single row of DIMX*DIMY threads.
+    template<typename T, bool flip>
+    void select_scalar(Param<T> out, CParam<char> cond, CParam<T> a, const double b, int ndims)
+    {
+        dim3 threads(DIMX, DIMY);
+
+        if (ndims == 1) {
+            threads.x *= threads.y;
+            threads.y = 1;
+        }
+
+        int blk_x = divup(out.dims[0], threads.x);
+        int blk_y = divup(out.dims[1], threads.y);
+
+        // FIX: the grid must be tiled by out.dims[2]/out.dims[3] -- that is
+        // what the kernel's "idz = blockIdx.x / blk_x" / "idw = blockIdx.y / blk_y"
+        // decoding expects (and what select() above does).  It was previously
+        // "blk_x * threads.x, blk_y * threads.y", which both over-launches and
+        // leaves dims[2]/dims[3] slices unwritten.
+        dim3 blocks(blk_x * out.dims[2],
+                    blk_y * out.dims[3]);
+
+        CUDA_LAUNCH((select_scalar_kernel<T, flip>), blocks, threads,
+                    out, cond, a, scalar<T>(b), blk_x, blk_y);
+    }
+ }
+}
diff --git a/src/backend/cuda/select.cu b/src/backend/cuda/select.cu
index 5204057..eb90730 100644
--- a/src/backend/cuda/select.cu
+++ b/src/backend/cuda/select.cu
@@ -10,19 +10,20 @@
#include <Array.hpp>
#include <select.hpp>
#include <err_cuda.hpp>
+#include <kernel/select.hpp>
namespace cuda
{
template<typename T>
void select(Array<T> &out, const Array<char> &cond, const Array<T> &a, const Array<T> &b)
{
- CUDA_NOT_SUPPORTED();
+ // Dispatch to the CUDA kernel wrapper; out.ndims() lets the launcher
+ // collapse the thread block to 1D for vector inputs.
+ kernel::select<T>(out, cond, a, b, out.ndims());
}
template<typename T, bool flip>
void select_scalar(Array<T> &out, const Array<char> &cond, const Array<T> &a, const double &b)
{
- CUDA_NOT_SUPPORTED();
+ // Dispatch to the CUDA kernel wrapper; flip selects which operand the
+ // condition picks, and out.ndims() allows a 1D launch for vectors.
+ kernel::select_scalar<T, flip>(out, cond, a, b, out.ndims());
}
#define INSTANTIATE(T) \
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list