[arrayfire] 283/408: FEAT: wrap for CUDA backend

Mon Sep 21 19:12:14 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit 1909fa5e11a710bea56729860d5ead19629c88ca
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date:   Tue Aug 18 21:39:15 2015 -0400

    FEAT: wrap for CUDA backend
    
    - Additionally cleaned up wrap tests to be more modular
---
 src/backend/cuda/kernel/wrap.hpp | 113 +++++++++++++++++++++++++++++++++++++++
 src/backend/cuda/wrap.cu         |   8 ++-
 test/wrap.cpp                    | 107 ++++++++++++++++++------------------
 3 files changed, 174 insertions(+), 54 deletions(-)

diff --git a/src/backend/cuda/kernel/wrap.hpp b/src/backend/cuda/kernel/wrap.hpp
new file mode 100644
index 0000000..e02dcb2
--- /dev/null
+++ b/src/backend/cuda/kernel/wrap.hpp
@@ -0,0 +1,113 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <dispatch.hpp>
+#include <Param.hpp>
+#include <err_cuda.hpp>
+#include <debug_cuda.hpp>
+#include <math.hpp>
+#include "config.hpp"
+#include "atomics.hpp"
+
+namespace cuda
+{
+    namespace kernel
+    {
+
+        ///////////////////////////////////////////////////////////////////////////
+        // Wrap Kernel
+        ///////////////////////////////////////////////////////////////////////////
+        template<typename T, bool is_column>
+        __global__
+        void wrap_kernel(Param<T> out, CParam<T> in,
+                         const dim_t wx, const dim_t wy,
+                         const dim_t sx, const dim_t sy,
+                         const dim_t px, const dim_t py,
+                         const dim_t nx, const dim_t ny,
+                         dim_t blocks_x,
+                         dim_t blocks_y)
+        {
+            dim_t idx2 = blockIdx.x / blocks_x;
+            dim_t idx3 = blockIdx.y / blocks_y;
+
+            dim_t blockIdx_x = blockIdx.x - idx2 * blocks_x;
+            dim_t blockIdx_y = blockIdx.y - idx3 * blocks_y;
+
+            dim_t oidx0 = threadIdx.x + blockDim.x * blockIdx_x;
+            dim_t oidx1 = threadIdx.y + blockDim.y * blockIdx_y;
+
+                  T *optr = out.ptr + idx2 * out.strides[2] + idx3 * out.strides[3];
+            const T *iptr =  in.ptr + idx2 *  in.strides[2] + idx3 *  in.strides[3];
+
+
+            if (oidx0 >= out.dims[0] || oidx1 >= out.dims[1]) return;
+
+            dim_t pidx0 = oidx0 + px;
+            dim_t pidx1 = oidx1 + py;
+
+            // The last time a value appears in the unwrapped index is padded_index / stride
+            // Each previous index has the value appear "stride" locations earlier
+            // We work our way back from the last index
+
+            const dim_t x_end = min(pidx0 / sx, nx - 1);
+            const dim_t y_end = min(pidx1 / sy, ny - 1);
+
+            const dim_t x_off = pidx0 - sx * x_end;
+            const dim_t y_off = pidx1 - sy * y_end;
+
+            T val = scalar<T>(0);
+            dim_t idx = 1;
+
+            for (dim_t y = y_end, yo = y_off; y >= 0 && yo < wy; yo += sy, y--) {
+                dim_t win_end_y = yo * wx;
+                dim_t dim_end_y = y * nx;
+
+                for (dim_t x = x_end, xo = x_off; x >= 0 && xo < wx; xo += sx, x--) {
+
+                    dim_t win_end = win_end_y + xo;
+                    dim_t dim_end = dim_end_y + x;
+
+                    if (is_column) {
+                        idx = dim_end * in.strides[1] + win_end;
+                    } else {
+                        idx = dim_end + win_end * in.strides[1];
+                    }
+
+                    val = val + iptr[idx];
+                }
+            }
+
+            optr[oidx1 * out.strides[1] + oidx0] = val;
+        }
+
+        template <typename T>
+        void wrap(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
+                  const dim_t sx, const dim_t sy,
+                  const dim_t px, const dim_t py,
+                  const bool is_column)
+        {
+            dim_t nx = (out.dims[0] + 2 * px - wx) / sx + 1;
+            dim_t ny = (out.dims[1] + 2 * py - wy) / sy + 1;
+
+            dim3 threads(THREADS_X, THREADS_Y);
+            dim_t blocks_x = divup(out.dims[0], threads.x);
+            dim_t blocks_y = divup(out.dims[1], threads.y);
+
+            dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);
+
+            if (is_column) {
+                CUDA_LAUNCH((wrap_kernel<T, true >), blocks, threads,
+                            out, in, wx, wy, sx, sy, px, py, nx, ny, blocks_x, blocks_y);
+            } else {
+                CUDA_LAUNCH((wrap_kernel<T, false>), blocks, threads,
+                            out, in, wx, wy, sx, sy, px, py, nx, ny, blocks_x, blocks_y);
+            }
+        }
+    }
+}
diff --git a/src/backend/cuda/wrap.cu b/src/backend/cuda/wrap.cu
index f7da711..a1e70fc 100644
--- a/src/backend/cuda/wrap.cu
+++ b/src/backend/cuda/wrap.cu
@@ -13,6 +13,7 @@
 #include <err_cuda.hpp>
 #include <dispatch.hpp>
 #include <math.hpp>
+#include <kernel/wrap.hpp>
 
 namespace cuda
 {
@@ -25,7 +26,12 @@ namespace cuda
                   const dim_t px, const dim_t py,
                   const bool is_column)
     {
-        CUDA_NOT_SUPPORTED();
+        af::dim4 idims = in.dims();
+        af::dim4 odims(ox, oy, idims[2], idims[3]);
+        Array<T> out = createValueArray<T>(odims, scalar<T>(0));
+
+        kernel::wrap<T>(out, in, wx, wy, sx, sy, px, py, is_column);
+        return out;
     }
 
 
diff --git a/test/wrap.cpp b/test/wrap.cpp
index 3d5f735..8d26a5a 100644
--- a/test/wrap.cpp
+++ b/test/wrap.cpp
@@ -70,7 +70,8 @@ template<typename T>
 void wrapTest(const dim_t ix, const dim_t iy,
               const dim_t wx, const dim_t wy,
               const dim_t sx, const dim_t sy,
-              const dim_t px, const dim_t py)
+              const dim_t px, const dim_t py,
+              bool cond)
 {
     if (noDoubleTests<T>()) return;
 
@@ -110,69 +111,69 @@ void wrapTest(const dim_t ix, const dim_t iy,
 
     af::array factor(ix, iy, &h_factor[0]);
 
-    for (int i = 0; i <= 1; i++)
-    {
-        bool cond = i == 0;
-        af::array in_dim = af::unwrap(in, wx, wy, sx, sy, px, py, cond);
-        af::array res_dim = af::wrap(in_dim, ix, iy, wx, wy, sx, sy, px, py, cond);
+    af::array in_dim = af::unwrap(in, wx, wy, sx, sy, px, py, cond);
+    af::array res_dim = af::wrap(in_dim, ix, iy, wx, wy, sx, sy, px, py, cond);
 
-        ASSERT_EQ(in.elements(), res_dim.elements());
+    ASSERT_EQ(in.elements(), res_dim.elements());
 
-        std::vector<T> h_res(ix * iy);
-        res_dim.host(&h_res[0]);
+    std::vector<T> h_res(ix * iy);
+    res_dim.host(&h_res[0]);
 
-        for (int n = 0; n < nc; n++) {
-            T *iptr = &h_in[n * ix * iy];
-            T *rptr = &h_res[n * ix * iy];
+    for (int n = 0; n < nc; n++) {
+        T *iptr = &h_in[n * ix * iy];
+        T *rptr = &h_res[n * ix * iy];
 
-            for (int y = 0; y < iy; y++) {
-                for (int x = 0; x < ix; x++) {
+        for (int y = 0; y < iy; y++) {
+            for (int x = 0; x < ix; x++) {
 
-                    // FIXME: Use a better test
-                    T ival = iptr[y * ix + x];
-                    T rval = rptr[y * ix + x];
-                    int factor = h_factor[y * ix + x];
+                // FIXME: Use a better test
+                T ival = iptr[y * ix + x];
+                T rval = rptr[y * ix + x];
+                int factor = h_factor[y * ix + x];
 
-                    if (abs(ival) == 0) continue;
+                if (abs(ival) == 0) continue;
 
-                    ASSERT_NEAR(get_val<T>(ival * factor), get_val<T>(rval), 1E-5)
-                        << "at " << x << "," << y << std::endl;
-                }
+                ASSERT_NEAR(get_val<T>(ival * factor), get_val<T>(rval), 1E-5)
+                    << "at " << x << "," << y <<  " for cond  == " << cond << std::endl;
             }
-
         }
+
     }
 }
 
-#define WRAP_INIT(desc, ix, iy, wx, wy, sx, sy, px,py)          \
-    TYPED_TEST(Wrap, desc)                                      \
-    {                                                           \
-        wrapTest<TypeParam>(ix, iy, wx, wy, sx, sy, px, py);    \
+#define WRAP_INIT(desc, ix, iy, wx, wy, sx, sy, px,py)              \
+    TYPED_TEST(Wrap, Col##desc)                                     \
+    {                                                               \
+        wrapTest<TypeParam>(ix, iy, wx, wy, sx, sy, px, py, true ); \
+    }                                                               \
+    TYPED_TEST(Wrap, Row##desc)                                     \
+    {                                                               \
+        wrapTest<TypeParam>(ix, iy, wx, wy, sx, sy, px, py, false); \
     }
 
-    WRAP_INIT(Wrap00, 300, 100,  3,  3,  1,  1,  0,  0);
-    WRAP_INIT(Wrap01, 300, 100,  3,  3,  1,  1,  1,  1);
-    WRAP_INIT(Wrap03, 300, 100,  3,  3,  2,  2,  0,  0);
-    WRAP_INIT(Wrap04, 300, 100,  3,  3,  2,  2,  1,  1);
-    WRAP_INIT(Wrap05, 300, 100,  3,  3,  2,  2,  2,  2);
-    WRAP_INIT(Wrap06, 300, 100,  3,  3,  3,  3,  0,  0);
-    WRAP_INIT(Wrap07, 300, 100,  3,  3,  3,  3,  1,  1);
-    WRAP_INIT(Wrap08, 300, 100,  3,  3,  3,  3,  2,  2);
-    WRAP_INIT(Wrap09, 300, 100,  4,  4,  1,  1,  0,  0);
-    WRAP_INIT(Wrap13, 300, 100,  4,  4,  2,  2,  0,  0);
-    WRAP_INIT(Wrap14, 300, 100,  4,  4,  2,  2,  1,  1);
-    WRAP_INIT(Wrap15, 300, 100,  4,  4,  2,  2,  2,  2);
-    WRAP_INIT(Wrap16, 300, 100,  4,  4,  2,  2,  3,  3);
-    WRAP_INIT(Wrap17, 300, 100,  4,  4,  4,  4,  0,  0);
-    WRAP_INIT(Wrap18, 300, 100,  4,  4,  4,  4,  1,  1);
-    WRAP_INIT(Wrap19, 300, 100,  4,  4,  4,  4,  2,  2);
-    WRAP_INIT(Wrap27, 300, 100,  8,  8,  8,  8,  0,  0);
-    WRAP_INIT(Wrap28, 300, 100,  8,  8,  8,  8,  7,  7);
-    WRAP_INIT(Wrap31, 300, 100, 12, 12, 12, 12,  0,  0);
-    WRAP_INIT(Wrap32, 300, 100, 12, 12, 12, 12,  2,  2);
-    WRAP_INIT(Wrap35, 300, 100, 16, 16, 16, 16, 15, 15);
-    WRAP_INIT(Wrap36, 300, 100, 31, 31,  8,  8, 15, 15);
-    WRAP_INIT(Wrap39, 300, 100,  8, 12,  8, 12,  0,  0);
-    WRAP_INIT(Wrap40, 300, 100,  8, 12,  8, 12,  7, 11);
-    WRAP_INIT(Wrap43, 300, 100, 15, 10, 15, 10,  0,  0);
-    WRAP_INIT(Wrap44, 300, 100, 15, 10, 15, 10, 14,  9);
+    WRAP_INIT(00, 300, 100,  3,  3,  1,  1,  0,  0);
+    WRAP_INIT(01, 300, 100,  3,  3,  1,  1,  1,  1);
+    WRAP_INIT(03, 300, 100,  3,  3,  2,  2,  0,  0);
+    WRAP_INIT(04, 300, 100,  3,  3,  2,  2,  1,  1);
+    WRAP_INIT(05, 300, 100,  3,  3,  2,  2,  2,  2);
+    WRAP_INIT(06, 300, 100,  3,  3,  3,  3,  0,  0);
+    WRAP_INIT(07, 300, 100,  3,  3,  3,  3,  1,  1);
+    WRAP_INIT(08, 300, 100,  3,  3,  3,  3,  2,  2);
+    WRAP_INIT(09, 300, 100,  4,  4,  1,  1,  0,  0);
+    WRAP_INIT(13, 300, 100,  4,  4,  2,  2,  0,  0);
+    WRAP_INIT(14, 300, 100,  4,  4,  2,  2,  1,  1);
+    WRAP_INIT(15, 300, 100,  4,  4,  2,  2,  2,  2);
+    WRAP_INIT(16, 300, 100,  4,  4,  2,  2,  3,  3);
+    WRAP_INIT(17, 300, 100,  4,  4,  4,  4,  0,  0);
+    WRAP_INIT(18, 300, 100,  4,  4,  4,  4,  1,  1);
+    WRAP_INIT(19, 300, 100,  4,  4,  4,  4,  2,  2);
+    WRAP_INIT(27, 300, 100,  8,  8,  8,  8,  0,  0);
+    WRAP_INIT(28, 300, 100,  8,  8,  8,  8,  7,  7);
+    WRAP_INIT(31, 300, 100, 12, 12, 12, 12,  0,  0);
+    WRAP_INIT(32, 300, 100, 12, 12, 12, 12,  2,  2);
+    WRAP_INIT(35, 300, 100, 16, 16, 16, 16, 15, 15);
+    WRAP_INIT(36, 300, 100, 31, 31,  8,  8, 15, 15);
+    WRAP_INIT(39, 300, 100,  8, 12,  8, 12,  0,  0);
+    WRAP_INIT(40, 300, 100,  8, 12,  8, 12,  7, 11);
+    WRAP_INIT(43, 300, 100, 15, 10, 15, 10,  0,  0);
+    WRAP_INIT(44, 300, 100, 15, 10, 15, 10, 14,  9);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git