[arrayfire] 08/408: Added CUDA backend for Unwrap

Mon Sep 21 19:11:03 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit 73d9038c4e3afb4dac02e4f9aec5bb56546b4bbc
Author: Shehzan Mohammed <shehzan at arrayfire.com>
Date:   Wed Jun 17 16:34:49 2015 -0400

    Added CUDA backend for Unwrap
---
 src/backend/cpu/unwrap.cpp         |  8 ++--
 src/backend/cuda/kernel/unwrap.hpp | 89 ++++++++++++++++++++++++++++++++++++++
 src/backend/cuda/unwrap.cu         | 15 ++++++-
 3 files changed, 106 insertions(+), 6 deletions(-)

diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp
index 691cf08..3d25661 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/unwrap.cpp
@@ -20,7 +20,7 @@ namespace cpu
                  const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy)
     {
         dim_t nx = (idims[0] - wx) / sx + 1;
-        dim_t ny = (idims[1] - wy) / sy + 1;
+        //dim_t ny = (idims[1] - wy) / sy + 1;
 
         for(dim_t w = 0; w < odims[3]; w++) {
             for(dim_t z = 0; z < odims[2]; z++) {
@@ -28,8 +28,8 @@ namespace cpu
                 dim_t cIn  = w * istrides[3] + z * istrides[2];
                 for(dim_t col = 0; col < odims[1]; col++) {
                     // Calculate input window index
-                    dim_t winy = (col / ny);
-                    dim_t winx = (col % ny);
+                    dim_t winy = (col / nx);
+                    dim_t winx = (col % nx);
 
                     dim_t startx = winx * sx;
                     dim_t starty = winy * sy;
@@ -39,7 +39,7 @@ namespace cpu
 
                     for(dim_t y = 0; y < wy; y++) {
                         for(dim_t x = 0; x < wx; x++) {
-                            dim_t oloc = (y * wy + x) * ostrides[0];
+                            dim_t oloc = (y * wx + x) * ostrides[0];
                             dim_t iloc = (y * istrides[1] + x * istrides[0]);
                             optr[oloc] = iptr[iloc];
                         }
diff --git a/src/backend/cuda/kernel/unwrap.hpp b/src/backend/cuda/kernel/unwrap.hpp
new file mode 100644
index 0000000..9ac0663
--- /dev/null
+++ b/src/backend/cuda/kernel/unwrap.hpp
@@ -0,0 +1,89 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <dispatch.hpp>
+#include <Param.hpp>
+#include <err_cuda.hpp>
+#include <debug_cuda.hpp>
+#include <math.hpp>
+
+namespace cuda
+{
+    namespace kernel
+    {
+        ///////////////////////////////////////////////////////////////////////////
+        // Resize Kernel
+        ///////////////////////////////////////////////////////////////////////////
+        template<typename T, int threads>
+        __global__
+        void unwrap_kernel(Param<T> out, CParam<T> in,
+                           const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
+                           dim_t repsPerColumn)
+        {
+            const dim_t w = blockIdx.y / in.dims[2];
+            const dim_t z = blockIdx.y % in.dims[2];
+
+            if(w >= in.dims[3] || z >= in.dims[2])
+                return;
+
+            const dim_t cOut = w * out.strides[3] + z * out.strides[2];
+            const dim_t cIn  = w *  in.strides[3] + z *  in.strides[2];
+
+            const dim_t nx = (in.dims[0] - wx) / sx + 1;
+            //dim_t ny = (in.dims[1] - wy) / sy + 1;
+
+            const dim_t colId = blockIdx.x * blockDim.y + threadIdx.y;
+
+            if(colId >= out.dims[1])
+                return;
+
+            const dim_t startx = (colId % nx) * sx;
+            const dim_t starty = (colId / nx) * sy;
+
+                  T* optr = out.ptr + cOut + colId * out.strides[1];
+            const T* iptr = in.ptr  + cIn  + starty * in.strides[1] + startx;
+
+            for(int i = 0; i < repsPerColumn; i++) {
+                const dim_t colIndex = i * threads + threadIdx.x;
+
+                if(colIndex >= out.dims[0])
+                    return;
+
+                const dim_t x = colIndex % wx;
+                const dim_t y = colIndex / wx;
+
+                const dim_t outIdx = (y * wx + x) * out.strides[0];
+                const dim_t inIdx = y * in.strides[1] + x * in.strides[0];
+
+                optr[outIdx] = iptr[inIdx];
+            }
+        }
+
+        ///////////////////////////////////////////////////////////////////////////
+        // Wrapper functions
+        ///////////////////////////////////////////////////////////////////////////
+        template <typename T, int TX>
+        void unwrap(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
+                    const dim_t sx, const dim_t sy)
+        {
+            dim3 threads(TX, 256 / TX, 1);
+
+            dim_t repsPerColumn = 1;
+            if(TX == 256 && wx * wy > 256) {
+                repsPerColumn = (wx * wy) / 256;
+            }
+
+            dim3 blocks(divup(out.dims[1], threads.y), out.dims[2] * out.dims[3]);
+
+            unwrap_kernel<T, TX><<<blocks, threads>>>(out, in, wx, wy, sx, sy, repsPerColumn);
+            POST_LAUNCH_CHECK();
+        }
+    }
+}
+
diff --git a/src/backend/cuda/unwrap.cu b/src/backend/cuda/unwrap.cu
index d44ffd8..b180420 100644
--- a/src/backend/cuda/unwrap.cu
+++ b/src/backend/cuda/unwrap.cu
@@ -9,7 +9,7 @@
 
 #include <Array.hpp>
 #include <unwrap.hpp>
-//#include <kernel/unwrap.hpp>
+#include <kernel/unwrap.hpp>
 #include <stdexcept>
 #include <err_cuda.hpp>
 
@@ -29,6 +29,18 @@ namespace cuda
         // Create output placeholder
         Array<T> outArray = createEmptyArray<T>(odims);
 
+        if(odims[0] <= 16) {
+            kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy);
+        } else if (odims[0] <= 32) {
+            kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy);
+        } else if (odims[0] <= 64) {
+            kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy);
+        } else if(odims[0] <= 128) {
+            kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy);
+        } else {
+            kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy);
+        }
+
         return outArray;
     }
 
@@ -49,4 +61,3 @@ namespace cuda
     INSTANTIATE(uchar)
     INSTANTIATE(char)
 }
-

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git