[arrayfire] 268/408: FEAT: Adding support to unwrap along rows as well as columns

Mon Sep 21 19:12:11 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit 2d60aed2d75163f816cba4c2271ef39cae55125d
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date:   Fri Aug 14 16:29:59 2015 -0400

    FEAT: Adding support to unwrap along rows as well as columns
    
    - updated tests to test for unwrapping along rows
---
 include/af/image.h                   |  8 ++-
 src/api/c/unwrap.cpp                 | 27 ++++++-----
 src/api/cpp/unwrap.cpp               |  5 +-
 src/backend/cpu/unwrap.cpp           | 38 +++++++++------
 src/backend/cpu/unwrap.hpp           |  3 +-
 src/backend/cuda/kernel/unwrap.hpp   | 94 +++++++++++++++++++++++++-----------
 src/backend/cuda/unwrap.cu           | 24 ++++-----
 src/backend/cuda/unwrap.hpp          |  3 +-
 src/backend/opencl/kernel/unwrap.cl  | 50 +++++++++----------
 src/backend/opencl/kernel/unwrap.hpp | 37 ++++++++------
 src/backend/opencl/unwrap.cpp        | 21 ++++----
 src/backend/opencl/unwrap.hpp        |  3 +-
 test/unwrap.cpp                      | 25 ++++++----
 13 files changed, 193 insertions(+), 145 deletions(-)

diff --git a/include/af/image.h b/include/af/image.h
index 4f7227c..6bdbfb4 100644
--- a/include/af/image.h
+++ b/include/af/image.h
@@ -483,12 +483,14 @@ AFAPI array dog(const array& in, const int radius1, const int radius2);
    \param[in]  sy is the stride along 1st-dimension
    \param[in]  px is the padding along 0th-dimension between [0, wx). Padding is applied both before and after.
    \param[in]  py is the padding along 1st-dimension between [0, wy). Padding is applied both before and after.
+   \param[in]  is_column specifies the layout for the unwrapped patch. If is_column is false, the unrapped patch is laid out as a row.
    \returns    an array with the image blocks as columns
 
    \ingroup image_func_unwrap
 */
 AFAPI array unwrap(const array& in, const dim_t wx, const dim_t wy,
-                   const dim_t sx, const dim_t sy, const dim_t px=0, const dim_t py=0);
+                   const dim_t sx, const dim_t sy, const dim_t px=0, const dim_t py=0,
+                   const bool is_column = true);
 
 
 /**
@@ -996,13 +998,15 @@ extern "C" {
        \param[in]  sy is the stride along 1st-dimension
        \param[in]  px is the padding along 0th-dimension between [0, wx). Padding is applied both before and after.
        \param[in]  py is the padding along 1st-dimension between [0, wy). Padding is applied both before and after.
+       \param[in]  is_column specifies the layout for the unwrapped patch. If is_column is false, the unrapped patch is laid out as a row.
        \return     \ref AF_SUCCESS if the color transformation is successful,
        otherwise an appropriate error code is returned.
 
        \ingroup image_func_unwrap
     */
     AFAPI af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t wy,
-                           const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+                           const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
+                           const bool is_column);
 
     /**
        C Interface wrapper for summed area tables
diff --git a/src/api/c/unwrap.cpp b/src/api/c/unwrap.cpp
index 3816874..5bb6fcd 100644
--- a/src/api/c/unwrap.cpp
+++ b/src/api/c/unwrap.cpp
@@ -20,13 +20,14 @@ using namespace detail;
 
 template<typename T>
 static inline af_array unwrap(const af_array in, const dim_t wx, const dim_t wy,
-                              const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+                              const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
+                              const bool is_column)
 {
-    return getHandle(unwrap<T>(getArray<T>(in), wx, wy, sx, sy, px, py));
+    return getHandle(unwrap<T>(getArray<T>(in), wx, wy, sx, sy, px, py, is_column));
 }
 
 af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t wy,
-                 const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+                 const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
 {
     try {
         ArrayInfo info = getInfo(in);
@@ -43,16 +44,16 @@ af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t w
         af_array output;
 
         switch(type) {
-            case f32: output = unwrap<float  >(in, wx, wy, sx, sy, px, py);  break;
-            case f64: output = unwrap<double >(in, wx, wy, sx, sy, px, py);  break;
-            case c32: output = unwrap<cfloat >(in, wx, wy, sx, sy, px, py);  break;
-            case c64: output = unwrap<cdouble>(in, wx, wy, sx, sy, px, py);  break;
-            case s32: output = unwrap<int    >(in, wx, wy, sx, sy, px, py);  break;
-            case u32: output = unwrap<uint   >(in, wx, wy, sx, sy, px, py);  break;
-            case s64: output = unwrap<intl   >(in, wx, wy, sx, sy, px, py);  break;
-            case u64: output = unwrap<uintl  >(in, wx, wy, sx, sy, px, py);  break;
-            case u8:  output = unwrap<uchar  >(in, wx, wy, sx, sy, px, py);  break;
-            case b8:  output = unwrap<char   >(in, wx, wy, sx, sy, px, py);  break;
+            case f32: output = unwrap<float  >(in, wx, wy, sx, sy, px, py, is_column);  break;
+            case f64: output = unwrap<double >(in, wx, wy, sx, sy, px, py, is_column);  break;
+            case c32: output = unwrap<cfloat >(in, wx, wy, sx, sy, px, py, is_column);  break;
+            case c64: output = unwrap<cdouble>(in, wx, wy, sx, sy, px, py, is_column);  break;
+            case s32: output = unwrap<int    >(in, wx, wy, sx, sy, px, py, is_column);  break;
+            case u32: output = unwrap<uint   >(in, wx, wy, sx, sy, px, py, is_column);  break;
+            case s64: output = unwrap<intl   >(in, wx, wy, sx, sy, px, py, is_column);  break;
+            case u64: output = unwrap<uintl  >(in, wx, wy, sx, sy, px, py, is_column);  break;
+            case u8:  output = unwrap<uchar  >(in, wx, wy, sx, sy, px, py, is_column);  break;
+            case b8:  output = unwrap<char   >(in, wx, wy, sx, sy, px, py, is_column);  break;
             default:  TYPE_ERROR(1, type);
         }
         std::swap(*out,output);
diff --git a/src/api/cpp/unwrap.cpp b/src/api/cpp/unwrap.cpp
index 7a1a103..d48d312 100644
--- a/src/api/cpp/unwrap.cpp
+++ b/src/api/cpp/unwrap.cpp
@@ -14,11 +14,10 @@
 namespace af
 {
     array unwrap(const array& in, const dim_t wx, const dim_t wy,
-                 const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+                 const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
     {
         af_array out = 0;
-        AF_THROW(af_unwrap(&out, in.get(), wx, wy, sx, sy, px, py));
+        AF_THROW(af_unwrap(&out, in.get(), wx, wy, sx, sy, px, py, is_column));
         return array(out);
     }
 }
-
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp
index 6d206b2..0de292e 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/unwrap.cpp
@@ -16,11 +16,11 @@
 
 namespace cpu
 {
-    template<typename T>
-    void unwrap_(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
-                 const af::dim4 &ostrides, const af::dim4 &istrides,
-                 const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
-                 const dim_t px, const dim_t py)
+    template<typename T, int d>
+    void unwrap_dim(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
+                    const af::dim4 &ostrides, const af::dim4 &istrides,
+                    const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
+                    const dim_t px, const dim_t py)
     {
         dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
 
@@ -30,11 +30,11 @@ namespace cpu
                 dim_t cOut = w * ostrides[3] + z * ostrides[2];
                 dim_t cIn  = w * istrides[3] + z * istrides[2];
                 const T* iptr = inPtr  + cIn;
-                      T* optr_= outPtr + cOut;
+                T* optr_= outPtr + cOut;
 
-                for(dim_t col = 0; col < odims[1]; col++) {
+                for(dim_t col = 0; col < odims[d]; col++) {
                     // Offset output ptr
-                    T* optr = optr_ + col * ostrides[1];
+                    T* optr = optr_ + col * ostrides[d];
 
                     // Calculate input window index
                     dim_t winy = (col / nx);
@@ -56,7 +56,9 @@ namespace cpu
                             dim_t xpad = spx + x;
                             dim_t ypad = spy + y;
 
-                            dim_t oloc = (y * wx + x) * ostrides[0];
+                            dim_t oloc = (y * wx + x);
+                            if (d == 0) oloc *= ostrides[1];
+
                             if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) {
                                 dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]);
                                 optr[oloc] = iptr[iloc];
@@ -72,7 +74,7 @@ namespace cpu
 
     template<typename T>
     Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
     {
         af::dim4 idims = in.dims();
 
@@ -81,25 +83,32 @@ namespace cpu
 
         af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
 
+        if (!is_column) {
+            std::swap(odims[0], odims[1]);
+        }
+
         // Create output placeholder
         Array<T> outArray = createEmptyArray<T>(odims);
 
         // Get pointers to raw data
         const T *inPtr = in.get();
-              T *outPtr = outArray.get();
+        T *outPtr = outArray.get();
 
         af::dim4 ostrides = outArray.strides();
         af::dim4 istrides = in.strides();
 
-        unwrap_(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py);
-
+        if (is_column) {
+            unwrap_dim<T, 1>(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py);
+        } else {
+            unwrap_dim<T, 0>(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py);
+        }
         return outArray;
     }
 
 
 #define INSTANTIATE(T)                                                                  \
     template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy,    \
-                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
 
 
     INSTANTIATE(float)
@@ -113,4 +122,3 @@ namespace cpu
     INSTANTIATE(uchar)
     INSTANTIATE(char)
 }
-
diff --git a/src/backend/cpu/unwrap.hpp b/src/backend/cpu/unwrap.hpp
index 7b5ea75..447fcfe 100644
--- a/src/backend/cpu/unwrap.hpp
+++ b/src/backend/cpu/unwrap.hpp
@@ -13,6 +13,5 @@ namespace cpu
 {
     template<typename T>
     Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
 }
-
diff --git a/src/backend/cuda/kernel/unwrap.hpp b/src/backend/cuda/kernel/unwrap.hpp
index e012340..410d94f 100644
--- a/src/backend/cuda/kernel/unwrap.hpp
+++ b/src/backend/cuda/kernel/unwrap.hpp
@@ -12,19 +12,20 @@
 #include <err_cuda.hpp>
 #include <debug_cuda.hpp>
 #include <math.hpp>
+#include "config.hpp"
 
 namespace cuda
 {
     namespace kernel
     {
         ///////////////////////////////////////////////////////////////////////////
-        // Resize Kernel
+        // Unwrap Kernel
         ///////////////////////////////////////////////////////////////////////////
-        template<typename T, int TX>
+        template<typename T, bool is_column>
         __global__
         void unwrap_kernel(Param<T> out, CParam<T> in,
                            const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
-                           const dim_t px, const dim_t py, const dim_t nx, dim_t repsPerColumn)
+                           const dim_t px, const dim_t py, const dim_t nx, dim_t reps)
         {
             // Compute channel and volume
             const dim_t w = blockIdx.y / in.dims[2];
@@ -38,69 +39,106 @@ namespace cuda
             const dim_t cIn  = w *  in.strides[3] + z *  in.strides[2];
 
             // Compute the output column index
-            const dim_t colId = blockIdx.x * blockDim.y + threadIdx.y;
+            const dim_t id = is_column ?
+                (blockIdx.x * blockDim.y + threadIdx.y) :
+                (blockIdx.x * blockDim.x + threadIdx.x);
 
-            if(colId >= out.dims[1])
-                return;
+            if (id >= (is_column ? out.dims[1] : out.dims[0])) return;
 
             // Compute the starting index of window in x and y of input
-            const dim_t startx = (colId % nx) * sx;
-            const dim_t starty = (colId / nx) * sy;
+            const dim_t startx = (id % nx) * sx;
+            const dim_t starty = (id / nx) * sy;
 
             const dim_t spx = startx - px;
             const dim_t spy = starty - py;
 
             // Offset the global pointers to the respective starting indices
-                  T* optr = out.ptr + cOut + colId * out.strides[1];
+            T* optr = out.ptr + cOut + id * (is_column ? out.strides[1] : 1);
             const T* iptr = in.ptr  + cIn;
 
             bool cond = (spx >= 0 && spx + wx < in.dims[0] && spy >= 0 && spy + wy < in.dims[1]);
 
-            for(int i = 0; i < repsPerColumn; i++) {
+            for(int i = 0; i < reps; i++) {
+
                 // Compute output index local to column
-                const dim_t colIndex = i * TX + threadIdx.x;
+                const dim_t outIdx = is_column ?
+                    (i * blockDim.x + threadIdx.x) :
+                    (i * blockDim.y + threadIdx.y);
 
-                if(colIndex >= out.dims[0])
+                if(outIdx >= (is_column ? out.dims[0] : out.dims[1]))
                     return;
 
                 // Compute input index local to window
-                const dim_t x = colIndex % wx;
-                const dim_t y = colIndex / wx;
+                const dim_t x = outIdx % wx;
+                const dim_t y = outIdx / wx;
 
                 const dim_t xpad = spx + x;
                 const dim_t ypad = spy + y;
 
-                const dim_t outIdx = (y * wx + x) * out.strides[0];
-
                 // Copy
                 T val = scalar<T>(0.0);
                 if(cond || (xpad >= 0 && xpad < in.dims[0] && ypad >= 0 && ypad < in.dims[1])) {
                     const dim_t inIdx = ypad * in.strides[1] + xpad;
                     val = iptr[inIdx];
                 }
-                optr[outIdx] = val;
+
+                if (is_column) {
+                    optr[outIdx] = val;
+                } else {
+                    optr[outIdx * out.strides[1]] = val;
+                }
             }
         }
 
         ///////////////////////////////////////////////////////////////////////////
         // Wrapper functions
         ///////////////////////////////////////////////////////////////////////////
-        template <typename T, int TX>
-        void unwrap(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const dim_t nx)
+        template <typename T>
+        void unwrap_col(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
+                        const dim_t sx, const dim_t sy,
+                        const dim_t px, const dim_t py, const dim_t nx)
         {
-            dim3 threads(TX, 256 / TX, 1);
-
-            dim_t repsPerColumn = 1;
-            if(TX == 256 && wx * wy > 256) {
-                repsPerColumn = divup((wx * wy), 256);
-            }
+            dim_t TX = std::min(THREADS_PER_BLOCK, nextpow2(out.dims[0]));
 
+            dim3 threads(TX, THREADS_PER_BLOCK / TX);
             dim3 blocks(divup(out.dims[1], threads.y), out.dims[2] * out.dims[3]);
 
-            CUDA_LAUNCH((unwrap_kernel<T, TX>), blocks, threads,
-                        out, in, wx, wy, sx, sy, px, py, nx, repsPerColumn);
+            dim_t reps = divup((wx * wy), threads.x); // is > 1 only when TX == 256 && wx * wy > 256
+
+            CUDA_LAUNCH((unwrap_kernel<T, true>), blocks, threads,
+                        out, in, wx, wy, sx, sy, px, py, nx, reps);
+
+            POST_LAUNCH_CHECK();
+        }
+
+        template<typename T>
+        void unwrap_row(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
+                        const dim_t sx, const dim_t sy,
+                        const dim_t px, const dim_t py, const dim_t nx)
+        {
+            dim3 threads(THREADS_X, THREADS_Y);
+            dim3 blocks(divup(out.dims[0], threads.x), out.dims[2] * out.dims[3]);
+
+            dim_t reps = divup((wx * wy), threads.y);
+
+            CUDA_LAUNCH((unwrap_kernel<T, false>), blocks, threads,
+                        out, in, wx, wy, sx, sy, px, py, nx, reps);
+
             POST_LAUNCH_CHECK();
         }
+
+        template <typename T>
+        void unwrap(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
+                    const dim_t sx, const dim_t sy,
+                    const dim_t px, const dim_t py, const dim_t nx, const bool is_column)
+        {
+
+            if (is_column) {
+                unwrap_col<T>(out, in, wx, wy, sx, sy, px, py, nx);
+            } else {
+                unwrap_row<T>(out, in, wx, wy, sx, sy, px, py, nx);
+            }
+        }
+
     }
 }
diff --git a/src/backend/cuda/unwrap.cu b/src/backend/cuda/unwrap.cu
index 3164e87..5fdfc0e 100644
--- a/src/backend/cuda/unwrap.cu
+++ b/src/backend/cuda/unwrap.cu
@@ -17,37 +17,31 @@ namespace cuda
 {
     template<typename T>
     Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
     {
         af::dim4 idims = in.dims();
 
         dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
         dim_t ny = (idims[1] + 2 * py - wy) / sx + 1;
 
-        af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
+        af::dim4 odims;
 
-        // Create output placeholder
-        Array<T> outArray = createEmptyArray<T>(odims);
-
-        if(odims[0] <= 16) {
-            kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy, px, py, nx);
-        } else if (odims[0] <= 32) {
-            kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy, px, py, nx);
-        } else if (odims[0] <= 64) {
-            kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy, px, py, nx);
-        } else if(odims[0] <= 128) {
-            kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy, px, py, nx);
+        if (is_column) {
+            odims = dim4(wx * wy, nx * ny, idims[2], idims[3]);
         } else {
-            kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy, px, py, nx);
+            odims = dim4(nx * ny, wx * wy, idims[2], idims[3]);
         }
 
+        // Create output placeholder
+        Array<T> outArray = createEmptyArray<T>(odims);
+        kernel::unwrap<T>(outArray, in, wx, wy, sx, sy, px, py, nx, is_column);
         return outArray;
     }
 
 
 #define INSTANTIATE(T)                                                                  \
     template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy,    \
-                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
 
 
     INSTANTIATE(float)
diff --git a/src/backend/cuda/unwrap.hpp b/src/backend/cuda/unwrap.hpp
index 0217c4b..7105585 100644
--- a/src/backend/cuda/unwrap.hpp
+++ b/src/backend/cuda/unwrap.hpp
@@ -13,6 +13,5 @@ namespace cuda
 {
     template<typename T>
     Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
 }
-
diff --git a/src/backend/opencl/kernel/unwrap.cl b/src/backend/opencl/kernel/unwrap.cl
index 61aab1a..6ffd1e4 100644
--- a/src/backend/opencl/kernel/unwrap.cl
+++ b/src/backend/opencl/kernel/unwrap.cl
@@ -7,24 +7,11 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#define divup(a, b) (((a)+(b)-1)/(b))
-
-#if CPLX
-#define set(a, b) a = b
-#define set_scalar(a, b) do {                   \
-        a.x = b;                                \
-        a.y = 0;                                \
-    } while(0)
-#else
-#define set(a, b) a = b
-#define set_scalar(a, b) a = b
-#endif
-
 __kernel
 void unwrap_kernel(__global T *d_out, const KParam out,
                    __global const T *d_in, const KParam in,
                    const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
-                   const dim_t px, const dim_t py, const dim_t nx, const dim_t repsPerColumn)
+                   const dim_t px, const dim_t py, const dim_t nx, const dim_t reps)
 {
     // Compute channel and volume
     const dim_t w = get_group_id(1) / in.dims[2];
@@ -38,46 +25,53 @@ void unwrap_kernel(__global T *d_out, const KParam out,
     const dim_t cIn  = w *  in.strides[3] + z *  in.strides[2];
 
     // Compute the output column index
-    const dim_t colId = get_group_id(0) * get_local_size(1) + get_local_id(1);
+    const dim_t id = is_column ?
+        (get_group_id(0) * get_local_size(1) + get_local_id(1)) :
+        get_global_id(0);
 
-    if(colId >= out.dims[1])
-        return;
+    if (id >= (is_column ? out.dims[1] : out.dims[0])) return;
 
     // Compute the starting index of window in x and y of input
-    const dim_t startx = (colId % nx) * sx;
-    const dim_t starty = (colId / nx) * sy;
+    const dim_t startx = (id % nx) * sx;
+    const dim_t starty = (id / nx) * sy;
 
     const dim_t spx = startx - px;
     const dim_t spy = starty - py;
 
     // Offset the global pointers to the respective starting indices
-    __global       T* optr = d_out + cOut + colId * out.strides[1];
+    __global       T* optr = d_out + cOut + id * (is_column ? out.strides[1] : 1);
     __global const T* iptr = d_in  + cIn + in.offset;
 
     bool cond = (spx >= 0 && spx + wx < in.dims[0] && spy >= 0 && spy + wy < in.dims[1]);
 
-    for(int i = 0; i < repsPerColumn; i++) {
+    for(int i = 0; i < reps; i++) {
+
         // Compute output index local to column
-        const dim_t colIndex = i * TX + get_local_id(0);
+        const dim_t outIdx = is_column ?
+            (i * get_local_size(0) + get_local_id(0)) :
+            (i * get_local_size(1) + get_local_id(1));
 
-        if(colIndex >= out.dims[0])
+        if(outIdx >= (is_column ? out.dims[0] : out.dims[1]))
             return;
 
         // Compute input index local to window
-        const dim_t y = colIndex / wx;
-        const dim_t x = colIndex % wx;
+        const dim_t y = outIdx / wx;
+        const dim_t x = outIdx % wx;
 
         const dim_t xpad = spx + x;
         const dim_t ypad = spy + y;
 
-        const dim_t outIdx = (y * wx + x) * out.strides[0];
-
         // Copy
         T val = ZERO;
         if(cond || (xpad >= 0 && xpad < in.dims[0] && ypad >= 0 && ypad < in.dims[1])) {
             const dim_t inIdx = ypad * in.strides[1] + xpad * in.strides[0];
             val = iptr[inIdx];
         }
-        optr[outIdx] = val;
+
+        if (is_column) {
+            optr[outIdx] = val;
+        } else {
+            optr[outIdx * out.strides[1]] = val;
+        }
     }
 }
diff --git a/src/backend/opencl/kernel/unwrap.hpp b/src/backend/opencl/kernel/unwrap.hpp
index 51c3297..180dc8a 100644
--- a/src/backend/opencl/kernel/unwrap.hpp
+++ b/src/backend/opencl/kernel/unwrap.hpp
@@ -33,7 +33,7 @@ namespace opencl
 {
     namespace kernel
     {
-        template<typename T, int TX>
+        template<typename T, bool is_column>
         void unwrap(Param out, const Param in, const dim_t wx, const dim_t wy,
                     const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const dim_t nx)
         {
@@ -48,9 +48,9 @@ namespace opencl
 
                         ToNum<T> toNum;
                         std::ostringstream options;
-                        options << " -D ZERO=" << toNum(scalar<T>(0));
-                        options << " -D T="    << dtype_traits<T>::getName();
-                        options << " -D TX="   << TX;
+                        options << " -D is_column=" << is_column
+                                << " -D ZERO=" << toNum(scalar<T>(0))
+                                << " -D T="    << dtype_traits<T>::getName();
 
                         if((af_dtype) dtype_traits<T>::af_type == c32 ||
                            (af_dtype) dtype_traits<T>::af_type == c64) {
@@ -75,20 +75,29 @@ namespace opencl
                                       const dim_t, const dim_t, const dim_t, const dim_t>
                                       (*unwrapKernels[device]);
 
-                const dim_t TY = 256 / TX;
-                dim_t repsPerColumn = 1;
-                if(TX == 256 && wx * wy > 256) {
-                    repsPerColumn = divup((wx * wy), 256);
-                }
+                dim_t TX = 1, TY = 1;
+                dim_t BX = 1;
+                const dim_t BY = out.info.dims[2] * out.info.dims[3];
+                dim_t reps = 1;
 
-                NDRange local(TX, TY, 1);
+                if (is_column) {
+                    TX = std::min(THREADS_PER_GROUP, nextpow2(out.info.dims[0]));
+                    TY = THREADS_PER_GROUP / TX;
+                    BX = divup(out.info.dims[1], TY);
+                    reps = divup((wx * wy), TX);
+                } else {
+                    TX = THREADS_X;
+                    TY = THREADS_Y;
+                    BX = divup(out.info.dims[0], TX);
+                    reps = divup((wx * wy), TY);
+                }
 
-                NDRange global(local[0] * divup(out.info.dims[1], TY),
-                               local[1] * out.info.dims[2] * out.info.dims[3],
-                               1);
+                NDRange local(TX, TY);
+                NDRange global(local[0] * BX,
+                               local[1] * BY);
 
                 unwrapOp(EnqueueArgs(getQueue(), global, local),
-                       *out.data, out.info, *in.data, in.info, wx, wy, sx, sy, px, py, nx, repsPerColumn);
+                       *out.data, out.info, *in.data, in.info, wx, wy, sx, sy, px, py, nx, reps);
 
                 CL_DEBUG_FINISH(getQueue());
             } catch (cl::Error err) {
diff --git a/src/backend/opencl/unwrap.cpp b/src/backend/opencl/unwrap.cpp
index da8f8b3..2a1662f 100644
--- a/src/backend/opencl/unwrap.cpp
+++ b/src/backend/opencl/unwrap.cpp
@@ -17,7 +17,7 @@ namespace opencl
 {
     template<typename T>
     Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
     {
         af::dim4 idims = in.dims();
 
@@ -26,19 +26,17 @@ namespace opencl
 
         af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
 
+        if (!is_column) {
+            std::swap(odims[0], odims[1]);
+        }
+
         // Create output placeholder
         Array<T> outArray = createEmptyArray<T>(odims);
 
-        if(odims[0] <= 16) {
-            kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy, px, py, nx);
-        } else if (odims[0] <= 32) {
-            kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy, px, py, nx);
-        } else if (odims[0] <= 64) {
-            kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy, px, py, nx);
-        } else if(odims[0] <= 128) {
-            kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy, px, py, nx);
+        if (is_column) {
+            kernel::unwrap<T, true >(outArray, in, wx, wy, sx, sy, px, py, nx);
         } else {
-            kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy, px, py, nx);
+            kernel::unwrap<T, false>(outArray, in, wx, wy, sx, sy, px, py, nx);
         }
 
         return outArray;
@@ -47,7 +45,7 @@ namespace opencl
 
 #define INSTANTIATE(T)                                                                  \
     template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy,    \
-                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
 
 
     INSTANTIATE(float)
@@ -61,4 +59,3 @@ namespace opencl
     INSTANTIATE(uchar)
     INSTANTIATE(char)
 }
-
diff --git a/src/backend/opencl/unwrap.hpp b/src/backend/opencl/unwrap.hpp
index 40efb4c..d8d3d55 100644
--- a/src/backend/opencl/unwrap.hpp
+++ b/src/backend/opencl/unwrap.hpp
@@ -13,6 +13,5 @@ namespace opencl
 {
     template<typename T>
     Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
 }
-
diff --git a/test/unwrap.cpp b/test/unwrap.cpp
index 0602caf..d8238c1 100644
--- a/test/unwrap.cpp
+++ b/test/unwrap.cpp
@@ -54,26 +54,34 @@ void unwrapTest(string pTestFile, const unsigned resultIdx,
 
     af_array inArray = 0;
     af_array outArray = 0;
+    af_array outArrayT = 0;
+    af_array outArray2 = 0;
 
     ASSERT_EQ(AF_SUCCESS, af_create_array(&inArray, &(in[0].front()), idims.ndims(), idims.get(), (af_dtype) af::dtype_traits<T>::af_type));
 
-    ASSERT_EQ(AF_SUCCESS, af_unwrap(&outArray, inArray, wx, wy, sx, sy, px, py));
+    ASSERT_EQ(AF_SUCCESS, af_unwrap(&outArray , inArray, wx, wy, sx, sy, px, py, true ));
+    ASSERT_EQ(AF_SUCCESS, af_unwrap(&outArrayT, inArray, wx, wy, sx, sy, px, py, false));
+    ASSERT_EQ(AF_SUCCESS, af_transpose(&outArray2, outArrayT, false));
 
-    // Get result
-    T* outData = new T[tests[resultIdx].size()];
-    ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outData, outArray));
-
-    // Compare result
     size_t nElems = tests[resultIdx].size();
+    std::vector<T> outData(nElems);
+
+    // Compare is_column == true results
+    ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)&outData[0], outArray));
     for (size_t elIter = 0; elIter < nElems; ++elIter) {
         ASSERT_EQ(tests[resultIdx][elIter], outData[elIter]) << "at: " << elIter << std::endl;
     }
 
-    // Delete
-    delete[] outData;
+    // Compare is_column == false results
+    ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)&outData[0], outArray2));
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        ASSERT_EQ(tests[resultIdx][elIter], outData[elIter]) << "at: " << elIter << std::endl;
+    }
 
     if(inArray   != 0) af_release_array(inArray);
     if(outArray  != 0) af_release_array(outArray);
+    if(outArrayT != 0) af_release_array(outArrayT);
+    if(outArray2 != 0) af_release_array(outArray2);
 }
 
 #define UNWRAP_INIT(desc, file, resultIdx, wx, wy, sx, sy, px,py)                                           \
@@ -164,4 +172,3 @@ TEST(Unwrap, CPP)
     // Delete
     delete[] outData;
 }
-

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git