[arrayfire] 16/408: Changing behavior of unwrap using padding

Mon Sep 21 19:11:06 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit 2816c018af55b1af243fbe927cdd7122cc4cd8c7
Author: Shehzan Mohammed <shehzan at arrayfire.com>
Date:   Fri Jun 19 11:21:03 2015 -0400

    Changing behavior of unwrap using padding
    
    * All 3 backends changed
    * PX and PY can be less than WX and WY respectively
---
 include/af/image.h                   | 18 ++++++++++-----
 src/api/c/unwrap.cpp                 | 35 ++++++++++++++++------------
 src/api/cpp/unwrap.cpp               |  5 ++--
 src/backend/cpu/unwrap.cpp           | 44 +++++++++++++++++++++++-------------
 src/backend/cpu/unwrap.hpp           |  2 +-
 src/backend/cuda/kernel/unwrap.hpp   | 30 +++++++++++++-----------
 src/backend/cuda/unwrap.cu           | 18 +++++++--------
 src/backend/cuda/unwrap.hpp          |  2 +-
 src/backend/opencl/kernel/unwrap.cl  | 31 +++++++++++++------------
 src/backend/opencl/kernel/unwrap.hpp |  7 +++---
 src/backend/opencl/unwrap.cpp        | 18 +++++++--------
 src/backend/opencl/unwrap.hpp        |  2 +-
 12 files changed, 122 insertions(+), 90 deletions(-)

diff --git a/include/af/image.h b/include/af/image.h
index 0c616bc..22b9fd5 100644
--- a/include/af/image.h
+++ b/include/af/image.h
@@ -465,15 +465,18 @@ AFAPI array colorSpace(const array& image, const CSpace to, const CSpace from);
    C++ Interface wrapper for unwrap
 
    \param[in]  in is the input array
-   \param[in]  wx is the block window size along 0th-dimension
-   \param[in]  wy is the block window size along 1st-dimension
+   \param[in]  wx is the block window size along 0th-dimension between [1, input.dims[0] + px]
+   \param[in]  wy is the block window size along 1st-dimension between [1, input.dims[1] + py]
    \param[in]  sx is the stride along 0th-dimension
    \param[in]  sy is the stride along 1st-dimension
+   \param[in]  px is the padding along 0th-dimension between [0, wx). Padding is applied both before and after.
+   \param[in]  py is the padding along 1st-dimension between [0, wy). Padding is applied both before and after.
    \returns    an array with the image blocks as columns
 
    \ingroup image_func_unwrap
 */
-AFAPI array unwrap(const array& in, const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy);
+AFAPI array unwrap(const array& in, const dim_t wx, const dim_t wy,
+                   const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
 
 }
 #endif
@@ -922,16 +925,19 @@ extern "C" {
 
        \param[out] out is an array with image blocks as columns.
        \param[in]  in is the input array
-       \param[in]  wx is the block window size along 0th-dimension
-       \param[in]  wy is the block window size along 1st-dimension
+       \param[in]  wx is the block window size along 0th-dimension between [1, input.dims[0] + px]
+       \param[in]  wy is the block window size along 1st-dimension between [1, input.dims[1] + py]
        \param[in]  sx is the stride along 0th-dimension
        \param[in]  sy is the stride along 1st-dimension
+       \param[in]  px is the padding along 0th-dimension between [0, wx). Padding is applied both before and after.
+       \param[in]  py is the padding along 1st-dimension between [0, wy). Padding is applied both before and after.
        \return     \ref AF_SUCCESS if the color transformation is successful,
        otherwise an appropriate error code is returned.
 
        \ingroup image_func_unwrap
     */
-    AFAPI af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy);
+    AFAPI af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t wy,
+        const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
 
 #ifdef __cplusplus
 }
diff --git a/src/api/c/unwrap.cpp b/src/api/c/unwrap.cpp
index 6ef8d49..2b173b3 100644
--- a/src/api/c/unwrap.cpp
+++ b/src/api/c/unwrap.cpp
@@ -20,36 +20,41 @@ using namespace detail;
 
 template<typename T>
 static inline af_array unwrap(const af_array in, const dim_t wx, const dim_t wy,
-                              const dim_t sx, const dim_t sy)
+                              const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
 {
-    return getHandle(unwrap<T>(getArray<T>(in), wx, wy, sx, sy));
+    return getHandle(unwrap<T>(getArray<T>(in), wx, wy, sx, sy, px, py));
 }
 
-af_err af_unwrap(af_array *out, const af_array in,
-                 const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy)
+af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t wy,
+                 const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
 {
     try {
         ArrayInfo info = getInfo(in);
         af_dtype type = info.getType();
         af::dim4 idims = info.dims();
 
-        DIM_ASSERT(1, idims[0] >= wx && idims[1] >= wy);
+        DIM_ASSERT(2, wx > 0 && wx <= idims[0] + px);
+        DIM_ASSERT(3, wy > 0 && wy <= idims[1] + py);
         ARG_ASSERT(4, sx > 0);
         ARG_ASSERT(5, sy > 0);
+        ARG_ASSERT(4, sx > 0);
+        ARG_ASSERT(5, sy > 0);
+        ARG_ASSERT(6, px >= 0 && px < wx);
+        ARG_ASSERT(7, py >= 0 && py < wy);
 
         af_array output;
 
         switch(type) {
-            case f32: output = unwrap<float  >(in, wx, wy, sx, sy);  break;
-            case f64: output = unwrap<double >(in, wx, wy, sx, sy);  break;
-            case c32: output = unwrap<cfloat >(in, wx, wy, sx, sy);  break;
-            case c64: output = unwrap<cdouble>(in, wx, wy, sx, sy);  break;
-            case s32: output = unwrap<int    >(in, wx, wy, sx, sy);  break;
-            case u32: output = unwrap<uint   >(in, wx, wy, sx, sy);  break;
-            case s64: output = unwrap<intl   >(in, wx, wy, sx, sy);  break;
-            case u64: output = unwrap<uintl  >(in, wx, wy, sx, sy);  break;
-            case u8:  output = unwrap<uchar  >(in, wx, wy, sx, sy);  break;
-            case b8:  output = unwrap<char   >(in, wx, wy, sx, sy);  break;
+            case f32: output = unwrap<float  >(in, wx, wy, sx, sy, px, py);  break;
+            case f64: output = unwrap<double >(in, wx, wy, sx, sy, px, py);  break;
+            case c32: output = unwrap<cfloat >(in, wx, wy, sx, sy, px, py);  break;
+            case c64: output = unwrap<cdouble>(in, wx, wy, sx, sy, px, py);  break;
+            case s32: output = unwrap<int    >(in, wx, wy, sx, sy, px, py);  break;
+            case u32: output = unwrap<uint   >(in, wx, wy, sx, sy, px, py);  break;
+            case s64: output = unwrap<intl   >(in, wx, wy, sx, sy, px, py);  break;
+            case u64: output = unwrap<uintl  >(in, wx, wy, sx, sy, px, py);  break;
+            case u8:  output = unwrap<uchar  >(in, wx, wy, sx, sy, px, py);  break;
+            case b8:  output = unwrap<char   >(in, wx, wy, sx, sy, px, py);  break;
             default:  TYPE_ERROR(1, type);
         }
         std::swap(*out,output);
diff --git a/src/api/cpp/unwrap.cpp b/src/api/cpp/unwrap.cpp
index b10dbeb..7a1a103 100644
--- a/src/api/cpp/unwrap.cpp
+++ b/src/api/cpp/unwrap.cpp
@@ -13,10 +13,11 @@
 
 namespace af
 {
-    array unwrap(const array& in, const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy)
+    array unwrap(const array& in, const dim_t wx, const dim_t wy,
+                 const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
     {
         af_array out = 0;
-        AF_THROW(af_unwrap(&out, in.get(), wx, wy, sx, sy));
+        AF_THROW(af_unwrap(&out, in.get(), wx, wy, sx, sy, px, py));
         return array(out);
     }
 }
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp
index 76bc915..6d206b2 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/unwrap.cpp
@@ -19,15 +19,23 @@ namespace cpu
     template<typename T>
     void unwrap_(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
                  const af::dim4 &ostrides, const af::dim4 &istrides,
-                 const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy)
+                 const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
+                 const dim_t px, const dim_t py)
     {
-        dim_t nx = divup(idims[0] - wx, sx) + (sx >= idims[0] ? 0 : 1);
+        dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
 
         for(dim_t w = 0; w < odims[3]; w++) {
             for(dim_t z = 0; z < odims[2]; z++) {
+
                 dim_t cOut = w * ostrides[3] + z * ostrides[2];
                 dim_t cIn  = w * istrides[3] + z * istrides[2];
+                const T* iptr = inPtr  + cIn;
+                      T* optr_= outPtr + cOut;
+
                 for(dim_t col = 0; col < odims[1]; col++) {
+                    // Offset output ptr
+                    T* optr = optr_ + col * ostrides[1];
+
                     // Calculate input window index
                     dim_t winy = (col / nx);
                     dim_t winx = (col % nx);
@@ -35,22 +43,26 @@ namespace cpu
                     dim_t startx = winx * sx;
                     dim_t starty = winy * sy;
 
-                          T* optr = outPtr + cOut + col * ostrides[1];
-                    const T* iptr = inPtr  + cIn  + starty * istrides[1] + startx;
+                    dim_t spx = startx - px;
+                    dim_t spy = starty - py;
 
-                    // Condition shortcuts
-                    bool cond = true;
-                    if((startx + wx >= idims[0]) || (starty + wy >= idims[1]))
-                        cond = false;
+                    // Short cut condition ensuring all values within input dimensions
+                    bool cond = false;
+                    if(spx >= 0 && spx + wx < idims[0] && spy >= 0 && spy + wy < idims[1])
+                        cond = true;
 
                     for(dim_t y = 0; y < wy; y++) {
                         for(dim_t x = 0; x < wx; x++) {
+                            dim_t xpad = spx + x;
+                            dim_t ypad = spy + y;
+
                             dim_t oloc = (y * wx + x) * ostrides[0];
-                            dim_t iloc = (y * istrides[1] + x * istrides[0]);
-                            if(cond || (startx + x < idims[0] && starty + y < idims[1]))
+                            if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) {
+                                dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]);
                                 optr[oloc] = iptr[iloc];
-                            else
+                            } else {
                                 optr[oloc] = scalar<T>(0.0);
+                            }
                         }
                     }
                 }
@@ -60,12 +72,12 @@ namespace cpu
 
     template<typename T>
     Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy)
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
     {
         af::dim4 idims = in.dims();
 
-        dim_t nx = divup(idims[0] - wx, sx) + (sx >= idims[0] ? 0 : 1);
-        dim_t ny = divup(idims[1] - wy, sy) + (sy >= idims[1] ? 0 : 1);
+        dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
+        dim_t ny = (idims[1] + 2 * py - wy) / sx + 1;
 
         af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
 
@@ -79,7 +91,7 @@ namespace cpu
         af::dim4 ostrides = outArray.strides();
         af::dim4 istrides = in.strides();
 
-        unwrap_(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy);
+        unwrap_(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py);
 
         return outArray;
     }
@@ -87,7 +99,7 @@ namespace cpu
 
 #define INSTANTIATE(T)                                                                  \
     template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy,    \
-                                 const dim_t sx, const dim_t sy);
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
 
 
     INSTANTIATE(float)
diff --git a/src/backend/cpu/unwrap.hpp b/src/backend/cpu/unwrap.hpp
index c492813..7b5ea75 100644
--- a/src/backend/cpu/unwrap.hpp
+++ b/src/backend/cpu/unwrap.hpp
@@ -13,6 +13,6 @@ namespace cpu
 {
     template<typename T>
     Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy);
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
 }
 
diff --git a/src/backend/cuda/kernel/unwrap.hpp b/src/backend/cuda/kernel/unwrap.hpp
index d5921df..8135844 100644
--- a/src/backend/cuda/kernel/unwrap.hpp
+++ b/src/backend/cuda/kernel/unwrap.hpp
@@ -24,7 +24,7 @@ namespace cuda
         __global__
         void unwrap_kernel(Param<T> out, CParam<T> in,
                            const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
-                           dim_t repsPerColumn)
+                           const dim_t px, const dim_t py, const dim_t nx, dim_t repsPerColumn)
         {
             // Compute channel and volume
             const dim_t w = blockIdx.y / in.dims[2];
@@ -37,9 +37,6 @@ namespace cuda
             const dim_t cOut = w * out.strides[3] + z * out.strides[2];
             const dim_t cIn  = w *  in.strides[3] + z *  in.strides[2];
 
-            // Compute the number of windows along dim0 of input
-            const dim_t nx = divup(in.dims[0] - wx, sx) + (sx >= in.dims[0] ? 0 : 1);
-
             // Compute the output column index
             const dim_t colId = blockIdx.x * blockDim.y + threadIdx.y;
 
@@ -50,13 +47,16 @@ namespace cuda
             const dim_t startx = (colId % nx) * sx;
             const dim_t starty = (colId / nx) * sy;
 
+            const dim_t spx = startx - px;
+            const dim_t spy = starty - py;
+
             // Offset the global pointers to the respective starting indices
                   T* optr = out.ptr + cOut + colId * out.strides[1];
-            const T* iptr = in.ptr  + cIn  + starty * in.strides[1] + startx;
+            const T* iptr = in.ptr  + cIn;
 
-            bool cond = true;
-            if((startx + wx >= in.dims[0]) || (starty + wy >= in.dims[1]))
-                cond = false;
+            bool cond = false;
+            if(spx >= 0 && spx + wx < in.dims[0] && spy >= 0 && spy + wy < in.dims[1])
+                cond = true;
 
             for(int i = 0; i < repsPerColumn; i++) {
                 // Compute output index local to column
@@ -69,14 +69,18 @@ namespace cuda
                 const dim_t x = colIndex % wx;
                 const dim_t y = colIndex / wx;
 
+                const dim_t xpad = spx + x;
+                const dim_t ypad = spy + y;
+
                 const dim_t outIdx = (y * wx + x) * out.strides[0];
-                const dim_t inIdx = y * in.strides[1] + x * in.strides[0];
 
                 // Copy
-                if(cond || (startx + x < in.dims[0] && starty + y < in.dims[1]))
+                if(cond || (xpad >= 0 && xpad < in.dims[0] && ypad >= 0 && ypad < in.dims[1])) {
+                    const dim_t inIdx = ypad * in.strides[1] + xpad * in.strides[0];
                     optr[outIdx] = iptr[inIdx];
-                else
+                } else {
                     optr[outIdx] = scalar<T>(0.0);
+                }
             }
         }
 
@@ -85,7 +89,7 @@ namespace cuda
         ///////////////////////////////////////////////////////////////////////////
         template <typename T, int TX>
         void unwrap(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy)
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const dim_t nx)
         {
             dim3 threads(TX, 256 / TX, 1);
 
@@ -96,7 +100,7 @@ namespace cuda
 
             dim3 blocks(divup(out.dims[1], threads.y), out.dims[2] * out.dims[3]);
 
-            unwrap_kernel<T, TX><<<blocks, threads>>>(out, in, wx, wy, sx, sy, repsPerColumn);
+            unwrap_kernel<T, TX><<<blocks, threads>>>(out, in, wx, wy, sx, sy, px, py, nx, repsPerColumn);
             POST_LAUNCH_CHECK();
         }
     }
diff --git a/src/backend/cuda/unwrap.cu b/src/backend/cuda/unwrap.cu
index 41aa8dc..3164e87 100644
--- a/src/backend/cuda/unwrap.cu
+++ b/src/backend/cuda/unwrap.cu
@@ -17,12 +17,12 @@ namespace cuda
 {
     template<typename T>
     Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy)
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
     {
         af::dim4 idims = in.dims();
 
-        dim_t nx = divup(idims[0] - wx, sx) + (sx >= idims[0] ? 0 : 1);
-        dim_t ny = divup(idims[1] - wy, sy) + (sy >= idims[1] ? 0 : 1);
+        dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
+        dim_t ny = (idims[1] + 2 * py - wy) / sx + 1;
 
         af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
 
@@ -30,15 +30,15 @@ namespace cuda
         Array<T> outArray = createEmptyArray<T>(odims);
 
         if(odims[0] <= 16) {
-            kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy);
+            kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy, px, py, nx);
         } else if (odims[0] <= 32) {
-            kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy);
+            kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy, px, py, nx);
         } else if (odims[0] <= 64) {
-            kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy);
+            kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy, px, py, nx);
         } else if(odims[0] <= 128) {
-            kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy);
+            kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy, px, py, nx);
         } else {
-            kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy);
+            kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy, px, py, nx);
         }
 
         return outArray;
@@ -47,7 +47,7 @@ namespace cuda
 
 #define INSTANTIATE(T)                                                                  \
     template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy,    \
-                                 const dim_t sx, const dim_t sy);
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
 
 
     INSTANTIATE(float)
diff --git a/src/backend/cuda/unwrap.hpp b/src/backend/cuda/unwrap.hpp
index ad71a12..0217c4b 100644
--- a/src/backend/cuda/unwrap.hpp
+++ b/src/backend/cuda/unwrap.hpp
@@ -13,6 +13,6 @@ namespace cuda
 {
     template<typename T>
     Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy);
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
 }
 
diff --git a/src/backend/opencl/kernel/unwrap.cl b/src/backend/opencl/kernel/unwrap.cl
index 331f0da..8a15f0e 100644
--- a/src/backend/opencl/kernel/unwrap.cl
+++ b/src/backend/opencl/kernel/unwrap.cl
@@ -24,7 +24,7 @@ __kernel
 void unwrap_kernel(__global T *d_out, const KParam out,
                    __global const T *d_in, const KParam in,
                    const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
-                   const dim_t repsPerColumn)
+                   const dim_t px, const dim_t py, const dim_t nx, const dim_t repsPerColumn)
 {
     // Compute channel and volume
     const dim_t w = get_group_id(1) / in.dims[2];
@@ -37,11 +37,6 @@ void unwrap_kernel(__global T *d_out, const KParam out,
     const dim_t cOut = w * out.strides[3] + z * out.strides[2];
     const dim_t cIn  = w *  in.strides[3] + z *  in.strides[2];
 
-    // Compute the number of windows along dim0 of input
-    int nx_add = 1;
-    if(sx >= in.dims[0]) nx_add = 0;
-    const dim_t nx = divup(in.dims[0] - wx, sx) + nx_add;
-
     // Compute the output column index
     const dim_t colId = get_group_id(0) * get_local_size(1) + get_local_id(1);
 
@@ -52,13 +47,17 @@ void unwrap_kernel(__global T *d_out, const KParam out,
     const dim_t startx = (colId % nx) * sx;
     const dim_t starty = (colId / nx) * sy;
 
+    const dim_t spx = startx - px;
+    const dim_t spy = starty - py;
+
     // Offset the global pointers to the respective starting indices
     __global       T* optr = d_out + cOut + colId * out.strides[1];
-    __global const T* iptr = d_in  + cIn  + starty * in.strides[1] + startx + in.offset;
+    __global const T* iptr = d_in  + cIn + in.offset;
+
+    bool cond = false;
+    if(spx >= 0 && spx + wx < in.dims[0] && spy >= 0 && spy + wy < in.dims[1])
+        cond = true;
 
-    bool cond = true;
-    if((startx + wx >= in.dims[0]) || (starty + wy >= in.dims[1]))
-        cond = false;
 
     for(int i = 0; i < repsPerColumn; i++) {
         // Compute output index local to column
@@ -69,15 +68,19 @@ void unwrap_kernel(__global T *d_out, const KParam out,
 
         // Compute input index local to window
         const dim_t y = colIndex / wx;
-        const dim_t x = colIndex - y * wx;  // colIndex % wx
+        const dim_t x = colIndex % wx;
+
+        const dim_t xpad = spx + x;
+        const dim_t ypad = spy + y;
 
         const dim_t outIdx = (y * wx + x) * out.strides[0];
-        const dim_t inIdx = y * in.strides[1] + x * in.strides[0];
 
         // Copy
-        if(cond || (startx + x < in.dims[0] && starty + y < in.dims[1]))
+        if(cond || (xpad >= 0 && xpad < in.dims[0] && ypad >= 0 && ypad < in.dims[1])) {
+            const dim_t inIdx = ypad * in.strides[1] + xpad * in.strides[0];
             optr[outIdx] = iptr[inIdx];
-        else
+        } else {
             set_scalar(optr[outIdx], 0);
+        }
     }
 }
diff --git a/src/backend/opencl/kernel/unwrap.hpp b/src/backend/opencl/kernel/unwrap.hpp
index 8341e72..d9f7d49 100644
--- a/src/backend/opencl/kernel/unwrap.hpp
+++ b/src/backend/opencl/kernel/unwrap.hpp
@@ -32,7 +32,7 @@ namespace opencl
     {
         template<typename T, int TX>
         void unwrap(Param out, const Param in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy)
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const dim_t nx)
         {
             try {
                 static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
@@ -65,7 +65,8 @@ namespace opencl
                 });
 
                 auto unwrapOp = make_kernel<Buffer, const KParam, const Buffer, const KParam,
-                                      const dim_t, const dim_t, const dim_t, const dim_t, const dim_t>
+                                      const dim_t, const dim_t, const dim_t, const dim_t,
+                                      const dim_t, const dim_t, const dim_t, const dim_t>
                                       (*unwrapKernels[device]);
 
                 const dim_t TY = 256 / TX;
@@ -81,7 +82,7 @@ namespace opencl
                                1);
 
                 unwrapOp(EnqueueArgs(getQueue(), global, local),
-                       *out.data, out.info, *in.data, in.info, wx, wy, sx, sy, repsPerColumn);
+                       *out.data, out.info, *in.data, in.info, wx, wy, sx, sy, px, py, nx, repsPerColumn);
 
                 CL_DEBUG_FINISH(getQueue());
             } catch (cl::Error err) {
diff --git a/src/backend/opencl/unwrap.cpp b/src/backend/opencl/unwrap.cpp
index 61f2ddb..da8f8b3 100644
--- a/src/backend/opencl/unwrap.cpp
+++ b/src/backend/opencl/unwrap.cpp
@@ -17,12 +17,12 @@ namespace opencl
 {
     template<typename T>
     Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy)
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
     {
         af::dim4 idims = in.dims();
 
-        dim_t nx = divup(idims[0] - wx, sx) + (sx >= idims[0] ? 0 : 1);
-        dim_t ny = divup(idims[1] - wy, sy) + (sy >= idims[1] ? 0 : 1);
+        dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
+        dim_t ny = (idims[1] + 2 * py - wy) / sx + 1;
 
         af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
 
@@ -30,15 +30,15 @@ namespace opencl
         Array<T> outArray = createEmptyArray<T>(odims);
 
         if(odims[0] <= 16) {
-            kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy);
+            kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy, px, py, nx);
         } else if (odims[0] <= 32) {
-            kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy);
+            kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy, px, py, nx);
         } else if (odims[0] <= 64) {
-            kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy);
+            kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy, px, py, nx);
         } else if(odims[0] <= 128) {
-            kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy);
+            kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy, px, py, nx);
         } else {
-            kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy);
+            kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy, px, py, nx);
         }
 
         return outArray;
@@ -47,7 +47,7 @@ namespace opencl
 
 #define INSTANTIATE(T)                                                                  \
     template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy,    \
-                                 const dim_t sx, const dim_t sy);
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
 
 
     INSTANTIATE(float)
diff --git a/src/backend/opencl/unwrap.hpp b/src/backend/opencl/unwrap.hpp
index 290f28f..40efb4c 100644
--- a/src/backend/opencl/unwrap.hpp
+++ b/src/backend/opencl/unwrap.hpp
@@ -13,6 +13,6 @@ namespace opencl
 {
     template<typename T>
     Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                    const dim_t sx, const dim_t sy);
+                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
 }
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git