[arrayfire] 16/408: Changing behavior of unwrap using padding
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:06 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 2816c018af55b1af243fbe927cdd7122cc4cd8c7
Author: Shehzan Mohammed <shehzan at arrayfire.com>
Date: Fri Jun 19 11:21:03 2015 -0400
Changing behavior of unwrap using padding
* All 3 backends changed
* PX and PY can be less than WX and WY respectively
---
include/af/image.h | 18 ++++++++++-----
src/api/c/unwrap.cpp | 35 ++++++++++++++++------------
src/api/cpp/unwrap.cpp | 5 ++--
src/backend/cpu/unwrap.cpp | 44 +++++++++++++++++++++++-------------
src/backend/cpu/unwrap.hpp | 2 +-
src/backend/cuda/kernel/unwrap.hpp | 30 +++++++++++++-----------
src/backend/cuda/unwrap.cu | 18 +++++++--------
src/backend/cuda/unwrap.hpp | 2 +-
src/backend/opencl/kernel/unwrap.cl | 31 +++++++++++++------------
src/backend/opencl/kernel/unwrap.hpp | 7 +++---
src/backend/opencl/unwrap.cpp | 18 +++++++--------
src/backend/opencl/unwrap.hpp | 2 +-
12 files changed, 122 insertions(+), 90 deletions(-)
diff --git a/include/af/image.h b/include/af/image.h
index 0c616bc..22b9fd5 100644
--- a/include/af/image.h
+++ b/include/af/image.h
@@ -465,15 +465,18 @@ AFAPI array colorSpace(const array& image, const CSpace to, const CSpace from);
C++ Interface wrapper for unwrap
\param[in] in is the input array
- \param[in] wx is the block window size along 0th-dimension
- \param[in] wy is the block window size along 1st-dimension
+ \param[in] wx is the block window size along 0th-dimension between [1, input.dims[0] + px]
+ \param[in] wy is the block window size along 1st-dimension between [1, input.dims[1] + py]
\param[in] sx is the stride along 0th-dimension
\param[in] sy is the stride along 1st-dimension
+ \param[in] px is the padding along 0th-dimension between [0, wx). Padding is applied both before and after.
+ \param[in] py is the padding along 1st-dimension between [0, wy). Padding is applied both before and after.
\returns an array with the image blocks as columns
\ingroup image_func_unwrap
*/
-AFAPI array unwrap(const array& in, const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy);
+AFAPI array unwrap(const array& in, const dim_t wx, const dim_t wy,
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
}
#endif
@@ -922,16 +925,19 @@ extern "C" {
\param[out] out is an array with image blocks as columns.
\param[in] in is the input array
- \param[in] wx is the block window size along 0th-dimension
- \param[in] wy is the block window size along 1st-dimension
+ \param[in] wx is the block window size along 0th-dimension between [1, input.dims[0] + px]
+ \param[in] wy is the block window size along 1st-dimension between [1, input.dims[1] + py]
\param[in] sx is the stride along 0th-dimension
\param[in] sy is the stride along 1st-dimension
+ \param[in] px is the padding along 0th-dimension between [0, wx). Padding is applied both before and after.
+ \param[in] py is the padding along 1st-dimension between [0, wy). Padding is applied both before and after.
\return \ref AF_SUCCESS if the color transformation is successful,
otherwise an appropriate error code is returned.
\ingroup image_func_unwrap
*/
- AFAPI af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy);
+ AFAPI af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t wy,
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
#ifdef __cplusplus
}
diff --git a/src/api/c/unwrap.cpp b/src/api/c/unwrap.cpp
index 6ef8d49..2b173b3 100644
--- a/src/api/c/unwrap.cpp
+++ b/src/api/c/unwrap.cpp
@@ -20,36 +20,41 @@ using namespace detail;
template<typename T>
static inline af_array unwrap(const af_array in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy)
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
{
- return getHandle(unwrap<T>(getArray<T>(in), wx, wy, sx, sy));
+ return getHandle(unwrap<T>(getArray<T>(in), wx, wy, sx, sy, px, py));
}
-af_err af_unwrap(af_array *out, const af_array in,
- const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy)
+af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t wy,
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
{
try {
ArrayInfo info = getInfo(in);
af_dtype type = info.getType();
af::dim4 idims = info.dims();
- DIM_ASSERT(1, idims[0] >= wx && idims[1] >= wy);
+ DIM_ASSERT(2, wx > 0 && wx <= idims[0] + px);
+ DIM_ASSERT(3, wy > 0 && wy <= idims[1] + py);
ARG_ASSERT(4, sx > 0);
ARG_ASSERT(5, sy > 0);
+ ARG_ASSERT(4, sx > 0);
+ ARG_ASSERT(5, sy > 0);
+ ARG_ASSERT(6, px >= 0 && px < wx);
+ ARG_ASSERT(7, py >= 0 && py < wy);
af_array output;
switch(type) {
- case f32: output = unwrap<float >(in, wx, wy, sx, sy); break;
- case f64: output = unwrap<double >(in, wx, wy, sx, sy); break;
- case c32: output = unwrap<cfloat >(in, wx, wy, sx, sy); break;
- case c64: output = unwrap<cdouble>(in, wx, wy, sx, sy); break;
- case s32: output = unwrap<int >(in, wx, wy, sx, sy); break;
- case u32: output = unwrap<uint >(in, wx, wy, sx, sy); break;
- case s64: output = unwrap<intl >(in, wx, wy, sx, sy); break;
- case u64: output = unwrap<uintl >(in, wx, wy, sx, sy); break;
- case u8: output = unwrap<uchar >(in, wx, wy, sx, sy); break;
- case b8: output = unwrap<char >(in, wx, wy, sx, sy); break;
+ case f32: output = unwrap<float >(in, wx, wy, sx, sy, px, py); break;
+ case f64: output = unwrap<double >(in, wx, wy, sx, sy, px, py); break;
+ case c32: output = unwrap<cfloat >(in, wx, wy, sx, sy, px, py); break;
+ case c64: output = unwrap<cdouble>(in, wx, wy, sx, sy, px, py); break;
+ case s32: output = unwrap<int >(in, wx, wy, sx, sy, px, py); break;
+ case u32: output = unwrap<uint >(in, wx, wy, sx, sy, px, py); break;
+ case s64: output = unwrap<intl >(in, wx, wy, sx, sy, px, py); break;
+ case u64: output = unwrap<uintl >(in, wx, wy, sx, sy, px, py); break;
+ case u8: output = unwrap<uchar >(in, wx, wy, sx, sy, px, py); break;
+ case b8: output = unwrap<char >(in, wx, wy, sx, sy, px, py); break;
default: TYPE_ERROR(1, type);
}
std::swap(*out,output);
diff --git a/src/api/cpp/unwrap.cpp b/src/api/cpp/unwrap.cpp
index b10dbeb..7a1a103 100644
--- a/src/api/cpp/unwrap.cpp
+++ b/src/api/cpp/unwrap.cpp
@@ -13,10 +13,11 @@
namespace af
{
- array unwrap(const array& in, const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy)
+ array unwrap(const array& in, const dim_t wx, const dim_t wy,
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
{
af_array out = 0;
- AF_THROW(af_unwrap(&out, in.get(), wx, wy, sx, sy));
+ AF_THROW(af_unwrap(&out, in.get(), wx, wy, sx, sy, px, py));
return array(out);
}
}
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp
index 76bc915..6d206b2 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/unwrap.cpp
@@ -19,15 +19,23 @@ namespace cpu
template<typename T>
void unwrap_(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
const af::dim4 &ostrides, const af::dim4 &istrides,
- const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy)
+ const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
+ const dim_t px, const dim_t py)
{
- dim_t nx = divup(idims[0] - wx, sx) + (sx >= idims[0] ? 0 : 1);
+ dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
for(dim_t w = 0; w < odims[3]; w++) {
for(dim_t z = 0; z < odims[2]; z++) {
+
dim_t cOut = w * ostrides[3] + z * ostrides[2];
dim_t cIn = w * istrides[3] + z * istrides[2];
+ const T* iptr = inPtr + cIn;
+ T* optr_= outPtr + cOut;
+
for(dim_t col = 0; col < odims[1]; col++) {
+ // Offset output ptr
+ T* optr = optr_ + col * ostrides[1];
+
// Calculate input window index
dim_t winy = (col / nx);
dim_t winx = (col % nx);
@@ -35,22 +43,26 @@ namespace cpu
dim_t startx = winx * sx;
dim_t starty = winy * sy;
- T* optr = outPtr + cOut + col * ostrides[1];
- const T* iptr = inPtr + cIn + starty * istrides[1] + startx;
+ dim_t spx = startx - px;
+ dim_t spy = starty - py;
- // Condition shortcuts
- bool cond = true;
- if((startx + wx >= idims[0]) || (starty + wy >= idims[1]))
- cond = false;
+ // Short cut condition ensuring all values within input dimensions
+ bool cond = false;
+ if(spx >= 0 && spx + wx < idims[0] && spy >= 0 && spy + wy < idims[1])
+ cond = true;
for(dim_t y = 0; y < wy; y++) {
for(dim_t x = 0; x < wx; x++) {
+ dim_t xpad = spx + x;
+ dim_t ypad = spy + y;
+
dim_t oloc = (y * wx + x) * ostrides[0];
- dim_t iloc = (y * istrides[1] + x * istrides[0]);
- if(cond || (startx + x < idims[0] && starty + y < idims[1]))
+ if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) {
+ dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]);
optr[oloc] = iptr[iloc];
- else
+ } else {
optr[oloc] = scalar<T>(0.0);
+ }
}
}
}
@@ -60,12 +72,12 @@ namespace cpu
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy)
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
{
af::dim4 idims = in.dims();
- dim_t nx = divup(idims[0] - wx, sx) + (sx >= idims[0] ? 0 : 1);
- dim_t ny = divup(idims[1] - wy, sy) + (sy >= idims[1] ? 0 : 1);
+ dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
+ dim_t ny = (idims[1] + 2 * py - wy) / sx + 1;
af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
@@ -79,7 +91,7 @@ namespace cpu
af::dim4 ostrides = outArray.strides();
af::dim4 istrides = in.strides();
- unwrap_(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy);
+ unwrap_(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py);
return outArray;
}
@@ -87,7 +99,7 @@ namespace cpu
#define INSTANTIATE(T) \
template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy, \
- const dim_t sx, const dim_t sy);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
INSTANTIATE(float)
diff --git a/src/backend/cpu/unwrap.hpp b/src/backend/cpu/unwrap.hpp
index c492813..7b5ea75 100644
--- a/src/backend/cpu/unwrap.hpp
+++ b/src/backend/cpu/unwrap.hpp
@@ -13,6 +13,6 @@ namespace cpu
{
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
}
diff --git a/src/backend/cuda/kernel/unwrap.hpp b/src/backend/cuda/kernel/unwrap.hpp
index d5921df..8135844 100644
--- a/src/backend/cuda/kernel/unwrap.hpp
+++ b/src/backend/cuda/kernel/unwrap.hpp
@@ -24,7 +24,7 @@ namespace cuda
__global__
void unwrap_kernel(Param<T> out, CParam<T> in,
const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
- dim_t repsPerColumn)
+ const dim_t px, const dim_t py, const dim_t nx, dim_t repsPerColumn)
{
// Compute channel and volume
const dim_t w = blockIdx.y / in.dims[2];
@@ -37,9 +37,6 @@ namespace cuda
const dim_t cOut = w * out.strides[3] + z * out.strides[2];
const dim_t cIn = w * in.strides[3] + z * in.strides[2];
- // Compute the number of windows along dim0 of input
- const dim_t nx = divup(in.dims[0] - wx, sx) + (sx >= in.dims[0] ? 0 : 1);
-
// Compute the output column index
const dim_t colId = blockIdx.x * blockDim.y + threadIdx.y;
@@ -50,13 +47,16 @@ namespace cuda
const dim_t startx = (colId % nx) * sx;
const dim_t starty = (colId / nx) * sy;
+ const dim_t spx = startx - px;
+ const dim_t spy = starty - py;
+
// Offset the global pointers to the respective starting indices
T* optr = out.ptr + cOut + colId * out.strides[1];
- const T* iptr = in.ptr + cIn + starty * in.strides[1] + startx;
+ const T* iptr = in.ptr + cIn;
- bool cond = true;
- if((startx + wx >= in.dims[0]) || (starty + wy >= in.dims[1]))
- cond = false;
+ bool cond = false;
+ if(spx >= 0 && spx + wx < in.dims[0] && spy >= 0 && spy + wy < in.dims[1])
+ cond = true;
for(int i = 0; i < repsPerColumn; i++) {
// Compute output index local to column
@@ -69,14 +69,18 @@ namespace cuda
const dim_t x = colIndex % wx;
const dim_t y = colIndex / wx;
+ const dim_t xpad = spx + x;
+ const dim_t ypad = spy + y;
+
const dim_t outIdx = (y * wx + x) * out.strides[0];
- const dim_t inIdx = y * in.strides[1] + x * in.strides[0];
// Copy
- if(cond || (startx + x < in.dims[0] && starty + y < in.dims[1]))
+ if(cond || (xpad >= 0 && xpad < in.dims[0] && ypad >= 0 && ypad < in.dims[1])) {
+ const dim_t inIdx = ypad * in.strides[1] + xpad * in.strides[0];
optr[outIdx] = iptr[inIdx];
- else
+ } else {
optr[outIdx] = scalar<T>(0.0);
+ }
}
}
@@ -85,7 +89,7 @@ namespace cuda
///////////////////////////////////////////////////////////////////////////
template <typename T, int TX>
void unwrap(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy)
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const dim_t nx)
{
dim3 threads(TX, 256 / TX, 1);
@@ -96,7 +100,7 @@ namespace cuda
dim3 blocks(divup(out.dims[1], threads.y), out.dims[2] * out.dims[3]);
- unwrap_kernel<T, TX><<<blocks, threads>>>(out, in, wx, wy, sx, sy, repsPerColumn);
+ unwrap_kernel<T, TX><<<blocks, threads>>>(out, in, wx, wy, sx, sy, px, py, nx, repsPerColumn);
POST_LAUNCH_CHECK();
}
}
diff --git a/src/backend/cuda/unwrap.cu b/src/backend/cuda/unwrap.cu
index 41aa8dc..3164e87 100644
--- a/src/backend/cuda/unwrap.cu
+++ b/src/backend/cuda/unwrap.cu
@@ -17,12 +17,12 @@ namespace cuda
{
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy)
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
{
af::dim4 idims = in.dims();
- dim_t nx = divup(idims[0] - wx, sx) + (sx >= idims[0] ? 0 : 1);
- dim_t ny = divup(idims[1] - wy, sy) + (sy >= idims[1] ? 0 : 1);
+ dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
+ dim_t ny = (idims[1] + 2 * py - wy) / sx + 1;
af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
@@ -30,15 +30,15 @@ namespace cuda
Array<T> outArray = createEmptyArray<T>(odims);
if(odims[0] <= 16) {
- kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy);
+ kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy, px, py, nx);
} else if (odims[0] <= 32) {
- kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy);
+ kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy, px, py, nx);
} else if (odims[0] <= 64) {
- kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy);
+ kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy, px, py, nx);
} else if(odims[0] <= 128) {
- kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy);
+ kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy, px, py, nx);
} else {
- kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy);
+ kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy, px, py, nx);
}
return outArray;
@@ -47,7 +47,7 @@ namespace cuda
#define INSTANTIATE(T) \
template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy, \
- const dim_t sx, const dim_t sy);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
INSTANTIATE(float)
diff --git a/src/backend/cuda/unwrap.hpp b/src/backend/cuda/unwrap.hpp
index ad71a12..0217c4b 100644
--- a/src/backend/cuda/unwrap.hpp
+++ b/src/backend/cuda/unwrap.hpp
@@ -13,6 +13,6 @@ namespace cuda
{
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
}
diff --git a/src/backend/opencl/kernel/unwrap.cl b/src/backend/opencl/kernel/unwrap.cl
index 331f0da..8a15f0e 100644
--- a/src/backend/opencl/kernel/unwrap.cl
+++ b/src/backend/opencl/kernel/unwrap.cl
@@ -24,7 +24,7 @@ __kernel
void unwrap_kernel(__global T *d_out, const KParam out,
__global const T *d_in, const KParam in,
const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
- const dim_t repsPerColumn)
+ const dim_t px, const dim_t py, const dim_t nx, const dim_t repsPerColumn)
{
// Compute channel and volume
const dim_t w = get_group_id(1) / in.dims[2];
@@ -37,11 +37,6 @@ void unwrap_kernel(__global T *d_out, const KParam out,
const dim_t cOut = w * out.strides[3] + z * out.strides[2];
const dim_t cIn = w * in.strides[3] + z * in.strides[2];
- // Compute the number of windows along dim0 of input
- int nx_add = 1;
- if(sx >= in.dims[0]) nx_add = 0;
- const dim_t nx = divup(in.dims[0] - wx, sx) + nx_add;
-
// Compute the output column index
const dim_t colId = get_group_id(0) * get_local_size(1) + get_local_id(1);
@@ -52,13 +47,17 @@ void unwrap_kernel(__global T *d_out, const KParam out,
const dim_t startx = (colId % nx) * sx;
const dim_t starty = (colId / nx) * sy;
+ const dim_t spx = startx - px;
+ const dim_t spy = starty - py;
+
// Offset the global pointers to the respective starting indices
__global T* optr = d_out + cOut + colId * out.strides[1];
- __global const T* iptr = d_in + cIn + starty * in.strides[1] + startx + in.offset;
+ __global const T* iptr = d_in + cIn + in.offset;
+
+ bool cond = false;
+ if(spx >= 0 && spx + wx < in.dims[0] && spy >= 0 && spy + wy < in.dims[1])
+ cond = true;
- bool cond = true;
- if((startx + wx >= in.dims[0]) || (starty + wy >= in.dims[1]))
- cond = false;
for(int i = 0; i < repsPerColumn; i++) {
// Compute output index local to column
@@ -69,15 +68,19 @@ void unwrap_kernel(__global T *d_out, const KParam out,
// Compute input index local to window
const dim_t y = colIndex / wx;
- const dim_t x = colIndex - y * wx; // colIndex % wx
+ const dim_t x = colIndex % wx;
+
+ const dim_t xpad = spx + x;
+ const dim_t ypad = spy + y;
const dim_t outIdx = (y * wx + x) * out.strides[0];
- const dim_t inIdx = y * in.strides[1] + x * in.strides[0];
// Copy
- if(cond || (startx + x < in.dims[0] && starty + y < in.dims[1]))
+ if(cond || (xpad >= 0 && xpad < in.dims[0] && ypad >= 0 && ypad < in.dims[1])) {
+ const dim_t inIdx = ypad * in.strides[1] + xpad * in.strides[0];
optr[outIdx] = iptr[inIdx];
- else
+ } else {
set_scalar(optr[outIdx], 0);
+ }
}
}
diff --git a/src/backend/opencl/kernel/unwrap.hpp b/src/backend/opencl/kernel/unwrap.hpp
index 8341e72..d9f7d49 100644
--- a/src/backend/opencl/kernel/unwrap.hpp
+++ b/src/backend/opencl/kernel/unwrap.hpp
@@ -32,7 +32,7 @@ namespace opencl
{
template<typename T, int TX>
void unwrap(Param out, const Param in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy)
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const dim_t nx)
{
try {
static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
@@ -65,7 +65,8 @@ namespace opencl
});
auto unwrapOp = make_kernel<Buffer, const KParam, const Buffer, const KParam,
- const dim_t, const dim_t, const dim_t, const dim_t, const dim_t>
+ const dim_t, const dim_t, const dim_t, const dim_t,
+ const dim_t, const dim_t, const dim_t, const dim_t>
(*unwrapKernels[device]);
const dim_t TY = 256 / TX;
@@ -81,7 +82,7 @@ namespace opencl
1);
unwrapOp(EnqueueArgs(getQueue(), global, local),
- *out.data, out.info, *in.data, in.info, wx, wy, sx, sy, repsPerColumn);
+ *out.data, out.info, *in.data, in.info, wx, wy, sx, sy, px, py, nx, repsPerColumn);
CL_DEBUG_FINISH(getQueue());
} catch (cl::Error err) {
diff --git a/src/backend/opencl/unwrap.cpp b/src/backend/opencl/unwrap.cpp
index 61f2ddb..da8f8b3 100644
--- a/src/backend/opencl/unwrap.cpp
+++ b/src/backend/opencl/unwrap.cpp
@@ -17,12 +17,12 @@ namespace opencl
{
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy)
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
{
af::dim4 idims = in.dims();
- dim_t nx = divup(idims[0] - wx, sx) + (sx >= idims[0] ? 0 : 1);
- dim_t ny = divup(idims[1] - wy, sy) + (sy >= idims[1] ? 0 : 1);
+ dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
+ dim_t ny = (idims[1] + 2 * py - wy) / sx + 1;
af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
@@ -30,15 +30,15 @@ namespace opencl
Array<T> outArray = createEmptyArray<T>(odims);
if(odims[0] <= 16) {
- kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy);
+ kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy, px, py, nx);
} else if (odims[0] <= 32) {
- kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy);
+ kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy, px, py, nx);
} else if (odims[0] <= 64) {
- kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy);
+ kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy, px, py, nx);
} else if(odims[0] <= 128) {
- kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy);
+ kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy, px, py, nx);
} else {
- kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy);
+ kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy, px, py, nx);
}
return outArray;
@@ -47,7 +47,7 @@ namespace opencl
#define INSTANTIATE(T) \
template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy, \
- const dim_t sx, const dim_t sy);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
INSTANTIATE(float)
diff --git a/src/backend/opencl/unwrap.hpp b/src/backend/opencl/unwrap.hpp
index 290f28f..40efb4c 100644
--- a/src/backend/opencl/unwrap.hpp
+++ b/src/backend/opencl/unwrap.hpp
@@ -13,6 +13,6 @@ namespace opencl
{
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list