[arrayfire] 268/408: FEAT: Adding support to unwrap along rows as well as columns
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:11 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 2d60aed2d75163f816cba4c2271ef39cae55125d
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date: Fri Aug 14 16:29:59 2015 -0400
FEAT: Adding support to unwrap along rows as well as columns
- updated tests to test for unwrapping along rows
---
include/af/image.h | 8 ++-
src/api/c/unwrap.cpp | 27 ++++++-----
src/api/cpp/unwrap.cpp | 5 +-
src/backend/cpu/unwrap.cpp | 38 +++++++++------
src/backend/cpu/unwrap.hpp | 3 +-
src/backend/cuda/kernel/unwrap.hpp | 94 +++++++++++++++++++++++++-----------
src/backend/cuda/unwrap.cu | 24 ++++-----
src/backend/cuda/unwrap.hpp | 3 +-
src/backend/opencl/kernel/unwrap.cl | 50 +++++++++----------
src/backend/opencl/kernel/unwrap.hpp | 37 ++++++++------
src/backend/opencl/unwrap.cpp | 21 ++++----
src/backend/opencl/unwrap.hpp | 3 +-
test/unwrap.cpp | 25 ++++++----
13 files changed, 193 insertions(+), 145 deletions(-)
diff --git a/include/af/image.h b/include/af/image.h
index 4f7227c..6bdbfb4 100644
--- a/include/af/image.h
+++ b/include/af/image.h
@@ -483,12 +483,14 @@ AFAPI array dog(const array& in, const int radius1, const int radius2);
\param[in] sy is the stride along 1st-dimension
\param[in] px is the padding along 0th-dimension between [0, wx). Padding is applied both before and after.
\param[in] py is the padding along 1st-dimension between [0, wy). Padding is applied both before and after.
+ \param[in] is_column specifies the layout for the unwrapped patch. If is_column is false, the unrapped patch is laid out as a row.
\returns an array with the image blocks as columns
\ingroup image_func_unwrap
*/
AFAPI array unwrap(const array& in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px=0, const dim_t py=0);
+ const dim_t sx, const dim_t sy, const dim_t px=0, const dim_t py=0,
+ const bool is_column = true);
/**
@@ -996,13 +998,15 @@ extern "C" {
\param[in] sy is the stride along 1st-dimension
\param[in] px is the padding along 0th-dimension between [0, wx). Padding is applied both before and after.
\param[in] py is the padding along 1st-dimension between [0, wy). Padding is applied both before and after.
+ \param[in] is_column specifies the layout for the unwrapped patch. If is_column is false, the unrapped patch is laid out as a row.
\return \ref AF_SUCCESS if the color transformation is successful,
otherwise an appropriate error code is returned.
\ingroup image_func_unwrap
*/
AFAPI af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
+ const bool is_column);
/**
C Interface wrapper for summed area tables
diff --git a/src/api/c/unwrap.cpp b/src/api/c/unwrap.cpp
index 3816874..5bb6fcd 100644
--- a/src/api/c/unwrap.cpp
+++ b/src/api/c/unwrap.cpp
@@ -20,13 +20,14 @@ using namespace detail;
template<typename T>
static inline af_array unwrap(const af_array in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
+ const bool is_column)
{
- return getHandle(unwrap<T>(getArray<T>(in), wx, wy, sx, sy, px, py));
+ return getHandle(unwrap<T>(getArray<T>(in), wx, wy, sx, sy, px, py, is_column));
}
af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
{
try {
ArrayInfo info = getInfo(in);
@@ -43,16 +44,16 @@ af_err af_unwrap(af_array *out, const af_array in, const dim_t wx, const dim_t w
af_array output;
switch(type) {
- case f32: output = unwrap<float >(in, wx, wy, sx, sy, px, py); break;
- case f64: output = unwrap<double >(in, wx, wy, sx, sy, px, py); break;
- case c32: output = unwrap<cfloat >(in, wx, wy, sx, sy, px, py); break;
- case c64: output = unwrap<cdouble>(in, wx, wy, sx, sy, px, py); break;
- case s32: output = unwrap<int >(in, wx, wy, sx, sy, px, py); break;
- case u32: output = unwrap<uint >(in, wx, wy, sx, sy, px, py); break;
- case s64: output = unwrap<intl >(in, wx, wy, sx, sy, px, py); break;
- case u64: output = unwrap<uintl >(in, wx, wy, sx, sy, px, py); break;
- case u8: output = unwrap<uchar >(in, wx, wy, sx, sy, px, py); break;
- case b8: output = unwrap<char >(in, wx, wy, sx, sy, px, py); break;
+ case f32: output = unwrap<float >(in, wx, wy, sx, sy, px, py, is_column); break;
+ case f64: output = unwrap<double >(in, wx, wy, sx, sy, px, py, is_column); break;
+ case c32: output = unwrap<cfloat >(in, wx, wy, sx, sy, px, py, is_column); break;
+ case c64: output = unwrap<cdouble>(in, wx, wy, sx, sy, px, py, is_column); break;
+ case s32: output = unwrap<int >(in, wx, wy, sx, sy, px, py, is_column); break;
+ case u32: output = unwrap<uint >(in, wx, wy, sx, sy, px, py, is_column); break;
+ case s64: output = unwrap<intl >(in, wx, wy, sx, sy, px, py, is_column); break;
+ case u64: output = unwrap<uintl >(in, wx, wy, sx, sy, px, py, is_column); break;
+ case u8: output = unwrap<uchar >(in, wx, wy, sx, sy, px, py, is_column); break;
+ case b8: output = unwrap<char >(in, wx, wy, sx, sy, px, py, is_column); break;
default: TYPE_ERROR(1, type);
}
std::swap(*out,output);
diff --git a/src/api/cpp/unwrap.cpp b/src/api/cpp/unwrap.cpp
index 7a1a103..d48d312 100644
--- a/src/api/cpp/unwrap.cpp
+++ b/src/api/cpp/unwrap.cpp
@@ -14,11 +14,10 @@
namespace af
{
array unwrap(const array& in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
{
af_array out = 0;
- AF_THROW(af_unwrap(&out, in.get(), wx, wy, sx, sy, px, py));
+ AF_THROW(af_unwrap(&out, in.get(), wx, wy, sx, sy, px, py, is_column));
return array(out);
}
}
-
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp
index 6d206b2..0de292e 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/unwrap.cpp
@@ -16,11 +16,11 @@
namespace cpu
{
- template<typename T>
- void unwrap_(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
- const af::dim4 &ostrides, const af::dim4 &istrides,
- const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
- const dim_t px, const dim_t py)
+ template<typename T, int d>
+ void unwrap_dim(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
+ const af::dim4 &ostrides, const af::dim4 &istrides,
+ const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
+ const dim_t px, const dim_t py)
{
dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
@@ -30,11 +30,11 @@ namespace cpu
dim_t cOut = w * ostrides[3] + z * ostrides[2];
dim_t cIn = w * istrides[3] + z * istrides[2];
const T* iptr = inPtr + cIn;
- T* optr_= outPtr + cOut;
+ T* optr_= outPtr + cOut;
- for(dim_t col = 0; col < odims[1]; col++) {
+ for(dim_t col = 0; col < odims[d]; col++) {
// Offset output ptr
- T* optr = optr_ + col * ostrides[1];
+ T* optr = optr_ + col * ostrides[d];
// Calculate input window index
dim_t winy = (col / nx);
@@ -56,7 +56,9 @@ namespace cpu
dim_t xpad = spx + x;
dim_t ypad = spy + y;
- dim_t oloc = (y * wx + x) * ostrides[0];
+ dim_t oloc = (y * wx + x);
+ if (d == 0) oloc *= ostrides[1];
+
if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) {
dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]);
optr[oloc] = iptr[iloc];
@@ -72,7 +74,7 @@ namespace cpu
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
{
af::dim4 idims = in.dims();
@@ -81,25 +83,32 @@ namespace cpu
af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
+ if (!is_column) {
+ std::swap(odims[0], odims[1]);
+ }
+
// Create output placeholder
Array<T> outArray = createEmptyArray<T>(odims);
// Get pointers to raw data
const T *inPtr = in.get();
- T *outPtr = outArray.get();
+ T *outPtr = outArray.get();
af::dim4 ostrides = outArray.strides();
af::dim4 istrides = in.strides();
- unwrap_(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py);
-
+ if (is_column) {
+ unwrap_dim<T, 1>(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py);
+ } else {
+ unwrap_dim<T, 0>(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py);
+ }
return outArray;
}
#define INSTANTIATE(T) \
template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy, \
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
INSTANTIATE(float)
@@ -113,4 +122,3 @@ namespace cpu
INSTANTIATE(uchar)
INSTANTIATE(char)
}
-
diff --git a/src/backend/cpu/unwrap.hpp b/src/backend/cpu/unwrap.hpp
index 7b5ea75..447fcfe 100644
--- a/src/backend/cpu/unwrap.hpp
+++ b/src/backend/cpu/unwrap.hpp
@@ -13,6 +13,5 @@ namespace cpu
{
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
}
-
diff --git a/src/backend/cuda/kernel/unwrap.hpp b/src/backend/cuda/kernel/unwrap.hpp
index e012340..410d94f 100644
--- a/src/backend/cuda/kernel/unwrap.hpp
+++ b/src/backend/cuda/kernel/unwrap.hpp
@@ -12,19 +12,20 @@
#include <err_cuda.hpp>
#include <debug_cuda.hpp>
#include <math.hpp>
+#include "config.hpp"
namespace cuda
{
namespace kernel
{
///////////////////////////////////////////////////////////////////////////
- // Resize Kernel
+ // Unwrap Kernel
///////////////////////////////////////////////////////////////////////////
- template<typename T, int TX>
+ template<typename T, bool is_column>
__global__
void unwrap_kernel(Param<T> out, CParam<T> in,
const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
- const dim_t px, const dim_t py, const dim_t nx, dim_t repsPerColumn)
+ const dim_t px, const dim_t py, const dim_t nx, dim_t reps)
{
// Compute channel and volume
const dim_t w = blockIdx.y / in.dims[2];
@@ -38,69 +39,106 @@ namespace cuda
const dim_t cIn = w * in.strides[3] + z * in.strides[2];
// Compute the output column index
- const dim_t colId = blockIdx.x * blockDim.y + threadIdx.y;
+ const dim_t id = is_column ?
+ (blockIdx.x * blockDim.y + threadIdx.y) :
+ (blockIdx.x * blockDim.x + threadIdx.x);
- if(colId >= out.dims[1])
- return;
+ if (id >= (is_column ? out.dims[1] : out.dims[0])) return;
// Compute the starting index of window in x and y of input
- const dim_t startx = (colId % nx) * sx;
- const dim_t starty = (colId / nx) * sy;
+ const dim_t startx = (id % nx) * sx;
+ const dim_t starty = (id / nx) * sy;
const dim_t spx = startx - px;
const dim_t spy = starty - py;
// Offset the global pointers to the respective starting indices
- T* optr = out.ptr + cOut + colId * out.strides[1];
+ T* optr = out.ptr + cOut + id * (is_column ? out.strides[1] : 1);
const T* iptr = in.ptr + cIn;
bool cond = (spx >= 0 && spx + wx < in.dims[0] && spy >= 0 && spy + wy < in.dims[1]);
- for(int i = 0; i < repsPerColumn; i++) {
+ for(int i = 0; i < reps; i++) {
+
// Compute output index local to column
- const dim_t colIndex = i * TX + threadIdx.x;
+ const dim_t outIdx = is_column ?
+ (i * blockDim.x + threadIdx.x) :
+ (i * blockDim.y + threadIdx.y);
- if(colIndex >= out.dims[0])
+ if(outIdx >= (is_column ? out.dims[0] : out.dims[1]))
return;
// Compute input index local to window
- const dim_t x = colIndex % wx;
- const dim_t y = colIndex / wx;
+ const dim_t x = outIdx % wx;
+ const dim_t y = outIdx / wx;
const dim_t xpad = spx + x;
const dim_t ypad = spy + y;
- const dim_t outIdx = (y * wx + x) * out.strides[0];
-
// Copy
T val = scalar<T>(0.0);
if(cond || (xpad >= 0 && xpad < in.dims[0] && ypad >= 0 && ypad < in.dims[1])) {
const dim_t inIdx = ypad * in.strides[1] + xpad;
val = iptr[inIdx];
}
- optr[outIdx] = val;
+
+ if (is_column) {
+ optr[outIdx] = val;
+ } else {
+ optr[outIdx * out.strides[1]] = val;
+ }
}
}
///////////////////////////////////////////////////////////////////////////
// Wrapper functions
///////////////////////////////////////////////////////////////////////////
- template <typename T, int TX>
- void unwrap(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const dim_t nx)
+ template <typename T>
+ void unwrap_col(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
+ const dim_t sx, const dim_t sy,
+ const dim_t px, const dim_t py, const dim_t nx)
{
- dim3 threads(TX, 256 / TX, 1);
-
- dim_t repsPerColumn = 1;
- if(TX == 256 && wx * wy > 256) {
- repsPerColumn = divup((wx * wy), 256);
- }
+ dim_t TX = std::min(THREADS_PER_BLOCK, nextpow2(out.dims[0]));
+ dim3 threads(TX, THREADS_PER_BLOCK / TX);
dim3 blocks(divup(out.dims[1], threads.y), out.dims[2] * out.dims[3]);
- CUDA_LAUNCH((unwrap_kernel<T, TX>), blocks, threads,
- out, in, wx, wy, sx, sy, px, py, nx, repsPerColumn);
+ dim_t reps = divup((wx * wy), threads.x); // is > 1 only when TX == 256 && wx * wy > 256
+
+ CUDA_LAUNCH((unwrap_kernel<T, true>), blocks, threads,
+ out, in, wx, wy, sx, sy, px, py, nx, reps);
+
+ POST_LAUNCH_CHECK();
+ }
+
+ template<typename T>
+ void unwrap_row(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
+ const dim_t sx, const dim_t sy,
+ const dim_t px, const dim_t py, const dim_t nx)
+ {
+ dim3 threads(THREADS_X, THREADS_Y);
+ dim3 blocks(divup(out.dims[0], threads.x), out.dims[2] * out.dims[3]);
+
+ dim_t reps = divup((wx * wy), threads.y);
+
+ CUDA_LAUNCH((unwrap_kernel<T, false>), blocks, threads,
+ out, in, wx, wy, sx, sy, px, py, nx, reps);
+
POST_LAUNCH_CHECK();
}
+
+ template <typename T>
+ void unwrap(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
+ const dim_t sx, const dim_t sy,
+ const dim_t px, const dim_t py, const dim_t nx, const bool is_column)
+ {
+
+ if (is_column) {
+ unwrap_col<T>(out, in, wx, wy, sx, sy, px, py, nx);
+ } else {
+ unwrap_row<T>(out, in, wx, wy, sx, sy, px, py, nx);
+ }
+ }
+
}
}
diff --git a/src/backend/cuda/unwrap.cu b/src/backend/cuda/unwrap.cu
index 3164e87..5fdfc0e 100644
--- a/src/backend/cuda/unwrap.cu
+++ b/src/backend/cuda/unwrap.cu
@@ -17,37 +17,31 @@ namespace cuda
{
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
{
af::dim4 idims = in.dims();
dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
dim_t ny = (idims[1] + 2 * py - wy) / sx + 1;
- af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
+ af::dim4 odims;
- // Create output placeholder
- Array<T> outArray = createEmptyArray<T>(odims);
-
- if(odims[0] <= 16) {
- kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy, px, py, nx);
- } else if (odims[0] <= 32) {
- kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy, px, py, nx);
- } else if (odims[0] <= 64) {
- kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy, px, py, nx);
- } else if(odims[0] <= 128) {
- kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy, px, py, nx);
+ if (is_column) {
+ odims = dim4(wx * wy, nx * ny, idims[2], idims[3]);
} else {
- kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy, px, py, nx);
+ odims = dim4(nx * ny, wx * wy, idims[2], idims[3]);
}
+ // Create output placeholder
+ Array<T> outArray = createEmptyArray<T>(odims);
+ kernel::unwrap<T>(outArray, in, wx, wy, sx, sy, px, py, nx, is_column);
return outArray;
}
#define INSTANTIATE(T) \
template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy, \
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
INSTANTIATE(float)
diff --git a/src/backend/cuda/unwrap.hpp b/src/backend/cuda/unwrap.hpp
index 0217c4b..7105585 100644
--- a/src/backend/cuda/unwrap.hpp
+++ b/src/backend/cuda/unwrap.hpp
@@ -13,6 +13,5 @@ namespace cuda
{
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
}
-
diff --git a/src/backend/opencl/kernel/unwrap.cl b/src/backend/opencl/kernel/unwrap.cl
index 61aab1a..6ffd1e4 100644
--- a/src/backend/opencl/kernel/unwrap.cl
+++ b/src/backend/opencl/kernel/unwrap.cl
@@ -7,24 +7,11 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-#define divup(a, b) (((a)+(b)-1)/(b))
-
-#if CPLX
-#define set(a, b) a = b
-#define set_scalar(a, b) do { \
- a.x = b; \
- a.y = 0; \
- } while(0)
-#else
-#define set(a, b) a = b
-#define set_scalar(a, b) a = b
-#endif
-
__kernel
void unwrap_kernel(__global T *d_out, const KParam out,
__global const T *d_in, const KParam in,
const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
- const dim_t px, const dim_t py, const dim_t nx, const dim_t repsPerColumn)
+ const dim_t px, const dim_t py, const dim_t nx, const dim_t reps)
{
// Compute channel and volume
const dim_t w = get_group_id(1) / in.dims[2];
@@ -38,46 +25,53 @@ void unwrap_kernel(__global T *d_out, const KParam out,
const dim_t cIn = w * in.strides[3] + z * in.strides[2];
// Compute the output column index
- const dim_t colId = get_group_id(0) * get_local_size(1) + get_local_id(1);
+ const dim_t id = is_column ?
+ (get_group_id(0) * get_local_size(1) + get_local_id(1)) :
+ get_global_id(0);
- if(colId >= out.dims[1])
- return;
+ if (id >= (is_column ? out.dims[1] : out.dims[0])) return;
// Compute the starting index of window in x and y of input
- const dim_t startx = (colId % nx) * sx;
- const dim_t starty = (colId / nx) * sy;
+ const dim_t startx = (id % nx) * sx;
+ const dim_t starty = (id / nx) * sy;
const dim_t spx = startx - px;
const dim_t spy = starty - py;
// Offset the global pointers to the respective starting indices
- __global T* optr = d_out + cOut + colId * out.strides[1];
+ __global T* optr = d_out + cOut + id * (is_column ? out.strides[1] : 1);
__global const T* iptr = d_in + cIn + in.offset;
bool cond = (spx >= 0 && spx + wx < in.dims[0] && spy >= 0 && spy + wy < in.dims[1]);
- for(int i = 0; i < repsPerColumn; i++) {
+ for(int i = 0; i < reps; i++) {
+
// Compute output index local to column
- const dim_t colIndex = i * TX + get_local_id(0);
+ const dim_t outIdx = is_column ?
+ (i * get_local_size(0) + get_local_id(0)) :
+ (i * get_local_size(1) + get_local_id(1));
- if(colIndex >= out.dims[0])
+ if(outIdx >= (is_column ? out.dims[0] : out.dims[1]))
return;
// Compute input index local to window
- const dim_t y = colIndex / wx;
- const dim_t x = colIndex % wx;
+ const dim_t y = outIdx / wx;
+ const dim_t x = outIdx % wx;
const dim_t xpad = spx + x;
const dim_t ypad = spy + y;
- const dim_t outIdx = (y * wx + x) * out.strides[0];
-
// Copy
T val = ZERO;
if(cond || (xpad >= 0 && xpad < in.dims[0] && ypad >= 0 && ypad < in.dims[1])) {
const dim_t inIdx = ypad * in.strides[1] + xpad * in.strides[0];
val = iptr[inIdx];
}
- optr[outIdx] = val;
+
+ if (is_column) {
+ optr[outIdx] = val;
+ } else {
+ optr[outIdx * out.strides[1]] = val;
+ }
}
}
diff --git a/src/backend/opencl/kernel/unwrap.hpp b/src/backend/opencl/kernel/unwrap.hpp
index 51c3297..180dc8a 100644
--- a/src/backend/opencl/kernel/unwrap.hpp
+++ b/src/backend/opencl/kernel/unwrap.hpp
@@ -33,7 +33,7 @@ namespace opencl
{
namespace kernel
{
- template<typename T, int TX>
+ template<typename T, bool is_column>
void unwrap(Param out, const Param in, const dim_t wx, const dim_t wy,
const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const dim_t nx)
{
@@ -48,9 +48,9 @@ namespace opencl
ToNum<T> toNum;
std::ostringstream options;
- options << " -D ZERO=" << toNum(scalar<T>(0));
- options << " -D T=" << dtype_traits<T>::getName();
- options << " -D TX=" << TX;
+ options << " -D is_column=" << is_column
+ << " -D ZERO=" << toNum(scalar<T>(0))
+ << " -D T=" << dtype_traits<T>::getName();
if((af_dtype) dtype_traits<T>::af_type == c32 ||
(af_dtype) dtype_traits<T>::af_type == c64) {
@@ -75,20 +75,29 @@ namespace opencl
const dim_t, const dim_t, const dim_t, const dim_t>
(*unwrapKernels[device]);
- const dim_t TY = 256 / TX;
- dim_t repsPerColumn = 1;
- if(TX == 256 && wx * wy > 256) {
- repsPerColumn = divup((wx * wy), 256);
- }
+ dim_t TX = 1, TY = 1;
+ dim_t BX = 1;
+ const dim_t BY = out.info.dims[2] * out.info.dims[3];
+ dim_t reps = 1;
- NDRange local(TX, TY, 1);
+ if (is_column) {
+ TX = std::min(THREADS_PER_GROUP, nextpow2(out.info.dims[0]));
+ TY = THREADS_PER_GROUP / TX;
+ BX = divup(out.info.dims[1], TY);
+ reps = divup((wx * wy), TX);
+ } else {
+ TX = THREADS_X;
+ TY = THREADS_Y;
+ BX = divup(out.info.dims[0], TX);
+ reps = divup((wx * wy), TY);
+ }
- NDRange global(local[0] * divup(out.info.dims[1], TY),
- local[1] * out.info.dims[2] * out.info.dims[3],
- 1);
+ NDRange local(TX, TY);
+ NDRange global(local[0] * BX,
+ local[1] * BY);
unwrapOp(EnqueueArgs(getQueue(), global, local),
- *out.data, out.info, *in.data, in.info, wx, wy, sx, sy, px, py, nx, repsPerColumn);
+ *out.data, out.info, *in.data, in.info, wx, wy, sx, sy, px, py, nx, reps);
CL_DEBUG_FINISH(getQueue());
} catch (cl::Error err) {
diff --git a/src/backend/opencl/unwrap.cpp b/src/backend/opencl/unwrap.cpp
index da8f8b3..2a1662f 100644
--- a/src/backend/opencl/unwrap.cpp
+++ b/src/backend/opencl/unwrap.cpp
@@ -17,7 +17,7 @@ namespace opencl
{
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
{
af::dim4 idims = in.dims();
@@ -26,19 +26,17 @@ namespace opencl
af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
+ if (!is_column) {
+ std::swap(odims[0], odims[1]);
+ }
+
// Create output placeholder
Array<T> outArray = createEmptyArray<T>(odims);
- if(odims[0] <= 16) {
- kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy, px, py, nx);
- } else if (odims[0] <= 32) {
- kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy, px, py, nx);
- } else if (odims[0] <= 64) {
- kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy, px, py, nx);
- } else if(odims[0] <= 128) {
- kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy, px, py, nx);
+ if (is_column) {
+ kernel::unwrap<T, true >(outArray, in, wx, wy, sx, sy, px, py, nx);
} else {
- kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy, px, py, nx);
+ kernel::unwrap<T, false>(outArray, in, wx, wy, sx, sy, px, py, nx);
}
return outArray;
@@ -47,7 +45,7 @@ namespace opencl
#define INSTANTIATE(T) \
template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy, \
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
INSTANTIATE(float)
@@ -61,4 +59,3 @@ namespace opencl
INSTANTIATE(uchar)
INSTANTIATE(char)
}
-
diff --git a/src/backend/opencl/unwrap.hpp b/src/backend/opencl/unwrap.hpp
index 40efb4c..d8d3d55 100644
--- a/src/backend/opencl/unwrap.hpp
+++ b/src/backend/opencl/unwrap.hpp
@@ -13,6 +13,5 @@ namespace opencl
{
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py);
+ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
}
-
diff --git a/test/unwrap.cpp b/test/unwrap.cpp
index 0602caf..d8238c1 100644
--- a/test/unwrap.cpp
+++ b/test/unwrap.cpp
@@ -54,26 +54,34 @@ void unwrapTest(string pTestFile, const unsigned resultIdx,
af_array inArray = 0;
af_array outArray = 0;
+ af_array outArrayT = 0;
+ af_array outArray2 = 0;
ASSERT_EQ(AF_SUCCESS, af_create_array(&inArray, &(in[0].front()), idims.ndims(), idims.get(), (af_dtype) af::dtype_traits<T>::af_type));
- ASSERT_EQ(AF_SUCCESS, af_unwrap(&outArray, inArray, wx, wy, sx, sy, px, py));
+ ASSERT_EQ(AF_SUCCESS, af_unwrap(&outArray , inArray, wx, wy, sx, sy, px, py, true ));
+ ASSERT_EQ(AF_SUCCESS, af_unwrap(&outArrayT, inArray, wx, wy, sx, sy, px, py, false));
+ ASSERT_EQ(AF_SUCCESS, af_transpose(&outArray2, outArrayT, false));
- // Get result
- T* outData = new T[tests[resultIdx].size()];
- ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outData, outArray));
-
- // Compare result
size_t nElems = tests[resultIdx].size();
+ std::vector<T> outData(nElems);
+
+ // Compare is_column == true results
+ ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)&outData[0], outArray));
for (size_t elIter = 0; elIter < nElems; ++elIter) {
ASSERT_EQ(tests[resultIdx][elIter], outData[elIter]) << "at: " << elIter << std::endl;
}
- // Delete
- delete[] outData;
+ // Compare is_column == false results
+ ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)&outData[0], outArray2));
+ for (size_t elIter = 0; elIter < nElems; ++elIter) {
+ ASSERT_EQ(tests[resultIdx][elIter], outData[elIter]) << "at: " << elIter << std::endl;
+ }
if(inArray != 0) af_release_array(inArray);
if(outArray != 0) af_release_array(outArray);
+ if(outArrayT != 0) af_release_array(outArrayT);
+ if(outArray2 != 0) af_release_array(outArray2);
}
#define UNWRAP_INIT(desc, file, resultIdx, wx, wy, sx, sy, px,py) \
@@ -164,4 +172,3 @@ TEST(Unwrap, CPP)
// Delete
delete[] outData;
}
-
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list