[arrayfire] 283/408: FEAT: wrap for CUDA backend
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:14 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 1909fa5e11a710bea56729860d5ead19629c88ca
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date: Tue Aug 18 21:39:15 2015 -0400
FEAT: wrap for CUDA backend
- Additionally cleaned up wrap tests to be more modular
---
src/backend/cuda/kernel/wrap.hpp | 113 +++++++++++++++++++++++++++++++++++++++
src/backend/cuda/wrap.cu | 8 ++-
test/wrap.cpp | 107 ++++++++++++++++++------------------
3 files changed, 174 insertions(+), 54 deletions(-)
diff --git a/src/backend/cuda/kernel/wrap.hpp b/src/backend/cuda/kernel/wrap.hpp
new file mode 100644
index 0000000..e02dcb2
--- /dev/null
+++ b/src/backend/cuda/kernel/wrap.hpp
@@ -0,0 +1,113 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <dispatch.hpp>
+#include <Param.hpp>
+#include <err_cuda.hpp>
+#include <debug_cuda.hpp>
+#include <math.hpp>
+#include "config.hpp"
+#include "atomics.hpp"
+
+namespace cuda
+{
+ namespace kernel
+ {
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Wrap Kernel
+ ///////////////////////////////////////////////////////////////////////////
+ template<typename T, bool is_column>
+ __global__
+ void wrap_kernel(Param<T> out, CParam<T> in,
+ const dim_t wx, const dim_t wy,
+ const dim_t sx, const dim_t sy,
+ const dim_t px, const dim_t py,
+ const dim_t nx, const dim_t ny,
+ dim_t blocks_x,
+ dim_t blocks_y)
+ {
+ dim_t idx2 = blockIdx.x / blocks_x;
+ dim_t idx3 = blockIdx.y / blocks_y;
+
+ dim_t blockIdx_x = blockIdx.x - idx2 * blocks_x;
+ dim_t blockIdx_y = blockIdx.y - idx3 * blocks_y;
+
+ dim_t oidx0 = threadIdx.x + blockDim.x * blockIdx_x;
+ dim_t oidx1 = threadIdx.y + blockDim.y * blockIdx_y;
+
+ T *optr = out.ptr + idx2 * out.strides[2] + idx3 * out.strides[3];
+ const T *iptr = in.ptr + idx2 * in.strides[2] + idx3 * in.strides[3];
+
+
+ if (oidx0 >= out.dims[0] || oidx1 >= out.dims[1]) return;
+
+ dim_t pidx0 = oidx0 + px;
+ dim_t pidx1 = oidx1 + py;
+
+ // The last time a value appears in the unwrapped index is padded_index / stride
+ // Each previous index has the value appear "stride" locations earlier
+ // We work our way back from the last index
+
+ const dim_t x_end = min(pidx0 / sx, nx - 1);
+ const dim_t y_end = min(pidx1 / sy, ny - 1);
+
+ const dim_t x_off = pidx0 - sx * x_end;
+ const dim_t y_off = pidx1 - sy * y_end;
+
+ T val = scalar<T>(0);
+ dim_t idx = 1;
+
+ for (dim_t y = y_end, yo = y_off; y >= 0 && yo < wy; yo += sy, y--) {
+ dim_t win_end_y = yo * wx;
+ dim_t dim_end_y = y * nx;
+
+ for (dim_t x = x_end, xo = x_off; x >= 0 && xo < wx; xo += sx, x--) {
+
+ dim_t win_end = win_end_y + xo;
+ dim_t dim_end = dim_end_y + x;
+
+ if (is_column) {
+ idx = dim_end * in.strides[1] + win_end;
+ } else {
+ idx = dim_end + win_end * in.strides[1];
+ }
+
+ val = val + iptr[idx];
+ }
+ }
+
+ optr[oidx1 * out.strides[1] + oidx0] = val;
+ }
+
+ template <typename T>
+ void wrap(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
+ const dim_t sx, const dim_t sy,
+ const dim_t px, const dim_t py,
+ const bool is_column)
+ {
+ dim_t nx = (out.dims[0] + 2 * px - wx) / sx + 1;
+ dim_t ny = (out.dims[1] + 2 * py - wy) / sy + 1;
+
+ dim3 threads(THREADS_X, THREADS_Y);
+ dim_t blocks_x = divup(out.dims[0], threads.x);
+ dim_t blocks_y = divup(out.dims[1], threads.y);
+
+ dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);
+
+ if (is_column) {
+ CUDA_LAUNCH((wrap_kernel<T, true >), blocks, threads,
+ out, in, wx, wy, sx, sy, px, py, nx, ny, blocks_x, blocks_y);
+ } else {
+ CUDA_LAUNCH((wrap_kernel<T, false>), blocks, threads,
+ out, in, wx, wy, sx, sy, px, py, nx, ny, blocks_x, blocks_y);
+ }
+ }
+ }
+}
diff --git a/src/backend/cuda/wrap.cu b/src/backend/cuda/wrap.cu
index f7da711..a1e70fc 100644
--- a/src/backend/cuda/wrap.cu
+++ b/src/backend/cuda/wrap.cu
@@ -13,6 +13,7 @@
#include <err_cuda.hpp>
#include <dispatch.hpp>
#include <math.hpp>
+#include <kernel/wrap.hpp>
namespace cuda
{
@@ -25,7 +26,12 @@ namespace cuda
const dim_t px, const dim_t py,
const bool is_column)
{
- CUDA_NOT_SUPPORTED();
+ af::dim4 idims = in.dims();
+ af::dim4 odims(ox, oy, idims[2], idims[3]);
+ Array<T> out = createValueArray<T>(odims, scalar<T>(0));
+
+ kernel::wrap<T>(out, in, wx, wy, sx, sy, px, py, is_column);
+ return out;
}
diff --git a/test/wrap.cpp b/test/wrap.cpp
index 3d5f735..8d26a5a 100644
--- a/test/wrap.cpp
+++ b/test/wrap.cpp
@@ -70,7 +70,8 @@ template<typename T>
void wrapTest(const dim_t ix, const dim_t iy,
const dim_t wx, const dim_t wy,
const dim_t sx, const dim_t sy,
- const dim_t px, const dim_t py)
+ const dim_t px, const dim_t py,
+ bool cond)
{
if (noDoubleTests<T>()) return;
@@ -110,69 +111,69 @@ void wrapTest(const dim_t ix, const dim_t iy,
af::array factor(ix, iy, &h_factor[0]);
- for (int i = 0; i <= 1; i++)
- {
- bool cond = i == 0;
- af::array in_dim = af::unwrap(in, wx, wy, sx, sy, px, py, cond);
- af::array res_dim = af::wrap(in_dim, ix, iy, wx, wy, sx, sy, px, py, cond);
+ af::array in_dim = af::unwrap(in, wx, wy, sx, sy, px, py, cond);
+ af::array res_dim = af::wrap(in_dim, ix, iy, wx, wy, sx, sy, px, py, cond);
- ASSERT_EQ(in.elements(), res_dim.elements());
+ ASSERT_EQ(in.elements(), res_dim.elements());
- std::vector<T> h_res(ix * iy);
- res_dim.host(&h_res[0]);
+ std::vector<T> h_res(ix * iy);
+ res_dim.host(&h_res[0]);
- for (int n = 0; n < nc; n++) {
- T *iptr = &h_in[n * ix * iy];
- T *rptr = &h_res[n * ix * iy];
+ for (int n = 0; n < nc; n++) {
+ T *iptr = &h_in[n * ix * iy];
+ T *rptr = &h_res[n * ix * iy];
- for (int y = 0; y < iy; y++) {
- for (int x = 0; x < ix; x++) {
+ for (int y = 0; y < iy; y++) {
+ for (int x = 0; x < ix; x++) {
- // FIXME: Use a better test
- T ival = iptr[y * ix + x];
- T rval = rptr[y * ix + x];
- int factor = h_factor[y * ix + x];
+ // FIXME: Use a better test
+ T ival = iptr[y * ix + x];
+ T rval = rptr[y * ix + x];
+ int factor = h_factor[y * ix + x];
- if (abs(ival) == 0) continue;
+ if (abs(ival) == 0) continue;
- ASSERT_NEAR(get_val<T>(ival * factor), get_val<T>(rval), 1E-5)
- << "at " << x << "," << y << std::endl;
- }
+ ASSERT_NEAR(get_val<T>(ival * factor), get_val<T>(rval), 1E-5)
+ << "at " << x << "," << y << " for cond == " << cond << std::endl;
}
-
}
+
}
}
-#define WRAP_INIT(desc, ix, iy, wx, wy, sx, sy, px,py) \
- TYPED_TEST(Wrap, desc) \
- { \
- wrapTest<TypeParam>(ix, iy, wx, wy, sx, sy, px, py); \
+#define WRAP_INIT(desc, ix, iy, wx, wy, sx, sy, px,py) \
+ TYPED_TEST(Wrap, Col##desc) \
+ { \
+ wrapTest<TypeParam>(ix, iy, wx, wy, sx, sy, px, py, true ); \
+ } \
+ TYPED_TEST(Wrap, Row##desc) \
+ { \
+ wrapTest<TypeParam>(ix, iy, wx, wy, sx, sy, px, py, false); \
}
- WRAP_INIT(Wrap00, 300, 100, 3, 3, 1, 1, 0, 0);
- WRAP_INIT(Wrap01, 300, 100, 3, 3, 1, 1, 1, 1);
- WRAP_INIT(Wrap03, 300, 100, 3, 3, 2, 2, 0, 0);
- WRAP_INIT(Wrap04, 300, 100, 3, 3, 2, 2, 1, 1);
- WRAP_INIT(Wrap05, 300, 100, 3, 3, 2, 2, 2, 2);
- WRAP_INIT(Wrap06, 300, 100, 3, 3, 3, 3, 0, 0);
- WRAP_INIT(Wrap07, 300, 100, 3, 3, 3, 3, 1, 1);
- WRAP_INIT(Wrap08, 300, 100, 3, 3, 3, 3, 2, 2);
- WRAP_INIT(Wrap09, 300, 100, 4, 4, 1, 1, 0, 0);
- WRAP_INIT(Wrap13, 300, 100, 4, 4, 2, 2, 0, 0);
- WRAP_INIT(Wrap14, 300, 100, 4, 4, 2, 2, 1, 1);
- WRAP_INIT(Wrap15, 300, 100, 4, 4, 2, 2, 2, 2);
- WRAP_INIT(Wrap16, 300, 100, 4, 4, 2, 2, 3, 3);
- WRAP_INIT(Wrap17, 300, 100, 4, 4, 4, 4, 0, 0);
- WRAP_INIT(Wrap18, 300, 100, 4, 4, 4, 4, 1, 1);
- WRAP_INIT(Wrap19, 300, 100, 4, 4, 4, 4, 2, 2);
- WRAP_INIT(Wrap27, 300, 100, 8, 8, 8, 8, 0, 0);
- WRAP_INIT(Wrap28, 300, 100, 8, 8, 8, 8, 7, 7);
- WRAP_INIT(Wrap31, 300, 100, 12, 12, 12, 12, 0, 0);
- WRAP_INIT(Wrap32, 300, 100, 12, 12, 12, 12, 2, 2);
- WRAP_INIT(Wrap35, 300, 100, 16, 16, 16, 16, 15, 15);
- WRAP_INIT(Wrap36, 300, 100, 31, 31, 8, 8, 15, 15);
- WRAP_INIT(Wrap39, 300, 100, 8, 12, 8, 12, 0, 0);
- WRAP_INIT(Wrap40, 300, 100, 8, 12, 8, 12, 7, 11);
- WRAP_INIT(Wrap43, 300, 100, 15, 10, 15, 10, 0, 0);
- WRAP_INIT(Wrap44, 300, 100, 15, 10, 15, 10, 14, 9);
+ WRAP_INIT(00, 300, 100, 3, 3, 1, 1, 0, 0);
+ WRAP_INIT(01, 300, 100, 3, 3, 1, 1, 1, 1);
+ WRAP_INIT(03, 300, 100, 3, 3, 2, 2, 0, 0);
+ WRAP_INIT(04, 300, 100, 3, 3, 2, 2, 1, 1);
+ WRAP_INIT(05, 300, 100, 3, 3, 2, 2, 2, 2);
+ WRAP_INIT(06, 300, 100, 3, 3, 3, 3, 0, 0);
+ WRAP_INIT(07, 300, 100, 3, 3, 3, 3, 1, 1);
+ WRAP_INIT(08, 300, 100, 3, 3, 3, 3, 2, 2);
+ WRAP_INIT(09, 300, 100, 4, 4, 1, 1, 0, 0);
+ WRAP_INIT(13, 300, 100, 4, 4, 2, 2, 0, 0);
+ WRAP_INIT(14, 300, 100, 4, 4, 2, 2, 1, 1);
+ WRAP_INIT(15, 300, 100, 4, 4, 2, 2, 2, 2);
+ WRAP_INIT(16, 300, 100, 4, 4, 2, 2, 3, 3);
+ WRAP_INIT(17, 300, 100, 4, 4, 4, 4, 0, 0);
+ WRAP_INIT(18, 300, 100, 4, 4, 4, 4, 1, 1);
+ WRAP_INIT(19, 300, 100, 4, 4, 4, 4, 2, 2);
+ WRAP_INIT(27, 300, 100, 8, 8, 8, 8, 0, 0);
+ WRAP_INIT(28, 300, 100, 8, 8, 8, 8, 7, 7);
+ WRAP_INIT(31, 300, 100, 12, 12, 12, 12, 0, 0);
+ WRAP_INIT(32, 300, 100, 12, 12, 12, 12, 2, 2);
+ WRAP_INIT(35, 300, 100, 16, 16, 16, 16, 15, 15);
+ WRAP_INIT(36, 300, 100, 31, 31, 8, 8, 15, 15);
+ WRAP_INIT(39, 300, 100, 8, 12, 8, 12, 0, 0);
+ WRAP_INIT(40, 300, 100, 8, 12, 8, 12, 7, 11);
+ WRAP_INIT(43, 300, 100, 15, 10, 15, 10, 0, 0);
+ WRAP_INIT(44, 300, 100, 15, 10, 15, 10, 14, 9);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list