[arrayfire] 08/408: Added CUDA backend for Unwrap
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:03 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 73d9038c4e3afb4dac02e4f9aec5bb56546b4bbc
Author: Shehzan Mohammed <shehzan at arrayfire.com>
Date: Wed Jun 17 16:34:49 2015 -0400
Added CUDA backend for Unwrap
---
src/backend/cpu/unwrap.cpp | 8 ++--
src/backend/cuda/kernel/unwrap.hpp | 89 ++++++++++++++++++++++++++++++++++++++
src/backend/cuda/unwrap.cu | 15 ++++++-
3 files changed, 106 insertions(+), 6 deletions(-)
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp
index 691cf08..3d25661 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/unwrap.cpp
@@ -20,7 +20,7 @@ namespace cpu
const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy)
{
dim_t nx = (idims[0] - wx) / sx + 1;
- dim_t ny = (idims[1] - wy) / sy + 1;
+ //dim_t ny = (idims[1] - wy) / sy + 1;
for(dim_t w = 0; w < odims[3]; w++) {
for(dim_t z = 0; z < odims[2]; z++) {
@@ -28,8 +28,8 @@ namespace cpu
dim_t cIn = w * istrides[3] + z * istrides[2];
for(dim_t col = 0; col < odims[1]; col++) {
// Calculate input window index
- dim_t winy = (col / ny);
- dim_t winx = (col % ny);
+ dim_t winy = (col / nx);
+ dim_t winx = (col % nx);
dim_t startx = winx * sx;
dim_t starty = winy * sy;
@@ -39,7 +39,7 @@ namespace cpu
for(dim_t y = 0; y < wy; y++) {
for(dim_t x = 0; x < wx; x++) {
- dim_t oloc = (y * wy + x) * ostrides[0];
+ dim_t oloc = (y * wx + x) * ostrides[0];
dim_t iloc = (y * istrides[1] + x * istrides[0]);
optr[oloc] = iptr[iloc];
}
diff --git a/src/backend/cuda/kernel/unwrap.hpp b/src/backend/cuda/kernel/unwrap.hpp
new file mode 100644
index 0000000..9ac0663
--- /dev/null
+++ b/src/backend/cuda/kernel/unwrap.hpp
@@ -0,0 +1,89 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <dispatch.hpp>
+#include <Param.hpp>
+#include <err_cuda.hpp>
+#include <debug_cuda.hpp>
+#include <math.hpp>
+
+namespace cuda
+{
+ namespace kernel
+ {
+ ///////////////////////////////////////////////////////////////////////////
+ // Resize Kernel
+ ///////////////////////////////////////////////////////////////////////////
+ template<typename T, int threads>
+ __global__
+ void unwrap_kernel(Param<T> out, CParam<T> in,
+ const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy,
+ dim_t repsPerColumn)
+ {
+ const dim_t w = blockIdx.y / in.dims[2];
+ const dim_t z = blockIdx.y % in.dims[2];
+
+ if(w >= in.dims[3] || z >= in.dims[2])
+ return;
+
+ const dim_t cOut = w * out.strides[3] + z * out.strides[2];
+ const dim_t cIn = w * in.strides[3] + z * in.strides[2];
+
+ const dim_t nx = (in.dims[0] - wx) / sx + 1;
+ //dim_t ny = (in.dims[1] - wy) / sy + 1;
+
+ const dim_t colId = blockIdx.x * blockDim.y + threadIdx.y;
+
+ if(colId >= out.dims[1])
+ return;
+
+ const dim_t startx = (colId % nx) * sx;
+ const dim_t starty = (colId / nx) * sy;
+
+ T* optr = out.ptr + cOut + colId * out.strides[1];
+ const T* iptr = in.ptr + cIn + starty * in.strides[1] + startx;
+
+ for(int i = 0; i < repsPerColumn; i++) {
+ const dim_t colIndex = i * threads + threadIdx.x;
+
+ if(colIndex >= out.dims[0])
+ return;
+
+ const dim_t x = colIndex % wx;
+ const dim_t y = colIndex / wx;
+
+ const dim_t outIdx = (y * wx + x) * out.strides[0];
+ const dim_t inIdx = y * in.strides[1] + x * in.strides[0];
+
+ optr[outIdx] = iptr[inIdx];
+ }
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Wrapper functions
+ ///////////////////////////////////////////////////////////////////////////
+ template <typename T, int TX>
+ void unwrap(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
+ const dim_t sx, const dim_t sy)
+ {
+ dim3 threads(TX, 256 / TX, 1);
+
+ dim_t repsPerColumn = 1;
+ if(TX == 256 && wx * wy > 256) {
+ repsPerColumn = (wx * wy) / 256;
+ }
+
+ dim3 blocks(divup(out.dims[1], threads.y), out.dims[2] * out.dims[3]);
+
+ unwrap_kernel<T, TX><<<blocks, threads>>>(out, in, wx, wy, sx, sy, repsPerColumn);
+ POST_LAUNCH_CHECK();
+ }
+ }
+}
+
diff --git a/src/backend/cuda/unwrap.cu b/src/backend/cuda/unwrap.cu
index d44ffd8..b180420 100644
--- a/src/backend/cuda/unwrap.cu
+++ b/src/backend/cuda/unwrap.cu
@@ -9,7 +9,7 @@
#include <Array.hpp>
#include <unwrap.hpp>
-//#include <kernel/unwrap.hpp>
+#include <kernel/unwrap.hpp>
#include <stdexcept>
#include <err_cuda.hpp>
@@ -29,6 +29,18 @@ namespace cuda
// Create output placeholder
Array<T> outArray = createEmptyArray<T>(odims);
+ if(odims[0] <= 16) {
+ kernel::unwrap<T, 16 >(outArray, in, wx, wy, sx, sy);
+ } else if (odims[0] <= 32) {
+ kernel::unwrap<T, 32 >(outArray, in, wx, wy, sx, sy);
+ } else if (odims[0] <= 64) {
+ kernel::unwrap<T, 64 >(outArray, in, wx, wy, sx, sy);
+ } else if(odims[0] <= 128) {
+ kernel::unwrap<T, 128>(outArray, in, wx, wy, sx, sy);
+ } else {
+ kernel::unwrap<T, 256>(outArray, in, wx, wy, sx, sy);
+ }
+
return outArray;
}
@@ -49,4 +61,3 @@ namespace cuda
INSTANTIATE(uchar)
INSTANTIATE(char)
}
-
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list