[arrayfire] 185/408: Wrapped cuda kernel launches with CUDA_LAUNCH macro
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:53 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 136344180037b1a6f350bf3509674b15f44c21b9
Author: pradeep <pradeep at arrayfire.com>
Date: Tue Aug 4 17:42:05 2015 -0400
Wrapped cuda kernel launches with CUDA_LAUNCH macro
---
src/backend/cuda/kernel/convolve.cu | 16 +++++++---------
src/backend/cuda/kernel/convolve_separable.cu | 2 +-
src/backend/cuda/kernel/scan_dim.hpp | 2 +-
src/backend/cuda/kernel/transpose.hpp | 4 ++--
4 files changed, 11 insertions(+), 13 deletions(-)
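The patch replaces the raw triple-chevron launch syntax with the project's CUDA_LAUNCH and CUDA_LAUNCH_SMEM macros. The macro definitions are not part of this commit; as a rough sketch (an assumption, not ArrayFire's actual code, which may also inject a stream or instrumentation), they would simply forward the launch configuration, with the extra parentheses at each call site protecting template argument lists that contain commas:

    /* Illustrative sketch only -- the real definitions live elsewhere in the
     * ArrayFire CUDA backend. */
    #define CUDA_LAUNCH(fn, blks, thrds, ...) \
        fn<<<blks, thrds>>>(__VA_ARGS__)

    #define CUDA_LAUNCH_SMEM(fn, blks, thrds, smem, ...) \
        fn<<<blks, thrds, smem>>>(__VA_ARGS__)

Routing every launch through one macro gives a single place to later add a stream argument or per-launch debugging without touching each call site.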
diff --git a/src/backend/cuda/kernel/convolve.cu b/src/backend/cuda/kernel/convolve.cu
index 7cc9fd2..d1f894e 100644
--- a/src/backend/cuda/kernel/convolve.cu
+++ b/src/backend/cuda/kernel/convolve.cu
@@ -300,8 +300,8 @@ void prepareKernelArgs(conv_kparam_t &params, dim_t oDims[], dim_t fDims[], int
template<typename T, typename aT, bool expand, int f0, int f1>
void conv2Helper(const conv_kparam_t &p, Param<T> out, CParam<T> sig)
{
- (convolve2<T, aT, expand, f0, f1>)
- <<<p.mBlocks, p.mThreads>>>(out, sig, p.mBlk_x, p.mBlk_y, p.o[1], p.o[2], p.s[1], p.s[2]);
+ CUDA_LAUNCH((convolve2<T, aT, expand, f0, f1>), p.mBlocks, p.mThreads,
+ out, sig, p.mBlk_x, p.mBlk_y, p.o[1], p.o[2], p.s[1], p.s[2]);
POST_LAUNCH_CHECK();
}
@@ -381,10 +381,9 @@ void convolve_1d(conv_kparam_t &p, Param<T> out, CParam<T> sig, CParam<aT> filt)
p.s[1] = (p.inHasNoOffset ? 0 : b2);
p.s[2] = (p.inHasNoOffset ? 0 : b3);
- (convolve1<T, aT, expand>)
- <<<p.mBlocks, p.mThreads, p.mSharedSize>>>
- (out, sig, filt.dims[0], p.mBlk_x, p.mBlk_y,
- p.o[0], p.o[1], p.o[2], p.s[0], p.s[1], p.s[2]);
+ CUDA_LAUNCH_SMEM((convolve1<T, aT, expand>), p.mBlocks, p.mThreads, p.mSharedSize,
+ out, sig, filt.dims[0], p.mBlk_x, p.mBlk_y,
+ p.o[0], p.o[1], p.o[2], p.s[0], p.s[1], p.s[2]);
POST_LAUNCH_CHECK();
}
@@ -442,9 +441,8 @@ void convolve_3d(conv_kparam_t &p, Param<T> out, CParam<T> sig, CParam<aT> filt)
p.o[2] = (p.outHasNoOffset ? 0 : b3);
p.s[2] = (p.inHasNoOffset ? 0 : b3);
- (convolve3<T, aT, expand>)
- <<<p.mBlocks, p.mThreads, p.mSharedSize>>>
- (out, sig, filt.dims[0], filt.dims[1], filt.dims[2], p.mBlk_x, p.o[2], p.s[2]);
+ CUDA_LAUNCH_SMEM((convolve3<T, aT, expand>), p.mBlocks, p.mThreads, p.mSharedSize,
+ out, sig, filt.dims[0], filt.dims[1], filt.dims[2], p.mBlk_x, p.o[2], p.s[2]);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/convolve_separable.cu b/src/backend/cuda/kernel/convolve_separable.cu
index 9b3092d..0b5f596 100644
--- a/src/backend/cuda/kernel/convolve_separable.cu
+++ b/src/backend/cuda/kernel/convolve_separable.cu
@@ -111,7 +111,7 @@ void convolve2_separable(Param<T> out, CParam<T> signal, int nBBS0, int nBBS1)
template<typename T, typename aT, int cDim, bool expand, int f>
void conv2Helper(dim3 blks, dim3 thrds, Param<T> out, CParam<T> sig, int nBBS0, int nBBS1)
{
- (convolve2_separable<T, aT, cDim, expand, f>)<<<blks, thrds>>>(out, sig, nBBS0, nBBS1);
+ CUDA_LAUNCH((convolve2_separable<T, aT, cDim, expand, f>), blks, thrds, out, sig, nBBS0, nBBS1);
}
template<typename T, typename accType, int conv_dim, bool expand>
diff --git a/src/backend/cuda/kernel/scan_dim.hpp b/src/backend/cuda/kernel/scan_dim.hpp
index 6bc0469..af90de0 100644
--- a/src/backend/cuda/kernel/scan_dim.hpp
+++ b/src/backend/cuda/kernel/scan_dim.hpp
@@ -226,7 +226,7 @@ namespace kernel
uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));
- (bcast_dim_kernel<To, op, dim>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((bcast_dim_kernel<To, op, dim>), blocks, threads,
out, tmp, blocks_all[0], blocks_all[1], blocks_all[dim], lim);
POST_LAUNCH_CHECK();
diff --git a/src/backend/cuda/kernel/transpose.hpp b/src/backend/cuda/kernel/transpose.hpp
index 86c5e9b..d8dfb7d 100644
--- a/src/backend/cuda/kernel/transpose.hpp
+++ b/src/backend/cuda/kernel/transpose.hpp
@@ -103,9 +103,9 @@ namespace kernel
dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);
if (in.dims[0] % TILE_DIM == 0 && in.dims[1] % TILE_DIM == 0)
- (transpose<T, conjugate, true >)<<<blocks, threads>>>(out, in, blk_x, blk_y);
+ CUDA_LAUNCH((transpose<T, conjugate, true >), blocks, threads, out, in, blk_x, blk_y);
else
- (transpose<T, conjugate, false>)<<<blocks, threads>>>(out, in, blk_x, blk_y);
+ CUDA_LAUNCH((transpose<T, conjugate, false>), blocks, threads, out, in, blk_x, blk_y);
POST_LAUNCH_CHECK();
}
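For completeness, a minimal self-contained call-site sketch, assuming the macro definition guessed above; the kernel and host function here (scale_kernel, scale) are hypothetical and only illustrate how a wrapped launch followed by an error check reads. POST_LAUNCH_CHECK() in the patch presumably performs a similar cudaGetLastError() check.

    #include <cstdio>
    #include <cuda_runtime.h>

    /* Assumed macro, matching the sketch above. */
    #define CUDA_LAUNCH(fn, blks, thrds, ...) fn<<<blks, thrds>>>(__VA_ARGS__)

    __global__ void scale_kernel(float *data, float factor, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) data[i] *= factor;
    }

    void scale(float *d_data, float factor, int n)
    {
        dim3 threads(256);
        dim3 blocks((n + threads.x - 1) / threads.x);
        /* Expands to scale_kernel<<<blocks, threads>>>(d_data, factor, n). */
        CUDA_LAUNCH(scale_kernel, blocks, threads, d_data, factor, n);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    }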
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git