[arrayfire] 182/408: Changed default cuda stream to be non-zero
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:52 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 3ede60b078ec7f3fe45369b1dc643c09cd5005e6
Author: pradeep <pradeep at arrayfire.com>
Date: Tue Aug 4 15:05:28 2015 -0400
Changed default cuda stream to be non-zero
* Added the following API functions, specific to the CUDA backend:
* afcu_get_stream
* afcu_get_native_id
* Removed a duplicate class from the FAST kernel that declared
dynamic shared memory based on the template type
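For reference, a minimal interop sketch using the new public header. This is
hypothetical user code, not part of the commit; af::getDevice() is the standard
ArrayFire call for the active device id:

    #include <arrayfire.h>
    #include <af/cuda.h>
    #include <cuda_runtime.h>

    int main()
    {
        af::array a = af::randu(10);               // some ArrayFire work
        int af_id = af::getDevice();               // ArrayFire's active device id
        // Queue user work on the same (non-default) stream ArrayFire uses,
        // so it is ordered with respect to ArrayFire's kernels.
        cudaStream_t s = afcu::getStream(af_id);
        int native_id = afcu::getNativeId(af_id);  // CUDA runtime device id
        // ... cudaSetDevice(native_id); launch user kernels on `s` ...
        cudaStreamSynchronize(s);
        return 0;
    }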
---
include/af/cuda.h | 51 +++++++++++++++
src/backend/cuda/debug_cuda.hpp | 13 +++-
src/backend/cuda/jit.cpp | 2 +-
src/backend/cuda/kernel/approx.hpp | 8 +--
src/backend/cuda/kernel/assign.hpp | 2 +-
src/backend/cuda/kernel/bilateral.hpp | 5 +-
src/backend/cuda/kernel/diagonal.hpp | 4 +-
src/backend/cuda/kernel/diff.hpp | 4 +-
src/backend/cuda/kernel/exampleFunction.hpp | 11 +++-
src/backend/cuda/kernel/fast.hpp | 92 ++++++---------------------
src/backend/cuda/kernel/fftconvolve.hpp | 24 +++----
src/backend/cuda/kernel/gradient.hpp | 3 +-
src/backend/cuda/kernel/harris.hpp | 26 ++++----
src/backend/cuda/kernel/histogram.hpp | 5 +-
src/backend/cuda/kernel/hsv_rgb.hpp | 2 +-
src/backend/cuda/kernel/identity.hpp | 2 +-
src/backend/cuda/kernel/iir.hpp | 2 +-
src/backend/cuda/kernel/index.hpp | 2 +-
src/backend/cuda/kernel/iota.hpp | 5 +-
src/backend/cuda/kernel/ireduce.hpp | 16 ++---
src/backend/cuda/kernel/join.hpp | 6 +-
src/backend/cuda/kernel/lookup.hpp | 4 +-
src/backend/cuda/kernel/lu_split.hpp | 4 +-
src/backend/cuda/kernel/match_template.hpp | 3 +-
src/backend/cuda/kernel/meanshift.hpp | 6 +-
src/backend/cuda/kernel/medfilt.hpp | 15 +++--
src/backend/cuda/kernel/memcopy.hpp | 11 ++--
src/backend/cuda/kernel/morph.hpp | 28 ++++----
src/backend/cuda/kernel/nearest_neighbour.hpp | 69 ++++++++++----------
src/backend/cuda/kernel/orb.hpp | 22 +++----
src/backend/cuda/kernel/random.hpp | 8 +--
src/backend/cuda/kernel/range.hpp | 2 +-
src/backend/cuda/kernel/reduce.hpp | 16 ++---
src/backend/cuda/kernel/regions.hpp | 10 ++-
src/backend/cuda/kernel/reorder.hpp | 5 +-
src/backend/cuda/kernel/resize.hpp | 3 +-
src/backend/cuda/kernel/rotate.hpp | 4 +-
src/backend/cuda/kernel/scan_dim.hpp | 8 +--
src/backend/cuda/kernel/scan_first.hpp | 11 ++--
src/backend/cuda/kernel/shift.hpp | 5 +-
src/backend/cuda/kernel/sobel.hpp | 4 +-
src/backend/cuda/kernel/susan.hpp | 7 +-
src/backend/cuda/kernel/tile.hpp | 2 +-
src/backend/cuda/kernel/transform.hpp | 8 +--
src/backend/cuda/kernel/transpose_inplace.hpp | 4 +-
src/backend/cuda/kernel/triangle.hpp | 3 +-
src/backend/cuda/kernel/unwrap.hpp | 3 +-
src/backend/cuda/kernel/where.hpp | 3 +-
src/backend/cuda/platform.cpp | 25 +++++++-
src/backend/cuda/platform.hpp | 5 ++
50 files changed, 313 insertions(+), 270 deletions(-)
diff --git a/include/af/cuda.h b/include/af/cuda.h
new file mode 100644
index 0000000..6b178d2
--- /dev/null
+++ b/include/af/cuda.h
@@ -0,0 +1,51 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/defines.h>
+#include <af/exception.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+AFAPI af_err afcu_get_stream(cudaStream_t* stream, int id);
+
+AFAPI af_err afcu_get_native_id(int* nativeid, int id);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+namespace afcu
+{
+
+static inline cudaStream_t getStream(int id)
+{
+ cudaStream_t retVal;
+ af_err err = afcu_get_stream(&retVal, id);
+ if (err!=AF_SUCCESS)
+ throw af::exception("Failed to get CUDA stream from ArrayFire");
+ return retVal;
+}
+
+static inline int getNativeId(int id)
+{
+ int retVal;
+ af_err err = afcu_get_native_id(&retVal, id);
+ if (err!=AF_SUCCESS)
+ throw af::exception("Failed to get CUDA device native id from ArrayFire");
+ return retVal;
+}
+
+}
+#endif
diff --git a/src/backend/cuda/debug_cuda.hpp b/src/backend/cuda/debug_cuda.hpp
index 807d0a2..3ed83d5 100644
--- a/src/backend/cuda/debug_cuda.hpp
+++ b/src/backend/cuda/debug_cuda.hpp
@@ -8,14 +8,21 @@
********************************************************/
#pragma once
+#include <platform.hpp>
#include <err_cuda.hpp>
+#define CUDA_LAUNCH_SMEM(fn, blks, thrds, smem_size, ...) \
+ fn<<<blks, thrds, smem_size, cuda::getStream(cuda::getActiveDeviceId())>>>(__VA_ARGS__)
+
+#define CUDA_LAUNCH(fn, blks, thrds, ...) \
+ CUDA_LAUNCH_SMEM(fn, blks, thrds, 0, __VA_ARGS__)
+
// FIXME: Add a special flag for debug
#ifndef NDEBUG
-#define POST_LAUNCH_CHECK() do { \
- CUDA_CHECK(cudaDeviceSynchronize()); \
- } while(0) \
+#define POST_LAUNCH_CHECK() do { \
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId()))); \
+ } while(0) \
#else
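For context, these macros wrap the usual triple-chevron launch; a sketch of the
expansion (my_kernel is a placeholder, not from this patch):

    // Before this commit: launched on the default (NULL) stream
    my_kernel<T><<<blocks, threads>>>(out, in);

    // After: the same launch, queued on the per-device ArrayFire stream
    CUDA_LAUNCH((my_kernel<T>), blocks, threads, out, in);
    // which expands to
    // my_kernel<T><<<blocks, threads, 0, cuda::getStream(cuda::getActiveDeviceId())>>>(out, in);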
diff --git a/src/backend/cuda/jit.cpp b/src/backend/cuda/jit.cpp
index c432e66..b001fef 100644
--- a/src/backend/cuda/jit.cpp
+++ b/src/backend/cuda/jit.cpp
@@ -485,7 +485,7 @@ void evalNodes(Param<T> &out, Node *node)
threads_y,
1,
0,
- NULL,
+ getStream(getActiveDeviceId()),
&args.front(),
NULL));
}
diff --git a/src/backend/cuda/kernel/approx.hpp b/src/backend/cuda/kernel/approx.hpp
index 1ad7061..6c9dd7d 100644
--- a/src/backend/cuda/kernel/approx.hpp
+++ b/src/backend/cuda/kernel/approx.hpp
@@ -228,8 +228,8 @@ namespace cuda
int blocksPerMat = divup(out.dims[0], threads.x);
dim3 blocks(blocksPerMat * out.dims[1], out.dims[2] * out.dims[3]);
- approx1_kernel<Ty, Tp, method><<<blocks, threads>>>
- (out, in, pos, offGrid, blocksPerMat);
+ CUDA_LAUNCH((approx1_kernel<Ty, Tp, method>), blocks, threads,
+ out, in, pos, offGrid, blocksPerMat);
POST_LAUNCH_CHECK();
}
@@ -242,8 +242,8 @@ namespace cuda
int blocksPerMatY = divup(out.dims[1], threads.y);
dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3]);
- approx2_kernel<Ty, Tp, method><<<blocks, threads>>>
- (out, in, pos, qos, offGrid, blocksPerMatX, blocksPerMatY);
+ CUDA_LAUNCH((approx2_kernel<Ty, Tp, method>), blocks, threads,
+ out, in, pos, qos, offGrid, blocksPerMatX, blocksPerMatY);
POST_LAUNCH_CHECK();
}
}
diff --git a/src/backend/cuda/kernel/assign.hpp b/src/backend/cuda/kernel/assign.hpp
index 7284437..5e792a5 100644
--- a/src/backend/cuda/kernel/assign.hpp
+++ b/src/backend/cuda/kernel/assign.hpp
@@ -77,7 +77,7 @@ void assign(Param<T> out, CParam<T> in, const AssignKernelParam_t& p)
dim3 blocks(blks_x*in.dims[2], blks_y*in.dims[3]);
- AssignKernel<T> <<<blocks, threads>>> (out, in, p, blks_x, blks_y);
+ CUDA_LAUNCH((AssignKernel<T>), blocks, threads, out, in, p, blks_x, blks_y);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/bilateral.hpp b/src/backend/cuda/kernel/bilateral.hpp
index 21740e0..d4e384b 100644
--- a/src/backend/cuda/kernel/bilateral.hpp
+++ b/src/backend/cuda/kernel/bilateral.hpp
@@ -151,9 +151,8 @@ void bilateral(Param<outType> out, CParam<inType> in, float s_sigma, float c_sig
int num_gauss_elems = (2 * radius + 1)*(2 * radius + 1);
int total_shrd_size = sizeof(outType) * (num_shrd_elems + num_gauss_elems);
- bilateralKernel<inType, outType>
- <<<blocks, threads, total_shrd_size>>>
- (out, in, s_sigma, c_sigma, num_shrd_elems, blk_x, blk_y);
+ CUDA_LAUNCH_SMEM((bilateralKernel<inType, outType>), blocks, threads, total_shrd_size,
+ out, in, s_sigma, c_sigma, num_shrd_elems, blk_x, blk_y);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/diagonal.hpp b/src/backend/cuda/kernel/diagonal.hpp
index 45f9c80..88acfe6 100644
--- a/src/backend/cuda/kernel/diagonal.hpp
+++ b/src/backend/cuda/kernel/diagonal.hpp
@@ -48,7 +48,7 @@ namespace kernel
int blocks_y = divup(out.dims[1], threads.y);
dim3 blocks(blocks_x * out.dims[2], blocks_y);
- diagCreateKernel<T> <<<blocks, threads>>> (out, in, num, blocks_x);
+ CUDA_LAUNCH((diagCreateKernel<T>), blocks, threads, out, in, num, blocks_x);
POST_LAUNCH_CHECK();
}
@@ -82,7 +82,7 @@ namespace kernel
int blocks_z = out.dims[2];
dim3 blocks(blocks_x, out.dims[3] * blocks_z);
- diagExtractKernel<T> <<<blocks, threads>>> (out, in, num, blocks_z);
+ CUDA_LAUNCH((diagExtractKernel<T>), blocks, threads, out, in, num, blocks_z);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/diff.hpp b/src/backend/cuda/kernel/diff.hpp
index c5f57d9..e2cfdd7 100644
--- a/src/backend/cuda/kernel/diff.hpp
+++ b/src/backend/cuda/kernel/diff.hpp
@@ -88,8 +88,8 @@ namespace cuda
const int oElem = out.dims[0] * out.dims[1] * out.dims[2] * out.dims[3];
- diff_kernel<T, dim, isDiff2> <<<blocks, threads>>>
- (out, in, oElem, blocksPerMatX, blocksPerMatY);
+ CUDA_LAUNCH((diff_kernel<T, dim, isDiff2>), blocks, threads,
+ out, in, oElem, blocksPerMatX, blocksPerMatY);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/exampleFunction.hpp b/src/backend/cuda/kernel/exampleFunction.hpp
index 0366b78..c9e032b 100644
--- a/src/backend/cuda/kernel/exampleFunction.hpp
+++ b/src/backend/cuda/kernel/exampleFunction.hpp
@@ -52,7 +52,16 @@ void exampleFunc(Param<T> out, CParam<T> in, const af_someenum_t p)
dim3 blocks(blk_x, blk_y); // set your cuda launch config for grid
// launch your kernel
- exampleFuncKernel<T> <<<blocks, threads>>> (out, in, p);
+ // Use the CUDA_LAUNCH macros to launch kernels, to ensure
+ // that the kernel is launched on the appropriate stream
+ //
+ // Use the CUDA_LAUNCH macro for launching kernels that don't use dynamic shared memory
+ //
+ // Use the CUDA_LAUNCH_SMEM macro for launching kernels that use dynamic shared memory
+ //
+ // CUDA_LAUNCH_SMEM takes an additional parameter, the size of the shared memory, after
+ // the threads parameter, which is then followed by the kernel parameters
+ CUDA_LAUNCH((exampleFuncKernel<T>), blocks, threads, out, in, p);
POST_LAUNCH_CHECK(); // Macro for post kernel launch checks
// these checks are carried ONLY IN DEBUG mode
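To illustrate the shared-memory variant: if exampleFuncKernel used dynamic
shared memory, the launch would pass the byte count between the launch
configuration and the kernel arguments (the size shown is a placeholder):

    size_t smem_bytes = threads.x * threads.y * sizeof(T); // dynamic shared memory per block
    CUDA_LAUNCH_SMEM((exampleFuncKernel<T>), blocks, threads, smem_bytes, out, in, p);
    POST_LAUNCH_CHECK();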
diff --git a/src/backend/cuda/kernel/fast.hpp b/src/backend/cuda/kernel/fast.hpp
index bac1ee2..cf2061b 100644
--- a/src/backend/cuda/kernel/fast.hpp
+++ b/src/backend/cuda/kernel/fast.hpp
@@ -13,6 +13,7 @@
#include <debug_cuda.hpp>
#include <kernel/fast_lut.hpp>
#include <memory.hpp>
+#include "shared.hpp"
namespace cuda
{
@@ -117,64 +118,6 @@ inline __device__ double abs_diff(const double x, const double y)
return fabs(x - y);
}
-// non-specialized class template
-//http://www.naic.edu/~phil/hardware/nvidia/doc/src/simpleTemplates/doc/readme.txt
-template <class T>
-class ExtSharedMem
-{
- public:
- // Ensure that we won't compile any un-specialized types
- __device__ T* getPointer() { extern __shared__ float s_float[]; return s_float; };
-};
-
-// specialization for char
-template <>
-class ExtSharedMem <char>
-{
- public:
- __device__ char* getPointer() { extern __shared__ char s_char[]; return s_char; }
-};
-
-// specialization for int
-template <>
-class ExtSharedMem <uchar>
-{
- public:
- __device__ uchar* getPointer() { extern __shared__ uchar s_uchar[]; return s_uchar; }
-};
-
-// specialization for int
-template <>
-class ExtSharedMem <int>
-{
- public:
- __device__ int* getPointer() { extern __shared__ int s_int[]; return s_int; }
-};
-
-// specialization for unsigned
-template <>
-class ExtSharedMem <unsigned>
-{
- public:
- __device__ unsigned* getPointer() { extern __shared__ unsigned s_unsigned[]; return s_unsigned; }
-};
-
-// specialization for float
-template <>
-class ExtSharedMem <float>
-{
- public:
- __device__ float* getPointer() { extern __shared__ float s_float[]; return s_float; }
-};
-
-// specialization for double
-template <>
-class ExtSharedMem <double>
-{
- public:
- __device__ double* getPointer() { extern __shared__ double s_double[]; return s_double; }
-};
-
template<typename T, int arc_length>
__device__
void locate_features_core(
@@ -276,7 +219,7 @@ void locate_features(
unsigned lx = bx / 2 + 3;
unsigned ly = by / 2 + 3;
- ExtSharedMem<T> shared;
+ SharedMemory<T> shared;
T* local_image_curr = shared.getPointer();
load_shared_image(in, local_image_curr, ix, iy, bx, by, x, y, lx, ly, edge);
__syncthreads();
@@ -451,28 +394,28 @@ void fast(unsigned* out_feat,
switch(arc_length) {
case 9:
- locate_features<T, 9><<<blocks, threads, shared_size>>>(in, d_score, thr, edge);
+ CUDA_LAUNCH_SMEM((locate_features<T, 9>), blocks, threads, shared_size, in, d_score, thr, edge);
break;
case 10:
- locate_features<T,10><<<blocks, threads, shared_size>>>(in, d_score, thr, edge);
+ CUDA_LAUNCH_SMEM((locate_features<T,10>), blocks, threads, shared_size, in, d_score, thr, edge);
break;
case 11:
- locate_features<T,11><<<blocks, threads, shared_size>>>(in, d_score, thr, edge);
+ CUDA_LAUNCH_SMEM((locate_features<T,11>), blocks, threads, shared_size, in, d_score, thr, edge);
break;
case 12:
- locate_features<T,12><<<blocks, threads, shared_size>>>(in, d_score, thr, edge);
+ CUDA_LAUNCH_SMEM((locate_features<T,12>), blocks, threads, shared_size, in, d_score, thr, edge);
break;
case 13:
- locate_features<T,13><<<blocks, threads, shared_size>>>(in, d_score, thr, edge);
+ CUDA_LAUNCH_SMEM((locate_features<T,13>), blocks, threads, shared_size, in, d_score, thr, edge);
break;
case 14:
- locate_features<T,14><<<blocks, threads, shared_size>>>(in, d_score, thr, edge);
+ CUDA_LAUNCH_SMEM((locate_features<T,14>), blocks, threads, shared_size, in, d_score, thr, edge);
break;
case 15:
- locate_features<T,15><<<blocks, threads, shared_size>>>(in, d_score, thr, edge);
+ CUDA_LAUNCH_SMEM((locate_features<T,15>), blocks, threads, shared_size, in, d_score, thr, edge);
break;
case 16:
- locate_features<T,16><<<blocks, threads, shared_size>>>(in, d_score, thr, edge);
+ CUDA_LAUNCH_SMEM((locate_features<T,16>), blocks, threads, shared_size, in, d_score, thr, edge);
break;
}
@@ -490,11 +433,13 @@ void fast(unsigned* out_feat,
unsigned *d_offsets = memAlloc<unsigned>(blocks.x * blocks.y);
if (nonmax)
- non_max_counts<true ><<<blocks, threads>>>(d_counts, d_offsets, d_total, d_flags,
- d_score, in.dims[0], in.dims[1], edge);
+ CUDA_LAUNCH((non_max_counts<true >), blocks, threads,
+ d_counts, d_offsets, d_total, d_flags,
+ d_score, in.dims[0], in.dims[1], edge);
else
- non_max_counts<false><<<blocks, threads>>>(d_counts, d_offsets, d_total, d_flags,
- d_score, in.dims[0], in.dims[1], edge);
+ CUDA_LAUNCH((non_max_counts<false>), blocks, threads,
+ d_counts, d_offsets, d_total, d_flags,
+ d_score, in.dims[0], in.dims[1], edge);
POST_LAUNCH_CHECK();
@@ -508,8 +453,9 @@ void fast(unsigned* out_feat,
*y_out = memAlloc<float>(total);
*score_out = memAlloc<float>(total);
- get_features<float><<<blocks, threads>>>(*x_out, *y_out, *score_out, d_flags, d_counts,
- d_offsets, total, in.dims[0], in.dims[1], edge);
+ CUDA_LAUNCH((get_features<float>), blocks, threads,
+ *x_out, *y_out, *score_out, d_flags, d_counts,
+ d_offsets, total, in.dims[0], in.dims[1], edge);
POST_LAUNCH_CHECK();
}
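shared.hpp itself is not shown in this commit; presumably it centralizes the
same extern-__shared__ idiom that the deleted ExtSharedMem class implemented. A
sketch of such a helper, modeled on the removed code (an assumption about
shared.hpp, not its actual contents):

    // Generic template; only the specializations are meant to be instantiated,
    // since extern __shared__ arrays of different types cannot share one name.
    template<typename T>
    struct SharedMemory
    {
        __device__ T* getPointer();
    };

    template<>
    struct SharedMemory<float>
    {
        __device__ float* getPointer()
        {
            extern __shared__ float s_float[];
            return s_float;
        }
    };

    // ... analogous specializations for char, uchar, int, unsigned and double ...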
diff --git a/src/backend/cuda/kernel/fftconvolve.hpp b/src/backend/cuda/kernel/fftconvolve.hpp
index 774fe80..2acf2db 100644
--- a/src/backend/cuda/kernel/fftconvolve.hpp
+++ b/src/backend/cuda/kernel/fftconvolve.hpp
@@ -274,13 +274,13 @@ void packDataHelper(Param<convT> sig_packed,
// Pack signal in a complex matrix where first dimension is half the input
// (allows faster FFT computation) and pad array to a power of 2 with 0s
- packData<convT, T><<<blocks, threads>>>(sig_packed, sig, sig_half_d0, sig_half_d0_odd);
+ CUDA_LAUNCH((packData<convT, T>), blocks, threads, sig_packed, sig, sig_half_d0, sig_half_d0_odd);
POST_LAUNCH_CHECK();
blocks = dim3(divup(filter_packed_elem, threads.x));
// Pad filter array with 0s
- padArray<convT, T><<<blocks, threads>>>(filter_packed, filter);
+ CUDA_LAUNCH((padArray<convT, T>), blocks, threads, filter_packed, filter);
POST_LAUNCH_CHECK();
}
@@ -305,20 +305,20 @@ void complexMultiplyHelper(Param<T> out,
// Multiply filter and signal FFT arrays
switch(kind) {
case CONVOLVE_BATCH_NONE:
- complexMultiply<convT, CONVOLVE_BATCH_NONE ><<<blocks, threads>>>
- (sig_packed, sig_packed, filter_packed, mul_elem);
+ CUDA_LAUNCH((complexMultiply<convT, CONVOLVE_BATCH_NONE>), blocks, threads,
+ sig_packed, sig_packed, filter_packed, mul_elem);
break;
case CONVOLVE_BATCH_SIGNAL:
- complexMultiply<convT, CONVOLVE_BATCH_SIGNAL ><<<blocks, threads>>>
- (sig_packed, sig_packed, filter_packed, mul_elem);
+ CUDA_LAUNCH((complexMultiply<convT, CONVOLVE_BATCH_SIGNAL>), blocks, threads,
+ sig_packed, sig_packed, filter_packed, mul_elem);
break;
case CONVOLVE_BATCH_KERNEL:
- complexMultiply<convT, CONVOLVE_BATCH_KERNEL ><<<blocks, threads>>>
- (filter_packed, sig_packed, filter_packed, mul_elem);
+ CUDA_LAUNCH((complexMultiply<convT, CONVOLVE_BATCH_KERNEL>), blocks, threads,
+ filter_packed, sig_packed, filter_packed, mul_elem);
break;
case CONVOLVE_BATCH_SAME:
- complexMultiply<convT, CONVOLVE_BATCH_SAME><<<blocks, threads>>>
- (sig_packed, sig_packed, filter_packed, mul_elem);
+ CUDA_LAUNCH((complexMultiply<convT, CONVOLVE_BATCH_SAME>), blocks, threads,
+ sig_packed, sig_packed, filter_packed, mul_elem);
break;
case CONVOLVE_BATCH_UNSUPPORTED:
default:
@@ -347,8 +347,8 @@ void reorderOutputHelper(Param<T> out,
dim3 threads(THREADS);
dim3 blocks(divup(out.strides[3] * out.dims[3], threads.x));
- reorderOutput<T, convT, expand, roundOut><<<blocks, threads>>>
- (out, packed, filter, sig_half_d0, baseDim, fftScale);
+ CUDA_LAUNCH((reorderOutput<T, convT, expand, roundOut>), blocks, threads,
+ out, packed, filter, sig_half_d0, baseDim, fftScale);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/gradient.hpp b/src/backend/cuda/kernel/gradient.hpp
index 6d44534..0dd1dab 100644
--- a/src/backend/cuda/kernel/gradient.hpp
+++ b/src/backend/cuda/kernel/gradient.hpp
@@ -107,7 +107,8 @@ namespace cuda
blocksPerMatY * in.dims[3],
1);
- gradient_kernel<T><<<blocks, threads>>>(grad0, grad1, in, blocksPerMatX, blocksPerMatY);
+ CUDA_LAUNCH((gradient_kernel<T>), blocks, threads,
+ grad0, grad1, in, blocksPerMatX, blocksPerMatY);
POST_LAUNCH_CHECK();
}
}
diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp
index cae58ad..11f628b 100644
--- a/src/backend/cuda/kernel/harris.hpp
+++ b/src/backend/cuda/kernel/harris.hpp
@@ -248,8 +248,9 @@ void harris(unsigned* corners_out,
// Compute second-order derivatives
dim3 threads(THREADS_PER_BLOCK, 1);
dim3 blocks(divup(in.dims[3] * in.strides[3], threads.x), 1);
- second_order_deriv<T><<<blocks, threads>>>(ixx.ptr, ixy.ptr, iyy.ptr,
- in.dims[3] * in.strides[3], ix.ptr, iy.ptr);
+ CUDA_LAUNCH((second_order_deriv<T>), blocks, threads,
+ ixx.ptr, ixy.ptr, iyy.ptr,
+ in.dims[3] * in.strides[3], ix.ptr, iy.ptr);
memFree(ix.ptr);
memFree(iy.ptr);
@@ -287,10 +288,9 @@ void harris(unsigned* corners_out,
threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
blocks = dim3(divup(in.dims[1] - border_len*2, threads.x),
divup(in.dims[0] - border_len*2, threads.y));
- harris_responses<T><<<blocks, threads>>>(d_responses,
- in.dims[0], in.dims[1],
- ixx.ptr, ixy.ptr, iyy.ptr,
- k_thr, border_len);
+ CUDA_LAUNCH((harris_responses<T>), blocks, threads,
+ d_responses, in.dims[0], in.dims[1],
+ ixx.ptr, ixy.ptr, iyy.ptr, k_thr, border_len);
memFree(ixx.ptr);
memFree(ixy.ptr);
@@ -299,10 +299,9 @@ void harris(unsigned* corners_out,
const float min_r = (max_corners > 0) ? 0.f : min_response;
// Perform non-maximal suppression
- non_maximal<T><<<blocks, threads>>>(d_x_corners, d_y_corners,
- d_resp_corners, d_corners_found,
- in.dims[0], in.dims[1], d_responses,
- min_r, border_len, corner_lim);
+ CUDA_LAUNCH((non_maximal<T>), blocks, threads,
+ d_x_corners, d_y_corners, d_resp_corners, d_corners_found,
+ in.dims[0], in.dims[1], d_responses, min_r, border_len, corner_lim);
unsigned corners_found = 0;
CUDA_CHECK(cudaMemcpy(&corners_found, d_corners_found, sizeof(unsigned), cudaMemcpyDeviceToHost));
@@ -346,10 +345,9 @@ void harris(unsigned* corners_out,
// responses
threads = dim3(THREADS_PER_BLOCK, 1);
blocks = dim3(divup(*corners_out, threads.x), 1);
- keep_corners<<<blocks, threads>>>(*x_out, *y_out, *resp_out,
- d_x_corners, d_y_corners,
- harris_responses.ptr, harris_idx.ptr,
- *corners_out);
+ CUDA_LAUNCH(keep_corners, blocks, threads,
+ *x_out, *y_out, *resp_out, d_x_corners, d_y_corners,
+ harris_responses.ptr, harris_idx.ptr, *corners_out);
memFree(d_x_corners);
memFree(d_y_corners);
diff --git a/src/backend/cuda/kernel/histogram.hpp b/src/backend/cuda/kernel/histogram.hpp
index 6926fd0..32bee36 100644
--- a/src/backend/cuda/kernel/histogram.hpp
+++ b/src/backend/cuda/kernel/histogram.hpp
@@ -86,9 +86,8 @@ void histogram(Param<outType> out, CParam<inType> in, cfloat *d_minmax, int nbin
int smem_size = nbins * sizeof(outType);
- histogramKernel<inType, outType>
- <<<blocks, threads, smem_size>>>
- (out, in, d_minmax, nElems, nbins, blk_x);
+ CUDA_LAUNCH_SMEM((histogramKernel<inType, outType>), blocks, threads, smem_size,
+ out, in, d_minmax, nElems, nbins, blk_x);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/hsv_rgb.hpp b/src/backend/cuda/kernel/hsv_rgb.hpp
index d9a54d3..8e8dd04 100644
--- a/src/backend/cuda/kernel/hsv_rgb.hpp
+++ b/src/backend/cuda/kernel/hsv_rgb.hpp
@@ -106,7 +106,7 @@ void hsv2rgb_convert(Param<T> out, CParam<T> in)
// parameter would be along 4th dimension
dim3 blocks(blk_x*in.dims[3], blk_y);
- convert<T, isHSV2RGB> <<<blocks, threads>>> (out, in, blk_x);
+ CUDA_LAUNCH((convert<T, isHSV2RGB>), blocks, threads, out, in, blk_x);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/identity.hpp b/src/backend/cuda/kernel/identity.hpp
index 670eba7..ac97670 100644
--- a/src/backend/cuda/kernel/identity.hpp
+++ b/src/backend/cuda/kernel/identity.hpp
@@ -51,7 +51,7 @@ namespace kernel
int blocks_y = divup(out.dims[1], threads.y);
dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);
- identity_kernel<T> <<<blocks, threads>>> (out, blocks_x, blocks_y);
+ CUDA_LAUNCH((identity_kernel<T>), blocks, threads, out, blocks_x, blocks_y);
POST_LAUNCH_CHECK();
}
}
diff --git a/src/backend/cuda/kernel/iir.hpp b/src/backend/cuda/kernel/iir.hpp
index 9c21057..1916ea3 100644
--- a/src/backend/cuda/kernel/iir.hpp
+++ b/src/backend/cuda/kernel/iir.hpp
@@ -89,7 +89,7 @@ namespace cuda
int threads = 256;
while (threads > y.dims[0] && threads > 32) threads /= 2;
- (iir_kernel<T, batch_a>)<<<blocks, threads>>>(y, c, a, blocks_y);
+ CUDA_LAUNCH((iir_kernel<T, batch_a>), blocks, threads, y, c, a, blocks_y);
}
}
diff --git a/src/backend/cuda/kernel/index.hpp b/src/backend/cuda/kernel/index.hpp
index 11c655c..66e8d18 100644
--- a/src/backend/cuda/kernel/index.hpp
+++ b/src/backend/cuda/kernel/index.hpp
@@ -77,7 +77,7 @@ void index(Param<T> out, CParam<T> in, const IndexKernelParam_t& p)
dim3 blocks(blks_x*out.dims[2], blks_y*out.dims[3]);
- indexKernel<T> <<<blocks, threads>>> (out, in, p, blks_x, blks_y);
+ CUDA_LAUNCH((indexKernel<T>), blocks, threads, out, in, p, blks_x, blks_y);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/iota.hpp b/src/backend/cuda/kernel/iota.hpp
index c29c7c5..2632266 100644
--- a/src/backend/cuda/kernel/iota.hpp
+++ b/src/backend/cuda/kernel/iota.hpp
@@ -79,8 +79,9 @@ namespace cuda
blocksPerMatY * out.dims[3],
1);
- iota_kernel<T><<<blocks, threads>>>(out, sdims[0], sdims[1], sdims[2], sdims[3],
- tdims[0], tdims[1], tdims[2], tdims[3], blocksPerMatX, blocksPerMatY);
+ CUDA_LAUNCH((iota_kernel<T>), blocks, threads,
+ out, sdims[0], sdims[1], sdims[2], sdims[3],
+ tdims[0], tdims[1], tdims[2], tdims[3], blocksPerMatX, blocksPerMatY);
POST_LAUNCH_CHECK();
}
}
diff --git a/src/backend/cuda/kernel/ireduce.hpp b/src/backend/cuda/kernel/ireduce.hpp
index 8c0dff1..4f9cac5 100644
--- a/src/backend/cuda/kernel/ireduce.hpp
+++ b/src/backend/cuda/kernel/ireduce.hpp
@@ -199,16 +199,16 @@ namespace kernel
switch (threads_y) {
case 8:
- (ireduce_dim_kernel<T, op, dim, is_first, 8>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((ireduce_dim_kernel<T, op, dim, is_first, 8>), blocks, threads,
out, olptr, in, ilptr, blocks_dim[0], blocks_dim[1], blocks_dim[dim]); break;
case 4:
- (ireduce_dim_kernel<T, op, dim, is_first, 4>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((ireduce_dim_kernel<T, op, dim, is_first, 4>), blocks, threads,
out, olptr, in, ilptr, blocks_dim[0], blocks_dim[1], blocks_dim[dim]); break;
case 2:
- (ireduce_dim_kernel<T, op, dim, is_first, 2>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((ireduce_dim_kernel<T, op, dim, is_first, 2>), blocks, threads,
out, olptr, in, ilptr, blocks_dim[0], blocks_dim[1], blocks_dim[dim]); break;
case 1:
- (ireduce_dim_kernel<T, op, dim, is_first, 1>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((ireduce_dim_kernel<T, op, dim, is_first, 1>), blocks, threads,
out, olptr, in, ilptr, blocks_dim[0], blocks_dim[1], blocks_dim[dim]); break;
}
@@ -377,16 +377,16 @@ namespace kernel
switch (threads_x) {
case 32:
- (ireduce_first_kernel<T, op, is_first, 32>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((ireduce_first_kernel<T, op, is_first, 32>), blocks, threads,
out, olptr, in, ilptr, blocks_x, blocks_y, repeat); break;
case 64:
- (ireduce_first_kernel<T, op, is_first, 64>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((ireduce_first_kernel<T, op, is_first, 64>), blocks, threads,
out, olptr, in, ilptr, blocks_x, blocks_y, repeat); break;
case 128:
- (ireduce_first_kernel<T, op, is_first, 128>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((ireduce_first_kernel<T, op, is_first, 128>), blocks, threads,
out, olptr, in, ilptr, blocks_x, blocks_y, repeat); break;
case 256:
- (ireduce_first_kernel<T, op, is_first, 256>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((ireduce_first_kernel<T, op, is_first, 256>), blocks, threads,
out, olptr, in, ilptr, blocks_x, blocks_y, repeat); break;
}
diff --git a/src/backend/cuda/kernel/join.hpp b/src/backend/cuda/kernel/join.hpp
index 2bf68aa..c601537 100644
--- a/src/backend/cuda/kernel/join.hpp
+++ b/src/backend/cuda/kernel/join.hpp
@@ -73,9 +73,9 @@ namespace cuda
blocksPerMatY * X.dims[3],
1);
- join_kernel<To, Tx, dim><<<blocks, threads>>>
- (out, X, offset[0], offset[1], offset[2], offset[3],
- blocksPerMatX, blocksPerMatY);
+ CUDA_LAUNCH((join_kernel<To, Tx, dim>), blocks, threads,
+ out, X, offset[0], offset[1], offset[2], offset[3],
+ blocksPerMatX, blocksPerMatY);
POST_LAUNCH_CHECK();
}
}
diff --git a/src/backend/cuda/kernel/lookup.hpp b/src/backend/cuda/kernel/lookup.hpp
index 1b23ff5..3936c8d 100644
--- a/src/backend/cuda/kernel/lookup.hpp
+++ b/src/backend/cuda/kernel/lookup.hpp
@@ -95,7 +95,7 @@ void lookup(Param<in_t> out, CParam<in_t> in, CParam<idx_t> indices, int nDims)
dim3 blocks(blks, 1);
- lookup1D<in_t, idx_t> <<<blocks, threads>>> (out, in, indices, vDim);
+ CUDA_LAUNCH((lookup1D<in_t, idx_t>), blocks, threads, out, in, indices, vDim);
} else {
const dim3 threads(THREADS_X, THREADS_Y);
@@ -104,7 +104,7 @@ void lookup(Param<in_t> out, CParam<in_t> in, CParam<idx_t> indices, int nDims)
dim3 blocks(blks_x*out.dims[2], blks_y*out.dims[3]);
- lookupND<in_t, idx_t, dim> <<<blocks, threads>>> (out, in, indices, blks_x, blks_y);
+ CUDA_LAUNCH((lookupND<in_t, idx_t, dim>), blocks, threads, out, in, indices, blks_x, blks_y);
}
POST_LAUNCH_CHECK();
diff --git a/src/backend/cuda/kernel/lu_split.hpp b/src/backend/cuda/kernel/lu_split.hpp
index 33182ea..1c9a5cc 100644
--- a/src/backend/cuda/kernel/lu_split.hpp
+++ b/src/backend/cuda/kernel/lu_split.hpp
@@ -90,9 +90,9 @@ namespace cuda
1);
if(lower.dims[0] == in.dims[0] && lower.dims[1] == in.dims[1]) {
- lu_split_kernel<T, true><<<blocks, threads>>>(lower, upper, in, blocksPerMatX, blocksPerMatY);
+ CUDA_LAUNCH((lu_split_kernel<T, true>), blocks, threads, lower, upper, in, blocksPerMatX, blocksPerMatY);
} else {
- lu_split_kernel<T, false><<<blocks, threads>>>(lower, upper, in, blocksPerMatX, blocksPerMatY);
+ CUDA_LAUNCH((lu_split_kernel<T, false>), blocks, threads, lower, upper, in, blocksPerMatX, blocksPerMatY);
}
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/match_template.hpp b/src/backend/cuda/kernel/match_template.hpp
index e86c398..675ef6c 100644
--- a/src/backend/cuda/kernel/match_template.hpp
+++ b/src/backend/cuda/kernel/match_template.hpp
@@ -135,7 +135,8 @@ void matchTemplate(Param<outType> out, CParam<inType> srch, CParam<inType> tmplt
dim3 blocks(blk_x*srch.dims[2], blk_y*srch.dims[3]);
- matchTemplate<inType, outType, mType, needMean> <<< blocks, threads >>> (out, srch, tmplt, blk_x, blk_y);
+ CUDA_LAUNCH((matchTemplate<inType, outType, mType, needMean>), blocks, threads,
+ out, srch, tmplt, blk_x, blk_y);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/meanshift.hpp b/src/backend/cuda/kernel/meanshift.hpp
index 11ff10d..dc8c096 100644
--- a/src/backend/cuda/kernel/meanshift.hpp
+++ b/src/backend/cuda/kernel/meanshift.hpp
@@ -212,9 +212,11 @@ void meanshift(Param<T> out, CParam<T> in, float s_sigma, float c_sigma, uint it
size_t shrd_size = channels*(threads.x + padding)*(threads.y+padding)*sizeof(T);
if (is_color)
- (meanshiftKernel<T, 3>) <<<blocks, threads, shrd_size>>>(out, in, space_, radius, cvar, iter, blk_x, blk_y);
+ CUDA_LAUNCH_SMEM((meanshiftKernel<T, 3>), blocks, threads, shrd_size,
+ out, in, space_, radius, cvar, iter, blk_x, blk_y);
else
- (meanshiftKernel<T, 1>) <<<blocks, threads, shrd_size>>>(out, in, space_, radius, cvar, iter, blk_x, blk_y);
+ CUDA_LAUNCH_SMEM((meanshiftKernel<T, 1>), blocks, threads, shrd_size,
+ out, in, space_, radius, cvar, iter, blk_x, blk_y);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/medfilt.hpp b/src/backend/cuda/kernel/medfilt.hpp
index 16665a4..37aa967 100644
--- a/src/backend/cuda/kernel/medfilt.hpp
+++ b/src/backend/cuda/kernel/medfilt.hpp
@@ -8,6 +8,7 @@
********************************************************/
#include <af/defines.h>
+#include <platform.hpp>
#include <backend.hpp>
#include <dispatch.hpp>
#include <Param.hpp>
@@ -212,13 +213,13 @@ void medfilt(Param<T> out, CParam<T> in, int w_len, int w_wid)
dim3 blocks(blk_x*in.dims[2], blk_y*in.dims[3]);
switch(w_len) {
- case 3: (medfilt<T, pad, 3, 3>)<<<blocks, threads>>>(out, in, blk_x, blk_y); break;
- case 5: (medfilt<T, pad, 5, 5>)<<<blocks, threads>>>(out, in, blk_x, blk_y); break;
- case 7: (medfilt<T, pad, 7, 7>)<<<blocks, threads>>>(out, in, blk_x, blk_y); break;
- case 9: (medfilt<T, pad, 9, 9>)<<<blocks, threads>>>(out, in, blk_x, blk_y); break;
- case 11: (medfilt<T, pad, 11, 11>)<<<blocks, threads>>>(out, in, blk_x, blk_y); break;
- case 13: (medfilt<T, pad, 13, 13>)<<<blocks, threads>>>(out, in, blk_x, blk_y); break;
- case 15: (medfilt<T, pad, 15, 15>)<<<blocks, threads>>>(out, in, blk_x, blk_y); break;
+ case 3: CUDA_LAUNCH((medfilt<T,pad, 3, 3>), blocks, threads, out, in, blk_x, blk_y); break;
+ case 5: CUDA_LAUNCH((medfilt<T,pad, 5, 5>), blocks, threads, out, in, blk_x, blk_y); break;
+ case 7: CUDA_LAUNCH((medfilt<T,pad, 7, 7>), blocks, threads, out, in, blk_x, blk_y); break;
+ case 9: CUDA_LAUNCH((medfilt<T,pad, 9, 9>), blocks, threads, out, in, blk_x, blk_y); break;
+ case 11: CUDA_LAUNCH((medfilt<T,pad,11,11>), blocks, threads, out, in, blk_x, blk_y); break;
+ case 13: CUDA_LAUNCH((medfilt<T,pad,13,13>), blocks, threads, out, in, blk_x, blk_y); break;
+ case 15: CUDA_LAUNCH((medfilt<T,pad,15,15>), blocks, threads, out, in, blk_x, blk_y); break;
}
POST_LAUNCH_CHECK();
diff --git a/src/backend/cuda/kernel/memcopy.hpp b/src/backend/cuda/kernel/memcopy.hpp
index 96f7375..2246419 100644
--- a/src/backend/cuda/kernel/memcopy.hpp
+++ b/src/backend/cuda/kernel/memcopy.hpp
@@ -80,9 +80,8 @@ namespace kernel
dims_t _istrides = {{istrides[0], istrides[1], istrides[2], istrides[3]}};
dims_t _idims = {{idims[0], idims[1], idims[2], idims[3]}};
- (memcopy_kernel<T>)<<<blocks, threads>>>(out, _ostrides,
- in, _idims, _istrides,
- blocks_x, blocks_y);
+ CUDA_LAUNCH((memcopy_kernel<T>), blocks, threads,
+ out, _ostrides, in, _idims, _istrides, blocks_x, blocks_y);
POST_LAUNCH_CHECK();
}
@@ -211,9 +210,11 @@ namespace kernel
(src.dims[3]==dst.dims[3]) );
if (same_dims)
- (copy_kernel<inType, outType, true >)<<<blocks, threads>>>(dst, src, default_value, factor, trgt_dims, blk_x, blk_y);
+ CUDA_LAUNCH((copy_kernel<inType, outType, true >), blocks, threads,
+ dst, src, default_value, factor, trgt_dims, blk_x, blk_y);
else
- (copy_kernel<inType, outType, false>)<<<blocks, threads>>>(dst, src, default_value, factor, trgt_dims, blk_x, blk_y);
+ CUDA_LAUNCH((copy_kernel<inType, outType, false>), blocks, threads,
+ dst, src, default_value, factor, trgt_dims, blk_x, blk_y);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/morph.hpp b/src/backend/cuda/kernel/morph.hpp
index f7d29aa..30bee48 100644
--- a/src/backend/cuda/kernel/morph.hpp
+++ b/src/backend/cuda/kernel/morph.hpp
@@ -311,16 +311,16 @@ void morph(Param<T> out, CParam<T> in, int windLen)
int shrdSize = shrdLen * (kernel::THREADS_Y + padding) * sizeof(T);
switch(windLen) {
- case 3: morphKernel<T, isDilation, 3> <<< blocks, threads, shrdSize>>>(out, in, blk_x, blk_y); break;
- case 5: morphKernel<T, isDilation, 5> <<< blocks, threads, shrdSize>>>(out, in, blk_x, blk_y); break;
- case 7: morphKernel<T, isDilation, 7> <<< blocks, threads, shrdSize>>>(out, in, blk_x, blk_y); break;
- case 9: morphKernel<T, isDilation, 9> <<< blocks, threads, shrdSize>>>(out, in, blk_x, blk_y); break;
- case 11: morphKernel<T, isDilation,11> <<< blocks, threads, shrdSize>>>(out, in, blk_x, blk_y); break;
- case 13: morphKernel<T, isDilation,13> <<< blocks, threads, shrdSize>>>(out, in, blk_x, blk_y); break;
- case 15: morphKernel<T, isDilation,15> <<< blocks, threads, shrdSize>>>(out, in, blk_x, blk_y); break;
- case 17: morphKernel<T, isDilation,17> <<< blocks, threads, shrdSize>>>(out, in, blk_x, blk_y); break;
- case 19: morphKernel<T, isDilation,19> <<< blocks, threads, shrdSize>>>(out, in, blk_x, blk_y); break;
- default: morphKernel<T, isDilation, 3> <<< blocks, threads, shrdSize>>>(out, in, blk_x, blk_y); break;
+ case 3: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 3>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
+ case 5: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 5>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
+ case 7: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 7>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
+ case 9: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 9>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
+ case 11: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,11>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
+ case 13: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,13>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
+ case 15: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,15>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
+ case 17: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,17>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
+ case 19: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,19>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
+ default: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 3>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
}
POST_LAUNCH_CHECK();
@@ -343,10 +343,10 @@ void morph3d(Param<T> out, CParam<T> in, int windLen)
int shrdSize = shrdLen * (kernel::CUBE_Y + padding) * (kernel::CUBE_Z + padding) * sizeof(T);
switch(windLen) {
- case 3: morph3DKernel<T, isDilation, 3> <<< blocks, threads, shrdSize>>>(out, in, blk_x); break;
- case 5: morph3DKernel<T, isDilation, 5> <<< blocks, threads, shrdSize>>>(out, in, blk_x); break;
- case 7: morph3DKernel<T, isDilation, 7> <<< blocks, threads, shrdSize>>>(out, in, blk_x); break;
- default: morph3DKernel<T, isDilation, 3> <<< blocks, threads, shrdSize>>>(out, in, blk_x); break;
+ case 3: CUDA_LAUNCH_SMEM((morph3DKernel<T, isDilation, 3>), blocks, threads, shrdSize, out, in, blk_x); break;
+ case 5: CUDA_LAUNCH_SMEM((morph3DKernel<T, isDilation, 5>), blocks, threads, shrdSize, out, in, blk_x); break;
+ case 7: CUDA_LAUNCH_SMEM((morph3DKernel<T, isDilation, 7>), blocks, threads, shrdSize, out, in, blk_x); break;
+ default: CUDA_LAUNCH_SMEM((morph3DKernel<T, isDilation, 3>), blocks, threads, shrdSize, out, in, blk_x); break;
}
POST_LAUNCH_CHECK();
diff --git a/src/backend/cuda/kernel/nearest_neighbour.hpp b/src/backend/cuda/kernel/nearest_neighbour.hpp
index f6baccd..14c448f 100644
--- a/src/backend/cuda/kernel/nearest_neighbour.hpp
+++ b/src/backend/cuda/kernel/nearest_neighbour.hpp
@@ -450,72 +450,72 @@ void nearest_neighbour(Param<uint> idx,
switch(feat_len) {
// Optimized lengths (faster due to loop unrolling)
case 1:
- nearest_neighbour_unroll<T,To,dist_type,1,true><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,1,true>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 2:
- nearest_neighbour_unroll<T,To,dist_type,2,true><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,2,true>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 4:
- nearest_neighbour_unroll<T,To,dist_type,4,true><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,4,true>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 8:
- nearest_neighbour_unroll<T,To,dist_type,8,true><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,8,true>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 16:
- nearest_neighbour_unroll<T,To,dist_type,16,true><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,true>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 32:
- nearest_neighbour_unroll<T,To,dist_type,32,true><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,true>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 64:
- nearest_neighbour_unroll<T,To,dist_type,64,true><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,true>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
default:
- nearest_neighbour<T,To,dist_type,true><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist, feat_len);
+ CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,true>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist, feat_len);
}
}
else {
switch(feat_len) {
// Optimized lengths (faster due to loop unrolling)
case 1:
- nearest_neighbour_unroll<T,To,dist_type,1,false><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,1,false>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 2:
- nearest_neighbour_unroll<T,To,dist_type,2,false><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,2,false>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 4:
- nearest_neighbour_unroll<T,To,dist_type,4,false><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,4,false>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 8:
- nearest_neighbour_unroll<T,To,dist_type,8,false><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,8,false>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 16:
- nearest_neighbour_unroll<T,To,dist_type,16,false><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,false>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 32:
- nearest_neighbour_unroll<T,To,dist_type,32,false><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,false>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 64:
- nearest_neighbour_unroll<T,To,dist_type,64,false><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist);
+ CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,false>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist);
break;
default:
- nearest_neighbour<T,To,dist_type,false><<<blocks, threads, smem_sz>>>
- (d_blk_idx, d_blk_dist, query, train, max_dist, feat_len);
+ CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,false>), blocks, threads, smem_sz,
+ d_blk_idx, d_blk_dist, query, train, max_dist, feat_len);
}
}
POST_LAUNCH_CHECK();
@@ -525,9 +525,8 @@ void nearest_neighbour(Param<uint> idx,
// Reduce all smallest Hamming distances from each block and store final
// best match
- select_matches<<<blocks, threads>>>(idx, dist,
- d_blk_idx, d_blk_dist,
- nquery, nblk, max_dist);
+ CUDA_LAUNCH(select_matches, blocks, threads,
+ idx, dist, d_blk_idx, d_blk_dist, nquery, nblk, max_dist);
POST_LAUNCH_CHECK();
memFree(d_blk_idx);
diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp
index e0ce695..ad99fcb 100644
--- a/src/backend/cuda/kernel/orb.hpp
+++ b/src/backend/cuda/kernel/orb.hpp
@@ -370,10 +370,8 @@ void orb(unsigned* out_feat,
// Good block_size >= 7 (must be an odd number)
dim3 threads(THREADS_X, THREADS_Y);
dim3 blocks(divup(feat_pyr[i], threads.x), 1);
- harris_response<T,false><<<blocks, threads>>>(d_score_harris, NULL,
- d_x_pyr[i], d_y_pyr[i], NULL,
- feat_pyr[i],
- img_pyr[i], 7, 0.04f, patch_size);
+ CUDA_LAUNCH((harris_response<T,false>), blocks, threads,
+ d_score_harris, NULL, d_x_pyr[i], d_y_pyr[i], NULL, feat_pyr[i], img_pyr[i], 7, 0.04f, patch_size);
POST_LAUNCH_CHECK();
Param<float> harris_sorted;
@@ -405,9 +403,9 @@ void orb(unsigned* out_feat,
// Keep only features with higher Harris responses
threads = dim3(THREADS, 1);
blocks = dim3(divup(feat_pyr[i], threads.x), 1);
- keep_features<T><<<blocks, threads>>>(d_x_lvl, d_y_lvl, d_score_lvl, NULL,
- d_x_pyr[i], d_y_pyr[i], harris_sorted.ptr, harris_idx.ptr,
- NULL, feat_pyr[i]);
+ CUDA_LAUNCH((keep_features<T>), blocks, threads,
+ d_x_lvl, d_y_lvl, d_score_lvl, NULL,
+ d_x_pyr[i], d_y_pyr[i], harris_sorted.ptr, harris_idx.ptr, NULL, feat_pyr[i]);
POST_LAUNCH_CHECK();
memFree(d_x_pyr[i]);
@@ -420,8 +418,8 @@ void orb(unsigned* out_feat,
// Compute orientation of features
threads = dim3(THREADS_X, THREADS_Y);
blocks = dim3(divup(feat_pyr[i], threads.x), 1);
- centroid_angle<T><<<blocks, threads>>>(d_x_lvl, d_y_lvl, d_ori_lvl, feat_pyr[i],
- img_pyr[i], patch_size);
+ CUDA_LAUNCH((centroid_angle<T>), blocks, threads,
+ d_x_lvl, d_y_lvl, d_ori_lvl, feat_pyr[i], img_pyr[i], patch_size);
POST_LAUNCH_CHECK();
Param<T> lvl_tmp;
@@ -462,9 +460,9 @@ void orb(unsigned* out_feat,
// Compute ORB descriptors
threads = dim3(THREADS_X, THREADS_Y);
blocks = dim3(divup(feat_pyr[i], threads.x), 1);
- extract_orb<T><<<blocks, threads>>>(d_desc_lvl, feat_pyr[i],
- d_x_lvl, d_y_lvl, d_ori_lvl, d_size_lvl,
- img_pyr[i], lvl_scl[i], patch_size);
+ CUDA_LAUNCH((extract_orb<T>), blocks, threads,
+ d_desc_lvl, feat_pyr[i], d_x_lvl, d_y_lvl, d_ori_lvl, d_size_lvl,
+ img_pyr[i], lvl_scl[i], patch_size);
POST_LAUNCH_CHECK();
if (i > 0)
diff --git a/src/backend/cuda/kernel/random.hpp b/src/backend/cuda/kernel/random.hpp
index 0c3167c..a79a781 100644
--- a/src/backend/cuda/kernel/random.hpp
+++ b/src/backend/cuda/kernel/random.hpp
@@ -136,7 +136,7 @@ namespace kernel
CUDA_CHECK(cudaMalloc(&states[device], BLOCKS * THREADS * sizeof(curandState_t)));
}
- setup_kernel<<<BLOCKS, THREADS>>>(states[device], seed);
+ CUDA_LAUNCH((setup_kernel), BLOCKS, THREADS, states[device], seed);
POST_LAUNCH_CHECK();
is_init[device] = true;
}
@@ -149,7 +149,7 @@ namespace kernel
int threads = THREADS;
int blocks = divup(elements, THREADS);
if (blocks > BLOCKS) blocks = BLOCKS;
- uniform_kernel<<<blocks, threads>>>(out, states[device], elements);
+ CUDA_LAUNCH(uniform_kernel, blocks, threads, out, states[device], elements);
POST_LAUNCH_CHECK();
}
@@ -165,12 +165,12 @@ namespace kernel
if (!states[device]) {
CUDA_CHECK(cudaMalloc(&states[device], BLOCKS * THREADS * sizeof(curandState_t)));
- setup_kernel<<<BLOCKS, THREADS>>>(states[device], seed);
+ CUDA_LAUNCH(setup_kernel, BLOCKS, THREADS, states[device], seed);
POST_LAUNCH_CHECK();
}
- normal_kernel<<<blocks, threads>>>(out, states[device], elements);
+ CUDA_LAUNCH(normal_kernel, blocks, threads, out, states[device], elements);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/range.hpp b/src/backend/cuda/kernel/range.hpp
index 3cabc10..9670b07 100644
--- a/src/backend/cuda/kernel/range.hpp
+++ b/src/backend/cuda/kernel/range.hpp
@@ -82,7 +82,7 @@ namespace cuda
blocksPerMatY * out.dims[3],
1);
- range_kernel<T><<<blocks, threads>>>(out, dim, blocksPerMatX, blocksPerMatY);
+ CUDA_LAUNCH((range_kernel<T>), blocks, threads, out, dim, blocksPerMatX, blocksPerMatY);
POST_LAUNCH_CHECK();
}
}
diff --git a/src/backend/cuda/kernel/reduce.hpp b/src/backend/cuda/kernel/reduce.hpp
index 0263142..be52375 100644
--- a/src/backend/cuda/kernel/reduce.hpp
+++ b/src/backend/cuda/kernel/reduce.hpp
@@ -118,19 +118,19 @@ namespace kernel
switch (threads_y) {
case 8:
- (reduce_dim_kernel<Ti, To, op, dim, 8>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((reduce_dim_kernel<Ti, To, op, dim, 8>), blocks, threads,
out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
change_nan, scalar<To>(nanval)); break;
case 4:
- (reduce_dim_kernel<Ti, To, op, dim, 4>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((reduce_dim_kernel<Ti, To, op, dim, 4>), blocks, threads,
out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
change_nan, scalar<To>(nanval)); break;
case 2:
- (reduce_dim_kernel<Ti, To, op, dim, 2>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((reduce_dim_kernel<Ti, To, op, dim, 2>), blocks, threads,
out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
change_nan, scalar<To>(nanval)); break;
case 1:
- (reduce_dim_kernel<Ti, To, op, dim, 1>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((reduce_dim_kernel<Ti, To, op, dim, 1>), blocks, threads,
out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
change_nan, scalar<To>(nanval)); break;
}
@@ -303,16 +303,16 @@ namespace kernel
switch (threads_x) {
case 32:
- (reduce_first_kernel<Ti, To, op, 32>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((reduce_first_kernel<Ti, To, op, 32>), blocks, threads,
out, in, blocks_x, blocks_y, repeat, change_nan, scalar<To>(nanval)); break;
case 64:
- (reduce_first_kernel<Ti, To, op, 64>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((reduce_first_kernel<Ti, To, op, 64>), blocks, threads,
out, in, blocks_x, blocks_y, repeat, change_nan, scalar<To>(nanval)); break;
case 128:
- (reduce_first_kernel<Ti, To, op, 128>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((reduce_first_kernel<Ti, To, op, 128>), blocks, threads,
out, in, blocks_x, blocks_y, repeat, change_nan, scalar<To>(nanval)); break;
case 256:
- (reduce_first_kernel<Ti, To, op, 256>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((reduce_first_kernel<Ti, To, op, 256>), blocks, threads,
out, in, blocks_x, blocks_y, repeat, change_nan, scalar<To>(nanval)); break;
}
diff --git a/src/backend/cuda/kernel/regions.hpp b/src/backend/cuda/kernel/regions.hpp
index 3f7e21a..4693986 100644
--- a/src/backend/cuda/kernel/regions.hpp
+++ b/src/backend/cuda/kernel/regions.hpp
@@ -410,7 +410,7 @@ void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)
const dim3 blocks(blk_x, blk_y);
- (initial_label<T,n_per_thread>)<<<blocks, threads>>>(out, in);
+ CUDA_LAUNCH((initial_label<T,n_per_thread>), blocks, threads, out, in);
POST_LAUNCH_CHECK();
@@ -421,8 +421,7 @@ void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)
CUDA_CHECK(cudaMemcpyToSymbol(continue_flag, &h_continue, sizeof(int),
0, cudaMemcpyHostToDevice));
- (update_equiv<T, 16, n_per_thread, full_conn>)<<<blocks, threads>>>
- (out, tex);
+ CUDA_LAUNCH((update_equiv<T, 16, n_per_thread, full_conn>), blocks, threads, out, tex);
POST_LAUNCH_CHECK();
@@ -472,9 +471,8 @@ void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)
add);
// Apply the correct labels to the equivalency map
- (final_relabel<T,n_per_thread>)<<<blocks,threads>>>(out,
- in,
- thrust::raw_pointer_cast(&labels[0]));
+ CUDA_LAUNCH((final_relabel<T,n_per_thread>), blocks,threads,
+ out, in, thrust::raw_pointer_cast(&labels[0]));
POST_LAUNCH_CHECK();
diff --git a/src/backend/cuda/kernel/reorder.hpp b/src/backend/cuda/kernel/reorder.hpp
index eb71e9d..033dc13 100644
--- a/src/backend/cuda/kernel/reorder.hpp
+++ b/src/backend/cuda/kernel/reorder.hpp
@@ -82,8 +82,9 @@ namespace cuda
blocksPerMatY * out.dims[3],
1);
- reorder_kernel<T><<<blocks, threads>>>(out, in, rdims[0], rdims[1], rdims[2], rdims[3],
- blocksPerMatX, blocksPerMatY);
+ CUDA_LAUNCH((reorder_kernel<T>), blocks, threads,
+ out, in, rdims[0], rdims[1], rdims[2], rdims[3],
+ blocksPerMatX, blocksPerMatY);
POST_LAUNCH_CHECK();
}
}
diff --git a/src/backend/cuda/kernel/resize.hpp b/src/backend/cuda/kernel/resize.hpp
index d61c20e..6d831d8 100644
--- a/src/backend/cuda/kernel/resize.hpp
+++ b/src/backend/cuda/kernel/resize.hpp
@@ -184,7 +184,8 @@ namespace cuda
float xf = (float)in.dims[0] / (float)out.dims[0];
float yf = (float)in.dims[1] / (float)out.dims[1];
- resize_kernel<T, method><<<blocks, threads>>>(out, in, blocksPerMatX, blocksPerMatY, xf, yf);
+ CUDA_LAUNCH((resize_kernel<T, method>), blocks, threads,
+ out, in, blocksPerMatX, blocksPerMatY, xf, yf);
POST_LAUNCH_CHECK();
}
}
diff --git a/src/backend/cuda/kernel/rotate.hpp b/src/backend/cuda/kernel/rotate.hpp
index bac4cd0..d63f010 100644
--- a/src/backend/cuda/kernel/rotate.hpp
+++ b/src/backend/cuda/kernel/rotate.hpp
@@ -114,8 +114,8 @@ namespace cuda
blocks.y = blocks.y * nbatches;
- rotate_kernel<T, method><<<blocks, threads>>> (out, in, t, nimages, nbatches,
- blocksXPerImage, blocksYPerImage);
+ CUDA_LAUNCH((rotate_kernel<T, method>), blocks, threads,
+ out, in, t, nimages, nbatches, blocksXPerImage, blocksYPerImage);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/scan_dim.hpp b/src/backend/cuda/kernel/scan_dim.hpp
index 72e80b3..6bc0469 100644
--- a/src/backend/cuda/kernel/scan_dim.hpp
+++ b/src/backend/cuda/kernel/scan_dim.hpp
@@ -194,16 +194,16 @@ namespace kernel
switch (threads_y) {
case 8:
- (scan_dim_kernel<Ti, To, op, dim, isFinalPass, 8>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 8>), blocks, threads,
out, tmp, in, blocks_all[0], blocks_all[1], blocks_all[dim], lim); break;
case 4:
- (scan_dim_kernel<Ti, To, op, dim, isFinalPass, 4>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 4>), blocks, threads,
out, tmp, in, blocks_all[0], blocks_all[1], blocks_all[dim], lim); break;
case 2:
- (scan_dim_kernel<Ti, To, op, dim, isFinalPass, 2>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 2>), blocks, threads,
out, tmp, in, blocks_all[0], blocks_all[1], blocks_all[dim], lim); break;
case 1:
- (scan_dim_kernel<Ti, To, op, dim, isFinalPass, 1>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 1>), blocks, threads,
out, tmp, in, blocks_all[0], blocks_all[1], blocks_all[dim], lim); break;
}
diff --git a/src/backend/cuda/kernel/scan_first.hpp b/src/backend/cuda/kernel/scan_first.hpp
index fb370c9..4c63942 100644
--- a/src/backend/cuda/kernel/scan_first.hpp
+++ b/src/backend/cuda/kernel/scan_first.hpp
@@ -161,16 +161,16 @@ namespace kernel
switch (threads_x) {
case 32:
- (scan_first_kernel<Ti, To, op, isFinalPass, 32>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass, 32>), blocks, threads,
out, tmp, in, blocks_x, blocks_y, lim); break;
case 64:
- (scan_first_kernel<Ti, To, op, isFinalPass, 64>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass, 64>), blocks, threads,
out, tmp, in, blocks_x, blocks_y, lim); break;
case 128:
- (scan_first_kernel<Ti, To, op, isFinalPass, 128>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass, 128>), blocks, threads,
out, tmp, in, blocks_x, blocks_y, lim); break;
case 256:
- (scan_first_kernel<Ti, To, op, isFinalPass, 256>)<<<blocks, threads>>>(
+ CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass, 256>), blocks, threads,
out, tmp, in, blocks_x, blocks_y, lim); break;
}
@@ -193,8 +193,7 @@ namespace kernel
uint lim = divup(out.dims[0], (threads_x * blocks_x));
- (bcast_first_kernel<To, op>)<<<blocks, threads>>>(
- out, tmp, blocks_x, blocks_y, lim);
+ CUDA_LAUNCH((bcast_first_kernel<To, op>), blocks, threads, out, tmp, blocks_x, blocks_y, lim);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/shift.hpp b/src/backend/cuda/kernel/shift.hpp
index 5cbed9f..db73286 100644
--- a/src/backend/cuda/kernel/shift.hpp
+++ b/src/backend/cuda/kernel/shift.hpp
@@ -96,8 +96,9 @@ namespace cuda
assert(sdims_[i] >= 0 && sdims_[i] <= out.dims[i]);
}
- shift_kernel<T><<<blocks, threads>>>(out, in, sdims_[0], sdims_[1], sdims_[2], sdims_[3],
- blocksPerMatX, blocksPerMatY);
+ CUDA_LAUNCH((shift_kernel<T>), blocks, threads,
+ out, in, sdims_[0], sdims_[1], sdims_[2], sdims_[3],
+ blocksPerMatX, blocksPerMatY);
POST_LAUNCH_CHECK();
}
}
diff --git a/src/backend/cuda/kernel/sobel.hpp b/src/backend/cuda/kernel/sobel.hpp
index e7de1ea..550339e 100644
--- a/src/backend/cuda/kernel/sobel.hpp
+++ b/src/backend/cuda/kernel/sobel.hpp
@@ -124,9 +124,7 @@ void sobel(Param<To> dx, Param<To> dy, CParam<Ti> in, const unsigned &ker_size)
//TODO: add more cases when 5x5 and 7x7 kernels are done
switch(ker_size) {
- case 3:
- (sobel3x3<Ti, To>) <<< blocks, threads >>> (dx, dy, in, blk_x, blk_y);
- break;
+ case 3: CUDA_LAUNCH((sobel3x3<Ti, To>), blocks, threads, dx, dy, in, blk_x, blk_y); break;
}
POST_LAUNCH_CHECK();
diff --git a/src/backend/cuda/kernel/susan.hpp b/src/backend/cuda/kernel/susan.hpp
index b744edc..512cc38 100644
--- a/src/backend/cuda/kernel/susan.hpp
+++ b/src/backend/cuda/kernel/susan.hpp
@@ -110,7 +110,8 @@ void susan_responses(T* out, const T* in,
dim3 blocks(divup(idim0-edge*2, BLOCK_X), divup(idim1-edge*2, BLOCK_Y));
const size_t SMEM_SIZE = (BLOCK_X+2*radius)*(BLOCK_Y+2*radius)*sizeof(T);
- susanKernel<T><<<blocks, threads, SMEM_SIZE>>>(out, in, idim0, idim1, radius, t, g, edge);
+ CUDA_LAUNCH_SMEM((susanKernel<T>), blocks, threads, SMEM_SIZE,
+ out, in, idim0, idim1, radius, t, g, edge);
POST_LAUNCH_CHECK();
}
@@ -164,8 +165,8 @@ void nonMaximal(float* x_out, float* y_out, float* resp_out,
unsigned* d_corners_found = memAlloc<unsigned>(1);
CUDA_CHECK(cudaMemset(d_corners_found, 0, sizeof(unsigned)));
- nonMaxKernel<T><<<blocks, threads>>>(x_out, y_out, resp_out, d_corners_found,
- idim0, idim1, resp_in, edge, max_corners);
+ CUDA_LAUNCH((nonMaxKernel<T>), blocks, threads,
+ x_out, y_out, resp_out, d_corners_found, idim0, idim1, resp_in, edge, max_corners);
POST_LAUNCH_CHECK();
diff --git a/src/backend/cuda/kernel/tile.hpp b/src/backend/cuda/kernel/tile.hpp
index a0325cb..345e176 100644
--- a/src/backend/cuda/kernel/tile.hpp
+++ b/src/backend/cuda/kernel/tile.hpp
@@ -78,7 +78,7 @@ namespace cuda
blocksPerMatY * out.dims[3],
1);
- tile_kernel<T><<<blocks, threads>>>(out, in, blocksPerMatX, blocksPerMatY);
+ CUDA_LAUNCH((tile_kernel<T>), blocks, threads, out, in, blocksPerMatX, blocksPerMatY);
POST_LAUNCH_CHECK();
}
}
diff --git a/src/backend/cuda/kernel/transform.hpp b/src/backend/cuda/kernel/transform.hpp
index 4fdfa6c..0f3ee55 100644
--- a/src/backend/cuda/kernel/transform.hpp
+++ b/src/backend/cuda/kernel/transform.hpp
@@ -131,11 +131,11 @@ namespace cuda
if (ntransforms > 1) { blocks.y *= ntransforms; }
if(inverse) {
- transform_kernel<T, true, method><<<blocks, threads>>>
- (out, in, nimages, ntransforms, blocksXPerImage);
+ CUDA_LAUNCH((transform_kernel<T, true, method>), blocks, threads,
+ out, in, nimages, ntransforms, blocksXPerImage);
} else {
- transform_kernel<T, false, method><<<blocks, threads>>>
- (out, in, nimages, ntransforms, blocksXPerImage);
+ CUDA_LAUNCH((transform_kernel<T, false, method>), blocks, threads,
+ out, in, nimages, ntransforms, blocksXPerImage);
}
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/transpose_inplace.hpp b/src/backend/cuda/kernel/transpose_inplace.hpp
index b25917a..f004fc4 100644
--- a/src/backend/cuda/kernel/transpose_inplace.hpp
+++ b/src/backend/cuda/kernel/transpose_inplace.hpp
@@ -144,9 +144,9 @@ namespace kernel
dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);
if (in.dims[0] % TILE_DIM == 0 && in.dims[1] % TILE_DIM == 0)
- (transposeIP<T, conjugate, true >)<<<blocks, threads>>>(in, blk_x, blk_y);
+ CUDA_LAUNCH((transposeIP<T, conjugate, true >), blocks, threads, in, blk_x, blk_y);
else
- (transposeIP<T, conjugate, false>)<<<blocks, threads>>>(in, blk_x, blk_y);
+ CUDA_LAUNCH((transposeIP<T, conjugate, false>), blocks, threads, in, blk_x, blk_y);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/triangle.hpp b/src/backend/cuda/kernel/triangle.hpp
index 6bce765..374f6b1 100644
--- a/src/backend/cuda/kernel/triangle.hpp
+++ b/src/backend/cuda/kernel/triangle.hpp
@@ -79,7 +79,8 @@ namespace cuda
blocksPerMatY * r.dims[3],
1);
- triangle_kernel<T, is_upper, is_unit_diag><<<blocks, threads>>>(r, in, blocksPerMatX, blocksPerMatY);
+ CUDA_LAUNCH((triangle_kernel<T, is_upper, is_unit_diag>), blocks, threads,
+ r, in, blocksPerMatX, blocksPerMatY);
POST_LAUNCH_CHECK();
}
diff --git a/src/backend/cuda/kernel/unwrap.hpp b/src/backend/cuda/kernel/unwrap.hpp
index 8135844..4083ba2 100644
--- a/src/backend/cuda/kernel/unwrap.hpp
+++ b/src/backend/cuda/kernel/unwrap.hpp
@@ -100,7 +100,8 @@ namespace cuda
dim3 blocks(divup(out.dims[1], threads.y), out.dims[2] * out.dims[3]);
- unwrap_kernel<T, TX><<<blocks, threads>>>(out, in, wx, wy, sx, sy, px, py, nx, repsPerColumn);
+ CUDA_LAUNCH((unwrap_kernel<T, TX>), blocks, threads,
+ out, in, wx, wy, sx, sy, px, py, nx, repsPerColumn);
POST_LAUNCH_CHECK();
}
}
diff --git a/src/backend/cuda/kernel/where.hpp b/src/backend/cuda/kernel/where.hpp
index a3e6e4e..fb2fd1d 100644
--- a/src/backend/cuda/kernel/where.hpp
+++ b/src/backend/cuda/kernel/where.hpp
@@ -135,7 +135,8 @@ namespace kernel
uint lim = divup(otmp.dims[0], (threads_x * blocks_x));
- (get_out_idx<T>)<<<blocks, threads>>>(out.ptr, otmp, rtmp, in, blocks_x, blocks_y, lim);
+ CUDA_LAUNCH((get_out_idx<T>), blocks, threads,
+ out.ptr, otmp, rtmp, in, blocks_x, blocks_y, lim);
POST_LAUNCH_CHECK();
memFree(rtmp.ptr);
diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp
index fa5da8d..df15ce3 100644
--- a/src/backend/cuda/platform.cpp
+++ b/src/backend/cuda/platform.cpp
@@ -8,6 +8,7 @@
********************************************************/
#include <af/version.h>
+#include <af/cuda.h>
#include <platform.hpp>
#include <defines.hpp>
#include <driver.h>
@@ -267,6 +268,11 @@ int getDeviceNativeId(int device)
return -1;
}
+cudaStream_t getStream(int device)
+{
+ return DeviceManager::getInstance().streams[device];
+}
+
int setDevice(int device)
{
return DeviceManager::getInstance().setActiveDevice(device);
@@ -307,6 +313,11 @@ DeviceManager::DeviceManager()
sortDevices();
+ for(int i = 0; i < nDevices; i++) {
+ setActiveDevice(i, cuDevices[i].nativeId);
+ CUDA_CHECK(cudaStreamCreate(&streams[i]));
+ }
+
const char* deviceENV = getenv("AF_CUDA_DEFAULT_DEVICE");
if(!deviceENV) {
setActiveDevice(0, cuDevices[0].nativeId);
@@ -359,8 +370,20 @@ void sync(int device)
{
int currDevice = getActiveDeviceId();
setDevice(device);
- CUDA_CHECK(cudaDeviceSynchronize());
+ CUDA_CHECK(cudaStreamSynchronize(getStream(getActiveDeviceId())));
setDevice(currDevice);
}
}
+
+af_err afcu_get_stream(cudaStream_t* stream, int id)
+{
+ *stream = cuda::getStream(id);
+ return AF_SUCCESS;
+}
+
+af_err afcu_get_native_id(int* nativeid, int id)
+{
+ *nativeid = cuda::getDeviceNativeId(id);
+ return AF_SUCCESS;
+}
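With the C API in place, host code that avoids the C++ wrappers can still
target ArrayFire's stream. A minimal sketch (dst, src, bytes and af_id are
placeholders; error handling elided):

    cudaStream_t s;
    if (afcu_get_stream(&s, af_id) == AF_SUCCESS) {
        // Enqueue user work on ArrayFire's stream so it is ordered
        // with respect to ArrayFire's own kernels on this device.
        cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, s);
        cudaStreamSynchronize(s);
    }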
diff --git a/src/backend/cuda/platform.hpp b/src/backend/cuda/platform.hpp
index de63c03..a893b01 100644
--- a/src/backend/cuda/platform.hpp
+++ b/src/backend/cuda/platform.hpp
@@ -42,6 +42,8 @@ int getActiveDeviceId();
int getDeviceNativeId(int device);
+cudaStream_t getStream(int device);
+
int setDevice(int device);
void sync(int device);
@@ -77,6 +79,8 @@ class DeviceManager
friend int getDeviceNativeId(int device);
+ friend cudaStream_t getStream(int device);
+
friend int setDevice(int device);
friend cudaDeviceProp getDeviceProp(int device);
@@ -102,6 +106,7 @@ class DeviceManager
int activeDev;
int nDevices;
+ cudaStream_t streams[MAX_DEVICES];
};
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git