[arrayfire] 198/408: Replaced cuda Memcopy/Memset with async versions
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:56 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 6cf4a5a16a30df12745a929ec2b2e9154e2191c4
Author: pradeep <pradeep at arrayfire.com>
Date: Thu Aug 6 11:53:08 2015 -0400
Replaced cuda Memcopy/Memset with async versions
---
src/backend/cuda/Array.cpp | 6 +++---
src/backend/cuda/copy.cu | 6 ++++--
src/backend/cuda/hist_graphics.cu | 3 ++-
src/backend/cuda/image.cu | 3 ++-
src/backend/cuda/kernel/convolve.cu | 15 +++++++++------
src/backend/cuda/kernel/convolve_separable.cu | 3 ++-
src/backend/cuda/kernel/fast.hpp | 2 +-
src/backend/cuda/kernel/harris.hpp | 12 ++++++++----
src/backend/cuda/kernel/orb.hpp | 22 ++++++++++++++--------
src/backend/cuda/kernel/regions.hpp | 5 +++--
src/backend/cuda/kernel/susan.hpp | 3 ++-
src/backend/cuda/kernel/transform.hpp | 5 +++--
src/backend/cuda/morph3d_impl.hpp | 5 +++--
src/backend/cuda/morph_impl.hpp | 5 +++--
src/backend/cuda/plot.cu | 3 ++-
15 files changed, 61 insertions(+), 37 deletions(-)
diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp
index 07f5ff6..ed86a8e 100644
--- a/src/backend/cuda/Array.cpp
+++ b/src/backend/cuda/Array.cpp
@@ -255,9 +255,9 @@ namespace cuda
T *ptr = arr.get();
- CUDA_CHECK(cudaMemcpy(ptr + arr.getOffset(), data,
- bytes,
- cudaMemcpyDeviceToDevice));
+ CUDA_CHECK(cudaMemcpyAsync(ptr + arr.getOffset(), data,
+ bytes, cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
return;
}
diff --git a/src/backend/cuda/copy.cu b/src/backend/cuda/copy.cu
index c58b1cb..0be1c3d 100644
--- a/src/backend/cuda/copy.cu
+++ b/src/backend/cuda/copy.cu
@@ -55,7 +55,8 @@ namespace cuda
if (A.isLinear()) {
CUDA_CHECK(cudaMemcpyAsync(out.get(), A.get(),
A.elements() * sizeof(T),
- cudaMemcpyDeviceToDevice));
+ cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
} else {
// FIXME: Seems to fail when using Param<T>
kernel::memcopy(out.get(), out.strides().get(), A.get(), A.dims().get(),
@@ -91,7 +92,8 @@ namespace cuda
{
CUDA_CHECK(cudaMemcpyAsync(out.get(), in.get(),
in.elements() * sizeof(T),
- cudaMemcpyDeviceToDevice));
+ cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
} else {
kernel::copy<T, T>(out, in, in.ndims(), scalar<T>(0), 1);
}
diff --git a/src/backend/cuda/hist_graphics.cu b/src/backend/cuda/hist_graphics.cu
index d1424d8..69cb22c 100644
--- a/src/backend/cuda/hist_graphics.cu
+++ b/src/backend/cuda/hist_graphics.cu
@@ -31,7 +31,8 @@ void copy_histogram(const Array<T> &data, const fg::Histogram* hist)
T* d_vbo = NULL;
cudaGraphicsMapResources(1, &cudaVBOResource, 0);
cudaGraphicsResourceGetMappedPointer((void **)&d_vbo, &num_bytes, cudaVBOResource);
- cudaMemcpy(d_vbo, d_P, num_bytes, cudaMemcpyDeviceToDevice);
+ cudaMemcpyAsync(d_vbo, d_P, num_bytes, cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId()));
cudaGraphicsUnmapResources(1, &cudaVBOResource, 0);
CheckGL("After cuda resource copy");
diff --git a/src/backend/cuda/image.cu b/src/backend/cuda/image.cu
index bf6e7df..7370fb2 100644
--- a/src/backend/cuda/image.cu
+++ b/src/backend/cuda/image.cu
@@ -36,7 +36,8 @@ void copy_image(const Array<T> &in, const fg::Image* image)
T* d_pbo = NULL;
cudaGraphicsMapResources(1, &cudaPBOResource, 0);
cudaGraphicsResourceGetMappedPointer((void **)&d_pbo, &num_bytes, cudaPBOResource);
- cudaMemcpy(d_pbo, d_X, num_bytes, cudaMemcpyDeviceToDevice);
+ cudaMemcpyAsync(d_pbo, d_X, num_bytes, cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId()));
cudaGraphicsUnmapResources(1, &cudaPBOResource, 0);
POST_LAUNCH_CHECK();
diff --git a/src/backend/cuda/kernel/convolve.cu b/src/backend/cuda/kernel/convolve.cu
index d1f894e..78790c3 100644
--- a/src/backend/cuda/kernel/convolve.cu
+++ b/src/backend/cuda/kernel/convolve.cu
@@ -369,10 +369,11 @@ void convolve_1d(conv_kparam_t &p, Param<T> out, CParam<T> sig, CParam<aT> filt)
// FIXME: if the filter array is strided, direct copy of symbols
// might cause issues
- CUDA_CHECK(cudaMemcpyToSymbol(kernel::cFilter,
+ CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter,
filt.ptr+(f1Off+f2Off+f3Off),
filterLen*sizeof(aT),
- 0, cudaMemcpyDeviceToDevice));
+ 0, cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
p.o[0] = (p.outHasNoOffset ? 0 : b1);
p.o[1] = (p.outHasNoOffset ? 0 : b2);
@@ -406,10 +407,11 @@ void convolve_2d(conv_kparam_t &p, Param<T> out, CParam<T> sig, CParam<aT> filt)
// FIXME: if the filter array is strided, direct copy of symbols
// might cause issues
- CUDA_CHECK(cudaMemcpyToSymbol(kernel::cFilter,
+ CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter,
filt.ptr+(f2Off+f3Off),
filterLen*sizeof(aT),
- 0, cudaMemcpyDeviceToDevice));
+ 0, cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
p.o[1] = (p.outHasNoOffset ? 0 : b2);
p.o[2] = (p.outHasNoOffset ? 0 : b3);
@@ -433,10 +435,11 @@ void convolve_3d(conv_kparam_t &p, Param<T> out, CParam<T> sig, CParam<aT> filt)
// FIXME: if the filter array is strided, direct copy of symbols
// might cause issues
- CUDA_CHECK(cudaMemcpyToSymbol(kernel::cFilter,
+ CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter,
filt.ptr+f3Off,
filterLen*sizeof(aT),
- 0, cudaMemcpyDeviceToDevice));
+ 0, cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
p.o[2] = (p.outHasNoOffset ? 0 : b3);
p.s[2] = (p.inHasNoOffset ? 0 : b3);
diff --git a/src/backend/cuda/kernel/convolve_separable.cu b/src/backend/cuda/kernel/convolve_separable.cu
index 0b5f596..e2caec7 100644
--- a/src/backend/cuda/kernel/convolve_separable.cu
+++ b/src/backend/cuda/kernel/convolve_separable.cu
@@ -133,7 +133,8 @@ void convolve2(Param<T> out, CParam<T> signal, CParam<accType> filter)
// FIX ME: if the filter array is strided, direct copy of symbols
// might cause issues
- CUDA_CHECK(cudaMemcpyToSymbol(kernel::sFilter, filter.ptr, fLen*sizeof(accType), 0, cudaMemcpyDeviceToDevice));
+ CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::sFilter, filter.ptr, fLen*sizeof(accType), 0,
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
switch(fLen) {
case 2: conv2Helper<T, accType, conv_dim, expand, 2>(blocks, threads, out, signal, blk_x, blk_y); break;
diff --git a/src/backend/cuda/kernel/fast.hpp b/src/backend/cuda/kernel/fast.hpp
index cf2061b..df4f406 100644
--- a/src/backend/cuda/kernel/fast.hpp
+++ b/src/backend/cuda/kernel/fast.hpp
@@ -428,7 +428,7 @@ void fast(unsigned* out_feat,
blocks.y = divup(in.dims[1], 64);
unsigned *d_total = (unsigned *)(d_score + in.dims[0] * in.dims[1]);
- CUDA_CHECK(cudaMemset(d_total, 0, sizeof(unsigned)));
+ CUDA_CHECK(cudaMemsetAsync(d_total, 0, sizeof(unsigned), cuda::getStream(cuda::getActiveDeviceId())));
unsigned *d_counts = memAlloc<unsigned>(blocks.x * blocks.y);
unsigned *d_offsets = memAlloc<unsigned>(blocks.x * blocks.y);
diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp
index 11f628b..d6228de 100644
--- a/src/backend/cuda/kernel/harris.hpp
+++ b/src/backend/cuda/kernel/harris.hpp
@@ -276,7 +276,8 @@ void harris(unsigned* corners_out,
unsigned corner_lim = in.dims[3] * in.strides[3] * 0.2f;
unsigned* d_corners_found = memAlloc<unsigned>(1);
- CUDA_CHECK(cudaMemset(d_corners_found, 0, sizeof(unsigned)));
+ CUDA_CHECK(cudaMemsetAsync(d_corners_found, 0, sizeof(unsigned),
+ cuda::getStream(cuda::getActiveDeviceId())));
float* d_x_corners = memAlloc<float>(corner_lim);
float* d_y_corners = memAlloc<float>(corner_lim);
@@ -358,9 +359,12 @@ void harris(unsigned* corners_out,
*x_out = memAlloc<float>(*corners_out);
*y_out = memAlloc<float>(*corners_out);
*resp_out = memAlloc<float>(*corners_out);
- CUDA_CHECK(cudaMemcpy(*x_out, d_x_corners, *corners_out * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*y_out, d_y_corners, *corners_out * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*resp_out, d_resp_corners, *corners_out * sizeof(float), cudaMemcpyDeviceToDevice));
+ CUDA_CHECK(cudaMemcpyAsync(*x_out, d_x_corners, *corners_out * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaMemcpyAsync(*y_out, d_y_corners, *corners_out * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaMemcpyAsync(*resp_out, d_resp_corners, *corners_out * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
memFree(d_x_corners);
memFree(d_y_corners);
diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp
index ad99fcb..38c6eaa 100644
--- a/src/backend/cuda/kernel/orb.hpp
+++ b/src/backend/cuda/kernel/orb.hpp
@@ -455,7 +455,8 @@ void orb(unsigned* out_feat,
float* d_size_lvl = memAlloc<float>(feat_pyr[i]);
unsigned* d_desc_lvl = memAlloc<unsigned>(feat_pyr[i] * 8);
- CUDA_CHECK(cudaMemset(d_desc_lvl, 0, feat_pyr[i] * 8 * sizeof(unsigned)));
+ CUDA_CHECK(cudaMemsetAsync(d_desc_lvl, 0, feat_pyr[i] * 8 * sizeof(unsigned),
+ cuda::getStream(cuda::getActiveDeviceId())));
// Compute ORB descriptors
threads = dim3(THREADS_X, THREADS_Y);
@@ -502,13 +503,18 @@ void orb(unsigned* out_feat,
if (i > 0)
offset += feat_pyr[i-1];
- CUDA_CHECK(cudaMemcpy(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*d_score+offset, d_score_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-
- CUDA_CHECK(cudaMemcpy(*d_desc+(offset*8), d_desc_pyr[i], feat_pyr[i] * 8 * sizeof(unsigned), cudaMemcpyDeviceToDevice));
+ CUDA_CHECK(cudaMemcpyAsync(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaMemcpyAsync(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaMemcpyAsync(*d_score+offset, d_score_pyr[i], feat_pyr[i] * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaMemcpyAsync(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaMemcpyAsync(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaMemcpyAsync(*d_desc+(offset*8), d_desc_pyr[i], feat_pyr[i] * 8 * sizeof(unsigned),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
memFree(d_x_pyr[i]);
memFree(d_y_pyr[i]);
diff --git a/src/backend/cuda/kernel/regions.hpp b/src/backend/cuda/kernel/regions.hpp
index 9524811..27f1029 100644
--- a/src/backend/cuda/kernel/regions.hpp
+++ b/src/backend/cuda/kernel/regions.hpp
@@ -436,8 +436,9 @@ void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)
// 1.
int size = in.dims[0] * in.dims[1];
T* tmp = cuda::memAlloc<T>(size);
- CUDA_CHECK(cudaMemcpy(tmp, out.ptr, size * sizeof(T),
- cudaMemcpyDeviceToDevice));
+ CUDA_CHECK(cudaMemcpyAsync(tmp, out.ptr, size * sizeof(T),
+ cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
// Wrap raw device ptr
thrust::device_ptr<T> wrapped_tmp = thrust::device_pointer_cast(tmp);
diff --git a/src/backend/cuda/kernel/susan.hpp b/src/backend/cuda/kernel/susan.hpp
index 512cc38..fdbd88a 100644
--- a/src/backend/cuda/kernel/susan.hpp
+++ b/src/backend/cuda/kernel/susan.hpp
@@ -163,7 +163,8 @@ void nonMaximal(float* x_out, float* y_out, float* resp_out,
dim3 blocks(divup(idim0-edge*2, BLOCK_X), divup(idim1-edge*2, BLOCK_Y));
unsigned* d_corners_found = memAlloc<unsigned>(1);
- CUDA_CHECK(cudaMemset(d_corners_found, 0, sizeof(unsigned)));
+ CUDA_CHECK(cudaMemsetAsync(d_corners_found, 0, sizeof(unsigned),
+ cuda::getStream(cuda::getActiveDeviceId())));
CUDA_LAUNCH((nonMaxKernel<T>), blocks, threads,
x_out, y_out, resp_out, d_corners_found, idim0, idim1, resp_in, edge, max_corners);
diff --git a/src/backend/cuda/kernel/transform.hpp b/src/backend/cuda/kernel/transform.hpp
index 0f3ee55..07be0a3 100644
--- a/src/backend/cuda/kernel/transform.hpp
+++ b/src/backend/cuda/kernel/transform.hpp
@@ -115,8 +115,9 @@ namespace cuda
const int ntransforms = out.dims[2] / in.dims[2];
// Copy transform to constant memory.
- CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
- cudaMemcpyDeviceToDevice));
+ CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
+ cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
dim3 threads(TX, TY, 1);
dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));
diff --git a/src/backend/cuda/morph3d_impl.hpp b/src/backend/cuda/morph3d_impl.hpp
index bd98ebd..5a02fad 100644
--- a/src/backend/cuda/morph3d_impl.hpp
+++ b/src/backend/cuda/morph3d_impl.hpp
@@ -33,9 +33,10 @@ Array<T> morph3d(const Array<T> &in, const Array<T> &mask)
Array<T> out = createEmptyArray<T>(in.dims());
- CUDA_CHECK(cudaMemcpyToSymbol(kernel::cFilter, mask.get(),
+ CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
mdims[0] * mdims[1] *mdims[2] * sizeof(T),
- 0, cudaMemcpyDeviceToDevice));
+ 0, cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
if (isDilation)
kernel::morph3d<T, true >(out, in, mdims[0]);
diff --git a/src/backend/cuda/morph_impl.hpp b/src/backend/cuda/morph_impl.hpp
index 0b5b653..ea517d0 100644
--- a/src/backend/cuda/morph_impl.hpp
+++ b/src/backend/cuda/morph_impl.hpp
@@ -32,9 +32,10 @@ Array<T> morph(const Array<T> &in, const Array<T> &mask)
Array<T> out = createEmptyArray<T>(in.dims());
- CUDA_CHECK(cudaMemcpyToSymbol(kernel::cFilter, mask.get(),
+ CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
mdims[0] * mdims[1] * sizeof(T),
- 0, cudaMemcpyDeviceToDevice));
+ 0, cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
if (isDilation)
kernel::morph<T, true >(out, in, mdims[0]);
diff --git a/src/backend/cuda/plot.cu b/src/backend/cuda/plot.cu
index 195988e..40a004e 100644
--- a/src/backend/cuda/plot.cu
+++ b/src/backend/cuda/plot.cu
@@ -36,7 +36,8 @@ void copy_plot(const Array<T> &P, fg::Plot* plot)
T* d_vbo = NULL;
cudaGraphicsMapResources(1, &cudaVBOResource, 0);
cudaGraphicsResourceGetMappedPointer((void **)&d_vbo, &num_bytes, cudaVBOResource);
- cudaMemcpy(d_vbo, d_P, num_bytes, cudaMemcpyDeviceToDevice);
+ cudaMemcpyAsync(d_vbo, d_P, num_bytes, cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId()));
cudaGraphicsUnmapResources(1, &cudaVBOResource, 0);
CheckGL("After cuda resource copy");
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list