[arrayfire] 198/408: Replaced cuda Memcopy/Memset with async versions

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:56 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit 6cf4a5a16a30df12745a929ec2b2e9154e2191c4
Author: pradeep <pradeep at arrayfire.com>
Date:   Thu Aug 6 11:53:08 2015 -0400

    Replaced cuda Memcopy/Memset with async versions
---
 src/backend/cuda/Array.cpp                    |  6 +++---
 src/backend/cuda/copy.cu                      |  6 ++++--
 src/backend/cuda/hist_graphics.cu             |  3 ++-
 src/backend/cuda/image.cu                     |  3 ++-
 src/backend/cuda/kernel/convolve.cu           | 15 +++++++++------
 src/backend/cuda/kernel/convolve_separable.cu |  3 ++-
 src/backend/cuda/kernel/fast.hpp              |  2 +-
 src/backend/cuda/kernel/harris.hpp            | 12 ++++++++----
 src/backend/cuda/kernel/orb.hpp               | 22 ++++++++++++++--------
 src/backend/cuda/kernel/regions.hpp           |  5 +++--
 src/backend/cuda/kernel/susan.hpp             |  3 ++-
 src/backend/cuda/kernel/transform.hpp         |  5 +++--
 src/backend/cuda/morph3d_impl.hpp             |  5 +++--
 src/backend/cuda/morph_impl.hpp               |  5 +++--
 src/backend/cuda/plot.cu                      |  3 ++-
 15 files changed, 61 insertions(+), 37 deletions(-)

diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp
index 07f5ff6..ed86a8e 100644
--- a/src/backend/cuda/Array.cpp
+++ b/src/backend/cuda/Array.cpp
@@ -255,9 +255,9 @@ namespace cuda
 
         T *ptr = arr.get();
 
-        CUDA_CHECK(cudaMemcpy(ptr + arr.getOffset(), data,
-                              bytes,
-                              cudaMemcpyDeviceToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(ptr + arr.getOffset(), data,
+                                   bytes, cudaMemcpyDeviceToDevice,
+                                   cuda::getStream(cuda::getActiveDeviceId())));
 
         return;
     }
diff --git a/src/backend/cuda/copy.cu b/src/backend/cuda/copy.cu
index c58b1cb..0be1c3d 100644
--- a/src/backend/cuda/copy.cu
+++ b/src/backend/cuda/copy.cu
@@ -55,7 +55,8 @@ namespace cuda
         if (A.isLinear()) {
             CUDA_CHECK(cudaMemcpyAsync(out.get(), A.get(),
                                        A.elements() * sizeof(T),
-                                       cudaMemcpyDeviceToDevice));
+                                       cudaMemcpyDeviceToDevice,
+                                       cuda::getStream(cuda::getActiveDeviceId())));
         } else {
             // FIXME: Seems to fail when using Param<T>
             kernel::memcopy(out.get(), out.strides().get(), A.get(), A.dims().get(),
@@ -91,7 +92,8 @@ namespace cuda
             {
                 CUDA_CHECK(cudaMemcpyAsync(out.get(), in.get(),
                                            in.elements() * sizeof(T),
-                                           cudaMemcpyDeviceToDevice));
+                                           cudaMemcpyDeviceToDevice,
+                                           cuda::getStream(cuda::getActiveDeviceId())));
             } else {
                 kernel::copy<T, T>(out, in, in.ndims(), scalar<T>(0), 1);
             }
diff --git a/src/backend/cuda/hist_graphics.cu b/src/backend/cuda/hist_graphics.cu
index d1424d8..69cb22c 100644
--- a/src/backend/cuda/hist_graphics.cu
+++ b/src/backend/cuda/hist_graphics.cu
@@ -31,7 +31,8 @@ void copy_histogram(const Array<T> &data, const fg::Histogram* hist)
     T* d_vbo = NULL;
     cudaGraphicsMapResources(1, &cudaVBOResource, 0);
     cudaGraphicsResourceGetMappedPointer((void **)&d_vbo, &num_bytes, cudaVBOResource);
-    cudaMemcpy(d_vbo, d_P, num_bytes, cudaMemcpyDeviceToDevice);
+    cudaMemcpyAsync(d_vbo, d_P, num_bytes, cudaMemcpyDeviceToDevice,
+                    cuda::getStream(cuda::getActiveDeviceId()));
     cudaGraphicsUnmapResources(1, &cudaVBOResource, 0);
 
     CheckGL("After cuda resource copy");
diff --git a/src/backend/cuda/image.cu b/src/backend/cuda/image.cu
index bf6e7df..7370fb2 100644
--- a/src/backend/cuda/image.cu
+++ b/src/backend/cuda/image.cu
@@ -36,7 +36,8 @@ void copy_image(const Array<T> &in, const fg::Image* image)
     T* d_pbo = NULL;
     cudaGraphicsMapResources(1, &cudaPBOResource, 0);
     cudaGraphicsResourceGetMappedPointer((void **)&d_pbo, &num_bytes, cudaPBOResource);
-    cudaMemcpy(d_pbo, d_X, num_bytes, cudaMemcpyDeviceToDevice);
+    cudaMemcpyAsync(d_pbo, d_X, num_bytes, cudaMemcpyDeviceToDevice,
+                    cuda::getStream(cuda::getActiveDeviceId()));
     cudaGraphicsUnmapResources(1, &cudaPBOResource, 0);
 
     POST_LAUNCH_CHECK();
diff --git a/src/backend/cuda/kernel/convolve.cu b/src/backend/cuda/kernel/convolve.cu
index d1f894e..78790c3 100644
--- a/src/backend/cuda/kernel/convolve.cu
+++ b/src/backend/cuda/kernel/convolve.cu
@@ -369,10 +369,11 @@ void convolve_1d(conv_kparam_t &p, Param<T> out, CParam<T> sig, CParam<aT> filt)
 
                 // FIXME: if the filter array is strided, direct copy of symbols
                 // might cause issues
-                CUDA_CHECK(cudaMemcpyToSymbol(kernel::cFilter,
+                CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter,
                                               filt.ptr+(f1Off+f2Off+f3Off),
                                               filterLen*sizeof(aT),
-                                              0, cudaMemcpyDeviceToDevice));
+                                              0, cudaMemcpyDeviceToDevice,
+                                              cuda::getStream(cuda::getActiveDeviceId())));
 
                 p.o[0] = (p.outHasNoOffset ? 0 : b1);
                 p.o[1] = (p.outHasNoOffset ? 0 : b2);
@@ -406,10 +407,11 @@ void convolve_2d(conv_kparam_t &p, Param<T> out, CParam<T> sig, CParam<aT> filt)
 
             // FIXME: if the filter array is strided, direct copy of symbols
             // might cause issues
-            CUDA_CHECK(cudaMemcpyToSymbol(kernel::cFilter,
+            CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter,
                                           filt.ptr+(f2Off+f3Off),
                                           filterLen*sizeof(aT),
-                                          0, cudaMemcpyDeviceToDevice));
+                                          0, cudaMemcpyDeviceToDevice,
+                                          cuda::getStream(cuda::getActiveDeviceId())));
 
             p.o[1] = (p.outHasNoOffset ? 0 : b2);
             p.o[2] = (p.outHasNoOffset ? 0 : b3);
@@ -433,10 +435,11 @@ void convolve_3d(conv_kparam_t &p, Param<T> out, CParam<T> sig, CParam<aT> filt)
 
         // FIXME: if the filter array is strided, direct copy of symbols
         // might cause issues
-        CUDA_CHECK(cudaMemcpyToSymbol(kernel::cFilter,
+        CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter,
                     filt.ptr+f3Off,
                     filterLen*sizeof(aT),
-                    0, cudaMemcpyDeviceToDevice));
+                    0, cudaMemcpyDeviceToDevice,
+                    cuda::getStream(cuda::getActiveDeviceId())));
 
         p.o[2] = (p.outHasNoOffset ? 0 : b3);
         p.s[2] = (p.inHasNoOffset ? 0 : b3);
diff --git a/src/backend/cuda/kernel/convolve_separable.cu b/src/backend/cuda/kernel/convolve_separable.cu
index 0b5f596..e2caec7 100644
--- a/src/backend/cuda/kernel/convolve_separable.cu
+++ b/src/backend/cuda/kernel/convolve_separable.cu
@@ -133,7 +133,8 @@ void convolve2(Param<T> out, CParam<T> signal, CParam<accType> filter)
 
    // FIX ME: if the filter array is strided, direct copy of symbols
    // might cause issues
-   CUDA_CHECK(cudaMemcpyToSymbol(kernel::sFilter, filter.ptr, fLen*sizeof(accType), 0, cudaMemcpyDeviceToDevice));
+   CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::sFilter, filter.ptr, fLen*sizeof(accType), 0,
+               cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
 
     switch(fLen) {
         case  2: conv2Helper<T, accType, conv_dim, expand,  2>(blocks, threads, out, signal, blk_x, blk_y); break;
diff --git a/src/backend/cuda/kernel/fast.hpp b/src/backend/cuda/kernel/fast.hpp
index cf2061b..df4f406 100644
--- a/src/backend/cuda/kernel/fast.hpp
+++ b/src/backend/cuda/kernel/fast.hpp
@@ -428,7 +428,7 @@ void fast(unsigned* out_feat,
     blocks.y = divup(in.dims[1], 64);
 
     unsigned *d_total = (unsigned *)(d_score + in.dims[0] * in.dims[1]);
-    CUDA_CHECK(cudaMemset(d_total, 0, sizeof(unsigned)));
+    CUDA_CHECK(cudaMemsetAsync(d_total, 0, sizeof(unsigned), cuda::getStream(cuda::getActiveDeviceId())));
     unsigned *d_counts  = memAlloc<unsigned>(blocks.x * blocks.y);
     unsigned *d_offsets = memAlloc<unsigned>(blocks.x * blocks.y);
 
diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp
index 11f628b..d6228de 100644
--- a/src/backend/cuda/kernel/harris.hpp
+++ b/src/backend/cuda/kernel/harris.hpp
@@ -276,7 +276,8 @@ void harris(unsigned* corners_out,
     unsigned corner_lim = in.dims[3] * in.strides[3] * 0.2f;
 
     unsigned* d_corners_found = memAlloc<unsigned>(1);
-    CUDA_CHECK(cudaMemset(d_corners_found, 0, sizeof(unsigned)));
+    CUDA_CHECK(cudaMemsetAsync(d_corners_found, 0, sizeof(unsigned),
+                cuda::getStream(cuda::getActiveDeviceId())));
 
     float* d_x_corners = memAlloc<float>(corner_lim);
     float* d_y_corners = memAlloc<float>(corner_lim);
@@ -358,9 +359,12 @@ void harris(unsigned* corners_out,
         *x_out = memAlloc<float>(*corners_out);
         *y_out = memAlloc<float>(*corners_out);
         *resp_out = memAlloc<float>(*corners_out);
-        CUDA_CHECK(cudaMemcpy(*x_out, d_x_corners, *corners_out * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*y_out, d_y_corners, *corners_out * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*resp_out, d_resp_corners, *corners_out * sizeof(float), cudaMemcpyDeviceToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(*x_out, d_x_corners, *corners_out * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaMemcpyAsync(*y_out, d_y_corners, *corners_out * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaMemcpyAsync(*resp_out, d_resp_corners, *corners_out * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
 
         memFree(d_x_corners);
         memFree(d_y_corners);
diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp
index ad99fcb..38c6eaa 100644
--- a/src/backend/cuda/kernel/orb.hpp
+++ b/src/backend/cuda/kernel/orb.hpp
@@ -455,7 +455,8 @@ void orb(unsigned* out_feat,
         float* d_size_lvl = memAlloc<float>(feat_pyr[i]);
 
         unsigned* d_desc_lvl = memAlloc<unsigned>(feat_pyr[i] * 8);
-        CUDA_CHECK(cudaMemset(d_desc_lvl, 0, feat_pyr[i] * 8 * sizeof(unsigned)));
+        CUDA_CHECK(cudaMemsetAsync(d_desc_lvl, 0, feat_pyr[i] * 8 * sizeof(unsigned),
+                    cuda::getStream(cuda::getActiveDeviceId())));
 
         // Compute ORB descriptors
         threads = dim3(THREADS_X, THREADS_Y);
@@ -502,13 +503,18 @@ void orb(unsigned* out_feat,
         if (i > 0)
             offset += feat_pyr[i-1];
 
-        CUDA_CHECK(cudaMemcpy(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*d_score+offset, d_score_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-
-        CUDA_CHECK(cudaMemcpy(*d_desc+(offset*8), d_desc_pyr[i], feat_pyr[i] * 8 * sizeof(unsigned), cudaMemcpyDeviceToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaMemcpyAsync(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaMemcpyAsync(*d_score+offset, d_score_pyr[i], feat_pyr[i] * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaMemcpyAsync(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaMemcpyAsync(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+        CUDA_CHECK(cudaMemcpyAsync(*d_desc+(offset*8), d_desc_pyr[i], feat_pyr[i] * 8 * sizeof(unsigned),
+                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
 
         memFree(d_x_pyr[i]);
         memFree(d_y_pyr[i]);
diff --git a/src/backend/cuda/kernel/regions.hpp b/src/backend/cuda/kernel/regions.hpp
index 9524811..27f1029 100644
--- a/src/backend/cuda/kernel/regions.hpp
+++ b/src/backend/cuda/kernel/regions.hpp
@@ -436,8 +436,9 @@ void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)
     // 1.
     int size = in.dims[0] * in.dims[1];
     T* tmp = cuda::memAlloc<T>(size);
-    CUDA_CHECK(cudaMemcpy(tmp, out.ptr, size * sizeof(T),
-                          cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpyAsync(tmp, out.ptr, size * sizeof(T),
+                          cudaMemcpyDeviceToDevice,
+                          cuda::getStream(cuda::getActiveDeviceId())));
 
     // Wrap raw device ptr
     thrust::device_ptr<T> wrapped_tmp = thrust::device_pointer_cast(tmp);
diff --git a/src/backend/cuda/kernel/susan.hpp b/src/backend/cuda/kernel/susan.hpp
index 512cc38..fdbd88a 100644
--- a/src/backend/cuda/kernel/susan.hpp
+++ b/src/backend/cuda/kernel/susan.hpp
@@ -163,7 +163,8 @@ void nonMaximal(float* x_out, float* y_out, float* resp_out,
     dim3 blocks(divup(idim0-edge*2, BLOCK_X), divup(idim1-edge*2, BLOCK_Y));
 
     unsigned* d_corners_found = memAlloc<unsigned>(1);
-    CUDA_CHECK(cudaMemset(d_corners_found, 0, sizeof(unsigned)));
+    CUDA_CHECK(cudaMemsetAsync(d_corners_found, 0, sizeof(unsigned),
+                cuda::getStream(cuda::getActiveDeviceId())));
 
     CUDA_LAUNCH((nonMaxKernel<T>), blocks, threads,
             x_out, y_out, resp_out, d_corners_found, idim0, idim1, resp_in, edge, max_corners);
diff --git a/src/backend/cuda/kernel/transform.hpp b/src/backend/cuda/kernel/transform.hpp
index 0f3ee55..07be0a3 100644
--- a/src/backend/cuda/kernel/transform.hpp
+++ b/src/backend/cuda/kernel/transform.hpp
@@ -115,8 +115,9 @@ namespace cuda
             const int ntransforms = out.dims[2] / in.dims[2];
 
             // Copy transform to constant memory.
-            CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
-                                          cudaMemcpyDeviceToDevice));
+            CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
+                                          cudaMemcpyDeviceToDevice,
+                                          cuda::getStream(cuda::getActiveDeviceId())));
 
             dim3 threads(TX, TY, 1);
             dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));
diff --git a/src/backend/cuda/morph3d_impl.hpp b/src/backend/cuda/morph3d_impl.hpp
index bd98ebd..5a02fad 100644
--- a/src/backend/cuda/morph3d_impl.hpp
+++ b/src/backend/cuda/morph3d_impl.hpp
@@ -33,9 +33,10 @@ Array<T> morph3d(const Array<T> &in, const Array<T> &mask)
 
     Array<T> out       = createEmptyArray<T>(in.dims());
 
-    CUDA_CHECK(cudaMemcpyToSymbol(kernel::cFilter, mask.get(),
+    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                                   mdims[0] * mdims[1] *mdims[2] * sizeof(T),
-                                  0, cudaMemcpyDeviceToDevice));
+                                  0, cudaMemcpyDeviceToDevice,
+                                  cuda::getStream(cuda::getActiveDeviceId())));
 
     if (isDilation)
         kernel::morph3d<T, true >(out, in, mdims[0]);
diff --git a/src/backend/cuda/morph_impl.hpp b/src/backend/cuda/morph_impl.hpp
index 0b5b653..ea517d0 100644
--- a/src/backend/cuda/morph_impl.hpp
+++ b/src/backend/cuda/morph_impl.hpp
@@ -32,9 +32,10 @@ Array<T>  morph(const Array<T> &in, const Array<T> &mask)
 
     Array<T> out = createEmptyArray<T>(in.dims());
 
-    CUDA_CHECK(cudaMemcpyToSymbol(kernel::cFilter, mask.get(),
+    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                                   mdims[0] * mdims[1] * sizeof(T),
-                                  0, cudaMemcpyDeviceToDevice));
+                                  0, cudaMemcpyDeviceToDevice,
+                                  cuda::getStream(cuda::getActiveDeviceId())));
 
     if (isDilation)
         kernel::morph<T, true >(out, in, mdims[0]);
diff --git a/src/backend/cuda/plot.cu b/src/backend/cuda/plot.cu
index 195988e..40a004e 100644
--- a/src/backend/cuda/plot.cu
+++ b/src/backend/cuda/plot.cu
@@ -36,7 +36,8 @@ void copy_plot(const Array<T> &P, fg::Plot* plot)
     T* d_vbo = NULL;
     cudaGraphicsMapResources(1, &cudaVBOResource, 0);
     cudaGraphicsResourceGetMappedPointer((void **)&d_vbo, &num_bytes, cudaVBOResource);
-    cudaMemcpy(d_vbo, d_P, num_bytes, cudaMemcpyDeviceToDevice);
+    cudaMemcpyAsync(d_vbo, d_P, num_bytes, cudaMemcpyDeviceToDevice,
+               cuda::getStream(cuda::getActiveDeviceId()));
     cudaGraphicsUnmapResources(1, &cudaVBOResource, 0);
 
     CheckGL("After cuda resource copy");

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git



More information about the debian-science-commits mailing list