[arrayfire] 151/248: Replaced deviceSynchronize calls with async versions
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Nov 17 15:54:18 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch dfsg-clean
in repository arrayfire.
commit 8f54598104760937d4505eee9cd893f14b36d889
Author: pradeep <pradeep at arrayfire.com>
Date: Thu Oct 29 16:51:35 2015 -0400
Replaced deviceSynchronize calls with async versions
---
src/backend/cuda/Array.cpp | 10 ++++---
src/backend/cuda/kernel/fast.hpp | 4 ++-
src/backend/cuda/kernel/harris.hpp | 7 +++--
src/backend/cuda/kernel/ireduce.hpp | 11 +++++---
src/backend/cuda/kernel/orb.hpp | 7 +++--
src/backend/cuda/kernel/reduce.hpp | 8 ++++--
src/backend/cuda/kernel/regions.hpp | 11 +++++---
src/backend/cuda/kernel/sift_nonfree.hpp | 46 ++++++++++++++++++++++----------
src/backend/cuda/kernel/susan.hpp | 4 ++-
src/backend/cuda/kernel/where.hpp | 6 +++--
10 files changed, 79 insertions(+), 35 deletions(-)
diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp
index b7d7b3c..23a7512 100644
--- a/src/backend/cuda/Array.cpp
+++ b/src/backend/cuda/Array.cpp
@@ -46,7 +46,9 @@ namespace cuda
static_assert(offsetof(Array<T>, info) == 0, "Array<T>::info must be the first member variable of Array<T>");
#endif
if (!is_device) {
- CUDA_CHECK(cudaMemcpy(data.get(), in_data, dims.elements() * sizeof(T), cudaMemcpyHostToDevice));
+ CUDA_CHECK(cudaMemcpyAsync(data.get(), in_data, dims.elements() * sizeof(T),
+ cudaMemcpyHostToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
}
}
@@ -238,9 +240,9 @@ namespace cuda
T *ptr = arr.get();
- CUDA_CHECK(cudaMemcpy(ptr + arr.getOffset(), data,
- bytes,
- cudaMemcpyHostToDevice));
+ CUDA_CHECK(cudaMemcpyAsync(ptr + arr.getOffset(), data, bytes, cudaMemcpyHostToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
return;
}
diff --git a/src/backend/cuda/kernel/fast.hpp b/src/backend/cuda/kernel/fast.hpp
index 5f220cd..6d6b0e0 100644
--- a/src/backend/cuda/kernel/fast.hpp
+++ b/src/backend/cuda/kernel/fast.hpp
@@ -465,7 +465,9 @@ void fast(unsigned* out_feat,
// Dimensions of output array
unsigned total;
- CUDA_CHECK(cudaMemcpy(&total, d_total, sizeof(unsigned), cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(&total, d_total, sizeof(unsigned), cudaMemcpyDeviceToHost,
+ cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
total = total < max_feat ? total : max_feat;
if (total > 0) {
diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp
index d6228de..44f98d9 100644
--- a/src/backend/cuda/kernel/harris.hpp
+++ b/src/backend/cuda/kernel/harris.hpp
@@ -216,7 +216,8 @@ void harris(unsigned* corners_out,
int filter_elem = filter.strides[3] * filter.dims[3];
filter.ptr = memAlloc<convAccT>(filter_elem);
- CUDA_CHECK(cudaMemcpy(filter.ptr, h_filter, filter_elem * sizeof(convAccT), cudaMemcpyHostToDevice));
+ CUDA_CHECK(cudaMemcpyAsync(filter.ptr, h_filter, filter_elem * sizeof(convAccT),
+ cudaMemcpyHostToDevice, cuda::getStream(cuda::getActiveDeviceId())));
delete[] h_filter;
@@ -305,7 +306,9 @@ void harris(unsigned* corners_out,
in.dims[0], in.dims[1], d_responses, min_r, border_len, corner_lim);
unsigned corners_found = 0;
- CUDA_CHECK(cudaMemcpy(&corners_found, d_corners_found, sizeof(unsigned), cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(&corners_found, d_corners_found, sizeof(unsigned),
+ cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
memFree(d_responses);
memFree(d_corners_found);
diff --git a/src/backend/cuda/kernel/ireduce.hpp b/src/backend/cuda/kernel/ireduce.hpp
index 4354f2a..7aaeb24 100644
--- a/src/backend/cuda/kernel/ireduce.hpp
+++ b/src/backend/cuda/kernel/ireduce.hpp
@@ -492,8 +492,11 @@ namespace kernel
T* h_ptr_raw = h_ptr.get();
uint* h_lptr_raw = h_lptr.get();
- CUDA_CHECK(cudaMemcpy(h_ptr_raw, tmp.ptr, tmp_elements * sizeof(T), cudaMemcpyDeviceToHost));
- CUDA_CHECK(cudaMemcpy(h_lptr_raw, tlptr, tmp_elements * sizeof(uint), cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(h_ptr_raw, tmp.ptr, tmp_elements * sizeof(T),
+ cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaMemcpyAsync(h_lptr_raw, tlptr, tmp_elements * sizeof(uint),
+ cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
memFree(tmp.ptr);
memFree(tlptr);
@@ -520,7 +523,9 @@ namespace kernel
scoped_ptr<T> h_ptr(new T[in_elements]);
T* h_ptr_raw = h_ptr.get();
- CUDA_CHECK(cudaMemcpy(h_ptr_raw, in.ptr, in_elements * sizeof(T), cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(h_ptr_raw, in.ptr, in_elements * sizeof(T),
+ cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
MinMaxOp<op, T> Op(h_ptr_raw[0], 0);
for (int i = 1; i < in_elements; i++) {
diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp
index 4935405..89de560 100644
--- a/src/backend/cuda/kernel/orb.hpp
+++ b/src/backend/cuda/kernel/orb.hpp
@@ -330,7 +330,8 @@ void orb(unsigned* out_feat,
// In future implementations, the user will be capable of passing his
// distribution instead of using the reference one
- //CUDA_CHECK(cudaMemcpyToSymbol(d_ref_pat, h_ref_pat, 256 * 4 * sizeof(int), 0, cudaMemcpyHostToDevice));
+ //CUDA_CHECK(cudaMemcpyToSymbolAsync(d_ref_pat, h_ref_pat, 256 * 4 * sizeof(int), 0,
+ // cudaMemcpyHostToDevice, cuda::getStream(cuda::getActiveDeviceId())));
vector<float*> d_score_pyr(max_levels);
vector<float*> d_ori_pyr(max_levels);
@@ -356,7 +357,9 @@ void orb(unsigned* out_feat,
int gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];
gauss_filter.ptr = memAlloc<convAccT>(gauss_elem);
- CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss.get(), gauss_elem * sizeof(convAccT), cudaMemcpyHostToDevice));
+ CUDA_CHECK(cudaMemcpyAsync(gauss_filter.ptr, h_gauss.get(), gauss_elem * sizeof(convAccT),
+ cudaMemcpyHostToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
}
for (int i = 0; i < (int)max_levels; i++) {
diff --git a/src/backend/cuda/kernel/reduce.hpp b/src/backend/cuda/kernel/reduce.hpp
index 89b604e..118ba4e 100644
--- a/src/backend/cuda/kernel/reduce.hpp
+++ b/src/backend/cuda/kernel/reduce.hpp
@@ -414,7 +414,9 @@ namespace kernel
scoped_ptr<To> h_ptr(new To[tmp_elements]);
To* h_ptr_raw = h_ptr.get();
- CUDA_CHECK(cudaMemcpy(h_ptr_raw, tmp.ptr, tmp_elements * sizeof(To), cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(h_ptr_raw, tmp.ptr, tmp_elements * sizeof(To),
+ cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
memFree(tmp.ptr);
Binary<To, op> reduce;
@@ -429,7 +431,9 @@ namespace kernel
scoped_ptr<Ti> h_ptr(new Ti[in_elements]);
Ti* h_ptr_raw = h_ptr.get();
- CUDA_CHECK(cudaMemcpy(h_ptr_raw, in.ptr, in_elements * sizeof(Ti), cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(h_ptr_raw, in.ptr, in_elements * sizeof(Ti),
+ cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
Transform<Ti, To, op> transform;
Binary<To, op> reduce;
diff --git a/src/backend/cuda/kernel/regions.hpp b/src/backend/cuda/kernel/regions.hpp
index 27f1029..87fa78c 100644
--- a/src/backend/cuda/kernel/regions.hpp
+++ b/src/backend/cuda/kernel/regions.hpp
@@ -419,15 +419,18 @@ void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)
while (h_continue) {
h_continue = 0;
- CUDA_CHECK(cudaMemcpyToSymbol(continue_flag, &h_continue, sizeof(int),
- 0, cudaMemcpyHostToDevice));
+ CUDA_CHECK(cudaMemcpyToSymbolAsync(continue_flag, &h_continue, sizeof(int),
+ 0, cudaMemcpyHostToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
CUDA_LAUNCH((update_equiv<T, 16, n_per_thread, full_conn>), blocks, threads, out, tex);
POST_LAUNCH_CHECK();
- CUDA_CHECK(cudaMemcpyFromSymbol(&h_continue, continue_flag, sizeof(int),
- 0, cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyFromSymbolAsync(&h_continue, continue_flag, sizeof(int),
+ 0, cudaMemcpyDeviceToHost,
+ cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
}
// Now, perform the final relabeling. This converts the equivalency
diff --git a/src/backend/cuda/kernel/sift_nonfree.hpp b/src/backend/cuda/kernel/sift_nonfree.hpp
index e94aeb1..bcc8ac0 100644
--- a/src/backend/cuda/kernel/sift_nonfree.hpp
+++ b/src/backend/cuda/kernel/sift_nonfree.hpp
@@ -191,7 +191,9 @@ Param<T> gauss_filter(float sigma)
dim_t gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];
gauss_filter.ptr = memAlloc<T>(gauss_elem);
- CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(T), cudaMemcpyHostToDevice));
+ CUDA_CHECK(cudaMemcpyAsync(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(T),
+ cudaMemcpyHostToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
delete[] h_gauss;
@@ -1237,7 +1239,9 @@ std::vector< Param<T> > buildGaussPyr(
const unsigned imel = tmp_pyr[idx].dims[3] * tmp_pyr[idx].strides[3];
const unsigned offset = imel * l;
- CUDA_CHECK(cudaMemcpy(gauss_pyr[o].ptr + offset, tmp_pyr[idx].ptr, imel * sizeof(T), cudaMemcpyDeviceToDevice));
+ CUDA_CHECK(cudaMemcpyAsync(gauss_pyr[o].ptr + offset, tmp_pyr[idx].ptr,
+ imel * sizeof(T), cudaMemcpyDeviceToDevice,
+ cuda::getStream(cuda::getActiveDeviceId())));
}
}
@@ -1378,7 +1382,9 @@ void sift(unsigned* out_feat,
POST_LAUNCH_CHECK();
unsigned extrema_feat = 0;
- CUDA_CHECK(cudaMemcpy(&extrema_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(&extrema_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost,
+ cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
extrema_feat = min(extrema_feat, max_feat);
if (extrema_feat == 0) {
@@ -1415,7 +1421,9 @@ void sift(unsigned* out_feat,
memFree(d_extrema_y);
memFree(d_extrema_layer);
- CUDA_CHECK(cudaMemcpy(&interp_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(&interp_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost,
+ cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
interp_feat = min(interp_feat, max_feat);
CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(unsigned),
@@ -1475,7 +1483,9 @@ void sift(unsigned* out_feat,
memFree(d_interp_size);
unsigned nodup_feat = 0;
- CUDA_CHECK(cudaMemcpy(&nodup_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(&nodup_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost,
+ cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(unsigned),
cuda::getStream(cuda::getActiveDeviceId())));
@@ -1507,7 +1517,9 @@ void sift(unsigned* out_feat,
memFree(d_nodup_size);
unsigned oriented_feat = 0;
- CUDA_CHECK(cudaMemcpy(&oriented_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(&oriented_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost,
+ cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
oriented_feat = min(oriented_feat, max_oriented_feat);
if (oriented_feat == 0) {
@@ -1580,14 +1592,20 @@ void sift(unsigned* out_feat,
if (feat_pyr[i] == 0)
continue;
- CUDA_CHECK(cudaMemcpy(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*d_score+offset, d_response_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-
- CUDA_CHECK(cudaMemcpy(*d_desc+(offset*desc_len), d_desc_pyr[i],
- feat_pyr[i] * desc_len * sizeof(float), cudaMemcpyDeviceToDevice));
+ CUDA_CHECK(cudaMemcpyAsync(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaMemcpyAsync(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaMemcpyAsync(*d_score+offset, d_response_pyr[i], feat_pyr[i] * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaMemcpyAsync(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaMemcpyAsync(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
+
+ CUDA_CHECK(cudaMemcpyAsync(*d_desc+(offset*desc_len), d_desc_pyr[i],
+ feat_pyr[i] * desc_len * sizeof(float),
+ cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));
memFree(d_x_pyr[i]);
memFree(d_y_pyr[i]);
diff --git a/src/backend/cuda/kernel/susan.hpp b/src/backend/cuda/kernel/susan.hpp
index fdbd88a..30b40ba 100644
--- a/src/backend/cuda/kernel/susan.hpp
+++ b/src/backend/cuda/kernel/susan.hpp
@@ -171,7 +171,9 @@ void nonMaximal(float* x_out, float* y_out, float* resp_out,
POST_LAUNCH_CHECK();
- CUDA_CHECK(cudaMemcpy(count, d_corners_found, sizeof(unsigned), cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(count, d_corners_found, sizeof(unsigned),
+ cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
memFree(d_corners_found);
}
diff --git a/src/backend/cuda/kernel/where.hpp b/src/backend/cuda/kernel/where.hpp
index fb2fd1d..746e2b8 100644
--- a/src/backend/cuda/kernel/where.hpp
+++ b/src/backend/cuda/kernel/where.hpp
@@ -117,8 +117,10 @@ namespace kernel
// Get output size and allocate output
uint total;
- CUDA_CHECK(cudaMemcpy(&total, rtmp.ptr + rtmp_elements - 1,
- sizeof(uint), cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaMemcpyAsync(&total, rtmp.ptr + rtmp_elements - 1,
+ sizeof(uint), cudaMemcpyDeviceToHost,
+ cuda::getStream(cuda::getActiveDeviceId())));
+ CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
out.ptr = memAlloc<uint>(total);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list