[arrayfire] 259/408: Added missing CUDA_LAUNCH and THRUST_SELECT to SIFT
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:10 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit a5e8a5e9c9ce6f9529464bc857a4e2dbfeb75887
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date: Thu Aug 13 18:06:07 2015 -0400
Added missing CUDA_LAUNCH and THRUST_SELECT to SIFT
---
src/backend/cuda/kernel/sift.hpp | 59 +++++++++++++++++++++-------------------
1 file changed, 31 insertions(+), 28 deletions(-)
diff --git a/src/backend/cuda/kernel/sift.hpp b/src/backend/cuda/kernel/sift.hpp
index f551a61..7266a15 100644
--- a/src/backend/cuda/kernel/sift.hpp
+++ b/src/backend/cuda/kernel/sift.hpp
@@ -996,7 +996,8 @@ std::vector< Param<T> > buildDoGPyr(
dim3 threads(256);
dim3 blocks(divup(nel, threads.x));
- sub<T><<<blocks, threads>>>(dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]);
+ CUDA_LAUNCH((sub<T>), blocks, threads,
+ dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]);
POST_LAUNCH_CHECK();
}
}
@@ -1011,10 +1012,10 @@ void update_permutation(thrust::device_ptr<T>& keys, thrust::device_vector<int>&
thrust::device_vector<T> temp(permutation.size());
// permute the keys with the current reordering
- thrust::gather(permutation.begin(), permutation.end(), keys, temp.begin());
+ THRUST_SELECT((thrust::gather), permutation.begin(), permutation.end(), keys, temp.begin());
// stable_sort the permuted keys and update the permutation
- thrust::stable_sort_by_key(temp.begin(), temp.end(), permutation.begin());
+ THRUST_SELECT((thrust::stable_sort_by_key), temp.begin(), temp.end(), permutation.begin());
}
template <typename T>
@@ -1024,7 +1025,7 @@ void apply_permutation(thrust::device_ptr<T>& keys, thrust::device_vector<int>&
thrust::device_vector<T> temp(keys, keys+permutation.size());
// permute the keys
- thrust::gather(permutation.begin(), permutation.end(), temp.begin(), keys);
+ THRUST_SELECT((thrust::gather), permutation.begin(), permutation.end(), temp.begin(), keys);
}
template<typename T, typename convAccT>
@@ -1097,9 +1098,10 @@ void sift(unsigned* out_feat,
dim3 blocks(divup(dim0-2*IMG_BORDER, threads.x), divup(dim1-2*IMG_BORDER, threads.y));
float extrema_thr = 0.5f * contrast_thr / n_layers;
- detectExtrema<T><<<blocks, threads>>>(d_extrema_x, d_extrema_y, d_extrema_layer, d_count,
- CParam<T>(dog_pyr[prev]), CParam<T>(dog_pyr[center]), CParam<T>(dog_pyr[next]),
- layer, max_feat, extrema_thr);
+ CUDA_LAUNCH((detectExtrema<T>), blocks, threads,
+ d_extrema_x, d_extrema_y, d_extrema_layer, d_count,
+ CParam<T>(dog_pyr[prev]), CParam<T>(dog_pyr[center]), CParam<T>(dog_pyr[next]),
+ layer, max_feat, extrema_thr);
POST_LAUNCH_CHECK();
}
@@ -1132,12 +1134,12 @@ void sift(unsigned* out_feat,
CUDA_CHECK(cudaMalloc((void **)&dog_octave, (n_layers+2)*sizeof(Param<T>)));
CUDA_CHECK(cudaMemcpy(dog_octave, &dog_pyr[i*(n_layers+2)], (n_layers+2)*sizeof(Param<T>), cudaMemcpyHostToDevice));
- interpolateExtrema<T><<<blocks, threads>>>(d_interp_x, d_interp_y, d_interp_layer,
- d_interp_response, d_interp_size, d_count,
-
- d_extrema_x, d_extrema_y, d_extrema_layer, extrema_feat,
- dog_octave, max_feat, i, n_layers,
- contrast_thr, edge_thr, init_sigma, img_scale);
+ CUDA_LAUNCH((interpolateExtrema<T>), blocks, threads,
+ d_interp_x, d_interp_y, d_interp_layer,
+ d_interp_response, d_interp_size, d_count,
+ d_extrema_x, d_extrema_y, d_extrema_layer, extrema_feat,
+ dog_octave, max_feat, i, n_layers,
+ contrast_thr, edge_thr, init_sigma, img_scale);
POST_LAUNCH_CHECK();
CUDA_CHECK(cudaFree(dog_octave));
@@ -1191,10 +1193,11 @@ void sift(unsigned* out_feat,
threads = dim3(256, 1);
blocks = dim3(divup(interp_feat, threads.x), 1);
- removeDuplicates<<<blocks, threads>>>(d_nodup_x, d_nodup_y, d_nodup_layer,
- d_nodup_response, d_nodup_size, d_count,
- d_interp_x, d_interp_y, d_interp_layer,
- d_interp_response, d_interp_size, interp_feat);
+ CUDA_LAUNCH((removeDuplicates), blocks, threads,
+ d_nodup_x, d_nodup_y, d_nodup_layer,
+ d_nodup_response, d_nodup_size, d_count,
+ d_interp_x, d_interp_y, d_interp_layer,
+ d_interp_response, d_interp_size, interp_feat);
POST_LAUNCH_CHECK();
memFree(d_interp_x);
@@ -1223,11 +1226,12 @@ void sift(unsigned* out_feat,
threads = dim3(8, 32);
blocks = dim3(divup(nodup_feat, threads.x), 1);
- calcOrientation<T><<<blocks, threads>>>(d_oriented_x, d_oriented_y, d_oriented_layer,
- d_oriented_response, d_oriented_size, d_oriented_ori, d_count,
- d_nodup_x, d_nodup_y, d_nodup_layer,
- d_nodup_response, d_nodup_size, nodup_feat,
- gauss_octave, max_oriented_feat, i, double_input);
+ CUDA_LAUNCH((calcOrientation<T>), blocks, threads,
+ d_oriented_x, d_oriented_y, d_oriented_layer,
+ d_oriented_response, d_oriented_size, d_oriented_ori, d_count,
+ d_nodup_x, d_nodup_y, d_nodup_layer,
+ d_nodup_response, d_nodup_size, nodup_feat,
+ gauss_octave, max_oriented_feat, i, double_input);
POST_LAUNCH_CHECK();
memFree(d_nodup_x);
@@ -1256,15 +1260,14 @@ void sift(unsigned* out_feat,
float scale = 1.f/(1 << i);
if (double_input) scale *= 2.f;
- //threads = dim3(8, 32);
threads = dim3(1, 256);
- //threads = dim3(1, 256);
blocks = dim3(divup(oriented_feat, threads.x), 1);
- computeDescriptor<<<blocks, threads>>>(d_desc, desc_len,
- d_oriented_x, d_oriented_y, d_oriented_layer,
- d_oriented_response, d_oriented_size, d_oriented_ori,
- oriented_feat, gauss_octave, d, n, scale, init_sigma, n_layers);
+ CUDA_LAUNCH((computeDescriptor), blocks, threads,
+ d_desc, desc_len,
+ d_oriented_x, d_oriented_y, d_oriented_layer,
+ d_oriented_response, d_oriented_size, d_oriented_ori,
+ oriented_feat, gauss_octave, d, n, scale, init_sigma, n_layers);
POST_LAUNCH_CHECK();
total_feat += oriented_feat;
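For reference, the pattern behind the two wrappers looks roughly like the
sketch below. This is a minimal, self-contained illustration written under
the assumption that the wrappers simply forward to a triple-chevron kernel
launch and to a Thrust call with an explicit execution policy; it is not the
actual ArrayFire definition (the real CUDA_LAUNCH and THRUST_SELECT in the
CUDA backend also handle dispatching work to the stream of the active
device, which is omitted here). All names in the sketch (MY_CUDA_LAUNCH,
MY_THRUST_SELECT, scaleKernel) are hypothetical.

// Hypothetical sketch of the wrapper pattern; not the ArrayFire macros.
#include <cstdio>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>

// Launch wrapper: forwards grid/block dims and the kernel arguments to the
// triple-chevron launch (default stream and zero shared memory used here).
#define MY_CUDA_LAUNCH(fn, blks, thrds, ...) \
    fn<<<(blks), (thrds), 0, 0>>>(__VA_ARGS__)

// Thrust wrapper: forwards the algorithm call with an explicit execution
// policy so all Thrust work runs where the caller expects it.
#define MY_THRUST_SELECT(fn, ...) \
    fn(thrust::device, __VA_ARGS__)

__global__ void scaleKernel(float* data, int n, float s)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= s;
}

int main()
{
    const int n = 1024;
    thrust::device_vector<float> v(n, 2.0f);

    dim3 threads(256);
    dim3 blocks((n + threads.x - 1) / threads.x);

    // Same call shape as in the patch: macro, launch config, then the
    // kernel arguments.
    MY_CUDA_LAUNCH(scaleKernel, blocks, threads,
                   thrust::raw_pointer_cast(v.data()), n, 3.0f);
    cudaDeviceSynchronize();

    // Same call shape for the Thrust side: algorithm first, then its
    // iterator arguments.
    MY_THRUST_SELECT(thrust::sort, v.begin(), v.end());

    printf("first element: %f\n", (float)v[0]);
    return 0;
}

The extra parentheses around the callables in the patch, e.g.
CUDA_LAUNCH((sub<T>), ...) and THRUST_SELECT((thrust::gather), ...), are a
common safeguard so that names whose template argument lists contain commas
are not split into separate macro arguments by the preprocessor.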