[arrayfire] 294/408: Moving CUDA SIFT syncthreads calls out of thread conditionals
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:17 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid in repository arrayfire.
commit ee4d0bd77d7060f96dda813480d147409331c7ef
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date: Wed Aug 19 17:03:35 2015 -0400
Moving CUDA SIFT syncthreads calls out of thread conditionals
---
src/backend/cuda/kernel/sift.hpp | 48 +++++++++++++++++++++-------------------
1 file changed, 25 insertions(+), 23 deletions(-)
diff --git a/src/backend/cuda/kernel/sift.hpp b/src/backend/cuda/kernel/sift.hpp
index 8f75fa7..c8de759 100644
--- a/src/backend/cuda/kernel/sift.hpp
+++ b/src/backend/cuda/kernel/sift.hpp
@@ -760,6 +760,12 @@ __global__ void computeDescriptor(
float* desc = shrdMem;
float* accum = shrdMem + desc_len * histsz;
+ const int histlen = (d)*(d)*(n);
+
+ for (int i = tid_x; i < histlen*histsz; i += bsz_x)
+ desc[tid_y*histlen+i] = 0.f;
+ __syncthreads();
+
if (f < total_feat) {
const unsigned layer = layer_in[f];
float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
@@ -783,13 +789,8 @@ __global__ void computeDescriptor(
int radius = hist_width * sqrtf(2.f) * (d + 1.f) * 0.5f + 0.5f;
int len = radius*2+1;
- const int histlen = (d)*(d)*(n);
const int hist_off = (tid_x % histsz) * desc_len;
- for (int i = tid_x; i < histlen*histsz; i += bsz_x)
- desc[tid_y*histlen+i] = 0.f;
- __syncthreads();
-
// Calculate orientation histogram
for (int l = tid_x; l < len*len; l += bsz_x) {
int i = l / len - radius;
@@ -845,31 +846,32 @@ __global__ void computeDescriptor(
}
}
}
- __syncthreads();
+ }
+ __syncthreads();
- // Combine histograms (reduces previous atomicAdd overhead)
- for (int l = tid_x; l < desc_len*4; l += bsz_x)
- desc[l] += desc[l+4*desc_len];
- __syncthreads();
- for (int l = tid_x; l < desc_len*2; l += bsz_x)
- desc[l ] += desc[l+2*desc_len];
- __syncthreads();
- for (int l = tid_x; l < desc_len; l += bsz_x)
- desc[l] += desc[l+desc_len];
- __syncthreads();
+ // Combine histograms (reduces previous atomicAdd overhead)
+ for (int l = tid_x; l < desc_len*4; l += bsz_x)
+ desc[l] += desc[l+4*desc_len];
+ __syncthreads();
+ for (int l = tid_x; l < desc_len*2; l += bsz_x)
+ desc[l ] += desc[l+2*desc_len];
+ __syncthreads();
+ for (int l = tid_x; l < desc_len; l += bsz_x)
+ desc[l] += desc[l+desc_len];
+ __syncthreads();
- normalizeDesc(desc, accum, histlen);
+ normalizeDesc(desc, accum, histlen);
- for (int i = tid_x; i < d*d*n; i += bsz_x)
- desc[tid_y*desc_len+i] = min(desc[tid_y*desc_len+i], DESC_MAG_THR);
- __syncthreads();
+ for (int i = tid_x; i < d*d*n; i += bsz_x)
+ desc[tid_y*desc_len+i] = min(desc[tid_y*desc_len+i], DESC_MAG_THR);
+ __syncthreads();
- normalizeDesc(desc, accum, histlen);
+ normalizeDesc(desc, accum, histlen);
+ if (f < total_feat) {
// Calculate final descriptor values
- for (int k = tid_x; k < d*d*n; k += bsz_x) {
+ for (int k = tid_x; k < d*d*n; k += bsz_x)
desc_out[f*desc_len+k] = round(min(255.f, desc[tid_y*desc_len+k] * INT_DESCR_FCTR));
- }
}
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits mailing list