[arrayfire] 294/408: Moving CUDA SIFT syncthreads calls out of thread conditionals

Mon Sep 21 19:12:17 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit ee4d0bd77d7060f96dda813480d147409331c7ef
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date:   Wed Aug 19 17:03:35 2015 -0400

    Moving CUDA SIFT syncthreads calls out of thread conditionals
---
 src/backend/cuda/kernel/sift.hpp | 48 +++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/src/backend/cuda/kernel/sift.hpp b/src/backend/cuda/kernel/sift.hpp
index 8f75fa7..c8de759 100644
--- a/src/backend/cuda/kernel/sift.hpp
+++ b/src/backend/cuda/kernel/sift.hpp
@@ -760,6 +760,12 @@ __global__ void computeDescriptor(
     float* desc = shrdMem;
     float* accum = shrdMem + desc_len * histsz;
 
+    const int histlen = (d)*(d)*(n);
+
+    for (int i = tid_x; i < histlen*histsz; i += bsz_x)
+        desc[tid_y*histlen+i] = 0.f;
+    __syncthreads();
+
     if (f < total_feat) {
         const unsigned layer = layer_in[f];
         float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
@@ -783,13 +789,8 @@ __global__ void computeDescriptor(
         int radius = hist_width * sqrtf(2.f) * (d + 1.f) * 0.5f + 0.5f;
 
         int len = radius*2+1;
-        const int histlen = (d)*(d)*(n);
         const int hist_off = (tid_x % histsz) * desc_len;
 
-        for (int i = tid_x; i < histlen*histsz; i += bsz_x)
-            desc[tid_y*histlen+i] = 0.f;
-        __syncthreads();
-
         // Calculate orientation histogram
         for (int l = tid_x; l < len*len; l += bsz_x) {
             int i = l / len - radius;
@@ -845,31 +846,32 @@ __global__ void computeDescriptor(
                 }
             }
         }
-        __syncthreads();
+    }
+    __syncthreads();
 
-        // Combine histograms (reduces previous atomicAdd overhead)
-        for (int l = tid_x; l < desc_len*4; l += bsz_x)
-            desc[l] += desc[l+4*desc_len];
-        __syncthreads();
-        for (int l = tid_x; l < desc_len*2; l += bsz_x)
-            desc[l    ] += desc[l+2*desc_len];
-        __syncthreads();
-        for (int l = tid_x; l < desc_len; l += bsz_x)
-            desc[l] += desc[l+desc_len];
-        __syncthreads();
+    // Combine histograms (reduces previous atomicAdd overhead)
+    for (int l = tid_x; l < desc_len*4; l += bsz_x)
+        desc[l] += desc[l+4*desc_len];
+    __syncthreads();
+    for (int l = tid_x; l < desc_len*2; l += bsz_x)
+        desc[l    ] += desc[l+2*desc_len];
+    __syncthreads();
+    for (int l = tid_x; l < desc_len; l += bsz_x)
+        desc[l] += desc[l+desc_len];
+    __syncthreads();
 
-        normalizeDesc(desc, accum, histlen);
+    normalizeDesc(desc, accum, histlen);
 
-        for (int i = tid_x; i < d*d*n; i += bsz_x)
-            desc[tid_y*desc_len+i] = min(desc[tid_y*desc_len+i], DESC_MAG_THR);
-        __syncthreads();
+    for (int i = tid_x; i < d*d*n; i += bsz_x)
+        desc[tid_y*desc_len+i] = min(desc[tid_y*desc_len+i], DESC_MAG_THR);
+    __syncthreads();
 
-        normalizeDesc(desc, accum, histlen);
+    normalizeDesc(desc, accum, histlen);
 
+    if (f < total_feat) {
         // Calculate final descriptor values
-        for (int k = tid_x; k < d*d*n; k += bsz_x) {
+        for (int k = tid_x; k < d*d*n; k += bsz_x)
             desc_out[f*desc_len+k] = round(min(255.f, desc[tid_y*desc_len+k] * INT_DESCR_FCTR));
-        }
     }
 }
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git