[arrayfire] 279/408: Moved syncthreads/barriers out of thread conditionals

Mon Sep 21 19:12:13 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit faefa30c3a0b9b386919e40151dc966b42110ef5
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date:   Tue Aug 18 14:54:32 2015 -0400

    Moved syncthreads/barriers out of thread conditionals
---
 src/backend/cuda/kernel/orb.hpp   | 40 ++++++++++++++++----------------
 src/backend/opencl/kernel/sift.cl | 48 ++++++++++++++++++++-------------------
 2 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp
index 38c6eaa..4935405 100644
--- a/src/backend/cuda/kernel/orb.hpp
+++ b/src/backend/cuda/kernel/orb.hpp
@@ -115,6 +115,9 @@ __global__ void harris_response(
 {
     unsigned f = blockDim.x * blockIdx.x + threadIdx.x;
 
+    float ixx = 0.f, iyy = 0.f, ixy = 0.f;
+    float size = 0.f;
+
     if (f < total_feat) {
         unsigned x, y;
         float scl = 1.f;
@@ -130,7 +133,7 @@ __global__ void harris_response(
         }
 
         // Round feature size to nearest odd integer
-        float size = 2.f * floor((patch_size * scl) / 2.f) + 1.f;
+        size = 2.f * floor((patch_size * scl) / 2.f) + 1.f;
 
         // Avoid keeping features that might be too wide and might not fit on
         // the image, sqrt(2.f) is the radius when angle is 45 degrees and
@@ -141,7 +144,6 @@ __global__ void harris_response(
 
         unsigned r = block_size / 2;
 
-        float ixx = 0.f, iyy = 0.f, ixy = 0.f;
         unsigned block_size_sq = block_size * block_size;
         for (unsigned k = threadIdx.y; k < block_size_sq; k += blockDim.y) {
             int i = k / block_size - r;
@@ -156,28 +158,28 @@ __global__ void harris_response(
             iyy += iy*iy;
             ixy += ix*iy;
         }
-        __syncthreads();
+    }
+    __syncthreads();
 
-        ixx = block_reduce_sum(ixx);
-        iyy = block_reduce_sum(iyy);
-        ixy = block_reduce_sum(ixy);
+    ixx = block_reduce_sum(ixx);
+    iyy = block_reduce_sum(iyy);
+    ixy = block_reduce_sum(ixy);
 
-        if (threadIdx.y == 0) {
-            float tr = ixx + iyy;
-            float det = ixx*iyy - ixy*ixy;
+    if (f < total_feat && threadIdx.y == 0) {
+        float tr = ixx + iyy;
+        float det = ixx*iyy - ixy*ixy;
 
-            // Calculate Harris responses
-            float resp = det - k_thr * (tr*tr);
+        // Calculate Harris responses
+        float resp = det - k_thr * (tr*tr);
 
-            // Scale factor
-            // TODO: improve response scaling
-            float rscale = 0.001f;
-            rscale = rscale * rscale * rscale * rscale;
+        // Scale factor
+        // TODO: improve response scaling
+        float rscale = 0.001f;
+        rscale = rscale * rscale * rscale * rscale;
 
-            score_out[f] = resp * rscale;
-            if (use_scl)
-                size_out[f] = size;
-        }
+        score_out[f] = resp * rscale;
+        if (use_scl)
+            size_out[f] = size;
     }
 }
 
diff --git a/src/backend/opencl/kernel/sift.cl b/src/backend/opencl/kernel/sift.cl
index 57610df..ef63cab 100644
--- a/src/backend/opencl/kernel/sift.cl
+++ b/src/backend/opencl/kernel/sift.cl
@@ -690,6 +690,12 @@ __kernel void computeDescriptor(
     __local float* desc = l_mem;
     __local float* accum = l_mem + desc_len * histsz;
 
+    const int histlen = d*d*n;
+
+    for (int i = lid_x; i < histlen*histsz; i += lsz_x)
+        desc[lid_y*histlen+i] = 0.f;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
     if (f < total_feat) {
         const unsigned layer = layer_in[f];
         float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
@@ -711,13 +717,8 @@ __kernel void computeDescriptor(
         int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
 
         int len = radius*2+1;
-        const int histlen = d*d*n;
         const int hist_off = (lid_x % histsz) * desc_len;
 
-        for (int i = lid_x; i < histlen*histsz; i += lsz_x)
-            desc[lid_y*histlen+i] = 0.f;
-        barrier(CLK_LOCAL_MEM_FENCE);
-
         // Calculate orientation histogram
         for (int l = lid_x; l < len*len; l += lsz_x) {
             int i = l / len - radius;
@@ -773,31 +774,32 @@ __kernel void computeDescriptor(
                 }
             }
         }
-        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
 
-        // Combine histograms (reduces previous atomicAdd overhead)
-        for (int l = lid_x; l < desc_len*4; l += lsz_x)
-            desc[l] += desc[l+4*desc_len];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        for (int l = lid_x; l < desc_len*2; l += lsz_x)
-            desc[l    ] += desc[l+2*desc_len];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        for (int l = lid_x; l < desc_len; l += lsz_x)
-            desc[l] += desc[l+desc_len];
-        barrier(CLK_LOCAL_MEM_FENCE);
+    // Combine histograms (reduces previous atomicAdd overhead)
+    for (int l = lid_x; l < desc_len*4; l += lsz_x)
+        desc[l] += desc[l+4*desc_len];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int l = lid_x; l < desc_len*2; l += lsz_x)
+        desc[l    ] += desc[l+2*desc_len];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int l = lid_x; l < desc_len; l += lsz_x)
+        desc[l] += desc[l+desc_len];
+    barrier(CLK_LOCAL_MEM_FENCE);
 
-        normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
+    normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
 
-        for (int i = lid_x; i < d*d*n; i += lsz_x)
-            desc[lid_y*desc_len+i] = min(desc[lid_y*desc_len+i], DESCR_MAG_THR);
-        barrier(CLK_LOCAL_MEM_FENCE);
+    for (int i = lid_x; i < d*d*n; i += lsz_x)
+        desc[lid_y*desc_len+i] = min(desc[lid_y*desc_len+i], DESCR_MAG_THR);
+    barrier(CLK_LOCAL_MEM_FENCE);
 
-        normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
+    normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
 
+    if (f < total_feat) {
         // Calculate final descriptor values
-        for (int k = lid_x; k < d*d*n; k += lsz_x) {
+        for (int k = lid_x; k < d*d*n; k += lsz_x)
             desc_out[f*desc_len+k] = round(min(255.f, desc[lid_y*desc_len+k] * INT_DESCR_FCTR));
-        }
     }
 }
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git