[arrayfire] 265/408: Improved SIFT OpenCL code

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:11 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit b5cf8d504ebbfbc43e17cc6c24b4490918d10ac0
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date:   Fri Aug 14 14:24:10 2015 -0400

    Improved SIFT OpenCL code
    
    * Using existant conv2Helper function from OpenCL backend
    * Passing local memory size dynamically
    * Using pre-defined constants for workgroup sizes instead of hardcoded values
---
 src/backend/opencl/kernel/sift.cl  | 213 ++++++++++++++++++-------------------
 src/backend/opencl/kernel/sift.hpp | 120 +++++++--------------
 2 files changed, 143 insertions(+), 190 deletions(-)

diff --git a/src/backend/opencl/kernel/sift.cl b/src/backend/opencl/kernel/sift.cl
index a465b17..b93a8c3 100644
--- a/src/backend/opencl/kernel/sift.cl
+++ b/src/backend/opencl/kernel/sift.cl
@@ -149,46 +149,46 @@ inline void normalizeDesc(
     __local float* desc,
     __local float* accum,
     const int histlen,
-    int tid_x,
-    int tid_y,
-    int bsz_x)
+    int lid_x,
+    int lid_y,
+    int lsz_x)
 {
-    for (int i = tid_x; i < histlen; i += bsz_x)
-        accum[i] = desc[tid_y*histlen+i]*desc[tid_y*histlen+i];
+    for (int i = lid_x; i < histlen; i += lsz_x)
+        accum[i] = desc[lid_y*histlen+i]*desc[lid_y*histlen+i];
     barrier(CLK_LOCAL_MEM_FENCE);
 
     float sum = 0.0f;
     for (int i = 0; i < histlen; i++)
-        sum += desc[tid_y*histlen+i]*desc[tid_y*histlen+i];
+        sum += desc[lid_y*histlen+i]*desc[lid_y*histlen+i];
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (tid_x < 64)
-        accum[tid_x] += accum[tid_x+64];
+    if (lid_x < 64)
+        accum[lid_x] += accum[lid_x+64];
     barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid_x < 32)
-        accum[tid_x] += accum[tid_x+32];
+    if (lid_x < 32)
+        accum[lid_x] += accum[lid_x+32];
     barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid_x < 16)
-        accum[tid_x] += accum[tid_x+16];
+    if (lid_x < 16)
+        accum[lid_x] += accum[lid_x+16];
     barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid_x < 8)
-        accum[tid_x] += accum[tid_x+8];
+    if (lid_x < 8)
+        accum[lid_x] += accum[lid_x+8];
     barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid_x < 4)
-        accum[tid_x] += accum[tid_x+4];
+    if (lid_x < 4)
+        accum[lid_x] += accum[lid_x+4];
     barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid_x < 2)
-        accum[tid_x] += accum[tid_x+2];
+    if (lid_x < 2)
+        accum[lid_x] += accum[lid_x+2];
     barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid_x < 1)
-        accum[tid_x] += accum[tid_x+1];
+    if (lid_x < 1)
+        accum[lid_x] += accum[lid_x+1];
     barrier(CLK_LOCAL_MEM_FENCE);
 
     float len_sq = accum[0];
     float len_inv = 1.0f / sqrt(len_sq);
 
-    for (int i = tid_x; i < histlen; i += bsz_x) {
-        desc[tid_y*histlen+i] *= len_inv;
+    for (int i = lid_x; i < histlen; i += lsz_x) {
+        desc[lid_y*histlen+i] *= len_inv;
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 }
@@ -219,18 +219,9 @@ __kernel void detectExtrema(
     __global const T* dog,
     KParam iDoG,
     const unsigned max_feat,
-    const float threshold)
+    const float threshold,
+    __local float* l_mem)
 {
-    // One pixel border for each side
-    const int l_i = 32+2;
-    const int l_j = 8+2;
-
-    __local float l_mem[(32+2)*(8+2)*3];
-    __local float* l_prev   = l_mem;
-    __local float* l_center = l_mem + (32+2)*(8+2);
-    __local float* l_next   = l_mem + (32+2)*(8+2)*2;
-    __local float* l_tmp;
-
     const int dim0 = iDoG.dims[0];
     const int dim1 = iDoG.dims[1];
     const int imel = iDoG.dims[0]*iDoG.dims[1];
@@ -242,6 +233,14 @@ __kernel void detectExtrema(
     const int i = get_group_id(0) * lsz_i + lid_i+IMG_BORDER;
     const int j = get_group_id(1) * lsz_j + lid_j+IMG_BORDER;
 
+    // One pixel border for each side
+    const int l_i = lsz_i+2;
+    const int l_j = lsz_j+2;
+
+    __local float* l_prev   = l_mem;
+    __local float* l_center = l_mem + l_i * l_j;
+    __local float* l_next   = l_mem + l_i * l_j * 2;
+
     const int x = lid_i+1;
     const int y = lid_j+1;
 
@@ -249,21 +248,21 @@ __kernel void detectExtrema(
         const int l_i_half = l_i/2;
         const int l_j_half = l_j/2;
         if (lid_i < l_i_half && lid_j < l_j_half && i < dim0-IMG_BORDER+1 && j < dim1-IMG_BORDER+1) {
-                l_next  [lid_j*l_i + lid_i] = dog[(l+1)*imel+(j-1)*dim0+i-1];
-                l_center[lid_j*l_i + lid_i] = dog[(l  )*imel+(j-1)*dim0+i-1];
-                l_prev  [lid_j*l_i + lid_i] = dog[(l-1)*imel+(j-1)*dim0+i-1];
+                l_next  [lid_j*l_i + lid_i] = (float)dog[(l+1)*imel+(j-1)*dim0+i-1];
+                l_center[lid_j*l_i + lid_i] = (float)dog[(l  )*imel+(j-1)*dim0+i-1];
+                l_prev  [lid_j*l_i + lid_i] = (float)dog[(l-1)*imel+(j-1)*dim0+i-1];
 
-                l_next  [lid_j*l_i + lid_i+l_i_half] = dog[(l+1)*imel+(j-1)*dim0+i-1+l_i_half];
-                l_center[lid_j*l_i + lid_i+l_i_half] = dog[(l  )*imel+(j-1)*dim0+i-1+l_i_half];
-                l_prev  [lid_j*l_i + lid_i+l_i_half] = dog[(l-1)*imel+(j-1)*dim0+i-1+l_i_half];
+                l_next  [lid_j*l_i + lid_i+l_i_half] = (float)dog[(l+1)*imel+(j-1)*dim0+i-1+l_i_half];
+                l_center[lid_j*l_i + lid_i+l_i_half] = (float)dog[(l  )*imel+(j-1)*dim0+i-1+l_i_half];
+                l_prev  [lid_j*l_i + lid_i+l_i_half] = (float)dog[(l-1)*imel+(j-1)*dim0+i-1+l_i_half];
 
-                l_next  [(lid_j+l_j_half)*l_i + lid_i] = dog[(l+1)*imel+(j-1+l_j_half)*dim0+i-1];
-                l_center[(lid_j+l_j_half)*l_i + lid_i] = dog[(l  )*imel+(j-1+l_j_half)*dim0+i-1];
-                l_prev  [(lid_j+l_j_half)*l_i + lid_i] = dog[(l-1)*imel+(j-1+l_j_half)*dim0+i-1];
+                l_next  [(lid_j+l_j_half)*l_i + lid_i] = (float)dog[(l+1)*imel+(j-1+l_j_half)*dim0+i-1];
+                l_center[(lid_j+l_j_half)*l_i + lid_i] = (float)dog[(l  )*imel+(j-1+l_j_half)*dim0+i-1];
+                l_prev  [(lid_j+l_j_half)*l_i + lid_i] = (float)dog[(l-1)*imel+(j-1+l_j_half)*dim0+i-1];
 
-                l_next  [(lid_j+l_j_half)*l_i + lid_i+l_i_half] = dog[(l+1)*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
-                l_center[(lid_j+l_j_half)*l_i + lid_i+l_i_half] = dog[(l  )*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
-                l_prev  [(lid_j+l_j_half)*l_i + lid_i+l_i_half] = dog[(l-1)*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
+                l_next  [(lid_j+l_j_half)*l_i + lid_i+l_i_half] = (float)dog[(l+1)*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
+                l_center[(lid_j+l_j_half)*l_i + lid_i+l_i_half] = (float)dog[(l  )*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
+                l_prev  [(lid_j+l_j_half)*l_i + lid_i+l_i_half] = (float)dog[(l-1)*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
         }
         barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -508,20 +507,19 @@ __kernel void calcOrientation(
     KParam iGauss,
     const unsigned max_feat,
     const unsigned octave,
-    const int double_input)
+    const int double_input,
+    __local float* l_mem)
 {
-    const int tid_x = get_local_id(0);
-    const int tid_y = get_local_id(1);
-    const int bsz_x = get_local_size(0);
+    const int lid_x = get_local_id(0);
+    const int lid_y = get_local_id(1);
+    const int lsz_x = get_local_size(0);
 
     const unsigned f = get_global_id(1);
 
     const int n = ORI_HIST_BINS;
 
-    const int hdim = ORI_HIST_BINS;
-    const int thdim = ORI_HIST_BINS;
-    __local float hist[ORI_HIST_BINS*8];
-    __local float temphist[ORI_HIST_BINS*8];
+    __local float* hist = l_mem;
+    __local float* temphist = l_mem + n*8;
 
     if (f < total_feat) {
         // Load keypoint information
@@ -542,8 +540,8 @@ __kernel void calcOrientation(
         const float exp_denom = 2.f * sigma * sigma;
 
         // Initialize temporary histogram
-        for (int i = tid_x; i < ORI_HIST_BINS; i += bsz_x) {
-            hist[tid_y*hdim + i] = 0.f;
+        for (int i = lid_x; i < n; i += lsz_x) {
+            hist[lid_y*n + i] = 0.f;
         }
         barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -555,7 +553,7 @@ __kernel void calcOrientation(
         __global const T* img = gauss_octave + layer_offset;
 
         // Calculate orientation histogram
-        for (int l = tid_x; l < len*len; l += bsz_x) {
+        for (int l = lid_x; l < len*len; l += lsz_x) {
             int i = l / len - radius;
             int j = l % len - radius;
 
@@ -576,60 +574,60 @@ __kernel void calcOrientation(
             bin = bin < n ? bin : 0;
             bin = (bin < 0) ? 0 : (bin >= n) ? n-1 : bin;
 
-            fatomic_add(&hist[tid_y*hdim+bin], w*mag);
+            fatomic_add(&hist[lid_y*n+bin], w*mag);
         }
         barrier(CLK_LOCAL_MEM_FENCE);
 
         for (int i = 0; i < SMOOTH_ORI_PASSES; i++) {
-            for (int j = tid_x; j < n; j += bsz_x) {
-                temphist[tid_y*hdim+j] = hist[tid_y*hdim+j];
+            for (int j = lid_x; j < n; j += lsz_x) {
+                temphist[lid_y*n+j] = hist[lid_y*n+j];
             }
             barrier(CLK_LOCAL_MEM_FENCE);
-            for (int j = tid_x; j < n; j += bsz_x) {
-                float prev = (j == 0) ? temphist[tid_y*hdim+n-1] : temphist[tid_y*hdim+j-1];
-                float next = (j+1 == n) ? temphist[tid_y*hdim] : temphist[tid_y*hdim+j+1];
-                hist[tid_y*hdim+j] = 0.25f * prev + 0.5f * temphist[tid_y*hdim+j] + 0.25f * next;
+            for (int j = lid_x; j < n; j += lsz_x) {
+                float prev = (j == 0) ? temphist[lid_y*n+n-1] : temphist[lid_y*n+j-1];
+                float next = (j+1 == n) ? temphist[lid_y*n] : temphist[lid_y*n+j+1];
+                hist[lid_y*n+j] = 0.25f * prev + 0.5f * temphist[lid_y*n+j] + 0.25f * next;
             }
             barrier(CLK_LOCAL_MEM_FENCE);
         }
 
-        for (int i = tid_x; i < n; i += bsz_x)
-            temphist[tid_y*hdim+i] = hist[tid_y*hdim+i];
+        for (int i = lid_x; i < n; i += lsz_x)
+            temphist[lid_y*n+i] = hist[lid_y*n+i];
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        if (tid_x < 16)
-            temphist[tid_y*thdim+tid_x] = fmax(hist[tid_y*hdim+tid_x], hist[tid_y*hdim+tid_x+16]);
+        if (lid_x < 16)
+            temphist[lid_y*n+lid_x] = fmax(hist[lid_y*n+lid_x], hist[lid_y*n+lid_x+16]);
         barrier(CLK_LOCAL_MEM_FENCE);
-        if (tid_x < 8)
-            temphist[tid_y*thdim+tid_x] = fmax(temphist[tid_y*thdim+tid_x], temphist[tid_y*thdim+tid_x+8]);
+        if (lid_x < 8)
+            temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], temphist[lid_y*n+lid_x+8]);
         barrier(CLK_LOCAL_MEM_FENCE);
-        if (tid_x < 4) {
-            temphist[tid_y*thdim+tid_x] = fmax(temphist[tid_y*thdim+tid_x], hist[tid_y*hdim+tid_x+32]);
-            temphist[tid_y*thdim+tid_x] = fmax(temphist[tid_y*thdim+tid_x], temphist[tid_y*thdim+tid_x+4]);
+        if (lid_x < 4) {
+            temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], hist[lid_y*n+lid_x+32]);
+            temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], temphist[lid_y*n+lid_x+4]);
         }
         barrier(CLK_LOCAL_MEM_FENCE);
-        if (tid_x < 2)
-            temphist[tid_y*thdim+tid_x] = fmax(temphist[tid_y*thdim+tid_x], temphist[tid_y*thdim+tid_x+2]);
+        if (lid_x < 2)
+            temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], temphist[lid_y*n+lid_x+2]);
         barrier(CLK_LOCAL_MEM_FENCE);
-        if (tid_x < 1)
-            temphist[tid_y*thdim+tid_x] = fmax(temphist[tid_y*thdim+tid_x], temphist[tid_y*thdim+tid_x+1]);
+        if (lid_x < 1)
+            temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], temphist[lid_y*n+lid_x+1]);
         barrier(CLK_LOCAL_MEM_FENCE);
-        float omax = temphist[tid_y*thdim];
+        float omax = temphist[lid_y*n];
 
         float mag_thr = (float)(omax * ORI_PEAK_RATIO);
         int l, r;
         float bin;
-        for (int j = tid_x; j < n; j+=bsz_x) {
+        for (int j = lid_x; j < n; j+=lsz_x) {
             l = (j == 0) ? n - 1 : j - 1;
             r = (j + 1) % n;
-            if (hist[tid_y*hdim+j] > hist[tid_y*hdim+l] &&
-                hist[tid_y*hdim+j] > hist[tid_y*hdim+r] &&
-                hist[tid_y*hdim+j] >= mag_thr) {
+            if (hist[lid_y*n+j] > hist[lid_y*n+l] &&
+                hist[lid_y*n+j] > hist[lid_y*n+r] &&
+                hist[lid_y*n+j] >= mag_thr) {
                 unsigned idx = atomic_inc(counter);
 
                 if (idx < max_feat) {
-                    float bin = j + 0.5f * (hist[tid_y*hdim+l] - hist[tid_y*hdim+r]) /
-                                (hist[tid_y*hdim+l] - 2.0f*hist[tid_y*hdim+j] + hist[tid_y*hdim+r]);
+                    float bin = j + 0.5f * (hist[lid_y*n+l] - hist[lid_y*n+r]) /
+                                (hist[lid_y*n+l] - 2.0f*hist[lid_y*n+j] + hist[lid_y*n+r]);
                     bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
                     float ori = 360.f - ((360.f/n) * bin);
 
@@ -661,6 +659,7 @@ __kernel void calcOrientation(
 __kernel void computeDescriptor(
     __global float* desc_out,
     const unsigned desc_len,
+    const unsigned histsz,
     __global const float* x_in,
     __global const float* y_in,
     __global const unsigned* layer_in,
@@ -674,17 +673,17 @@ __kernel void computeDescriptor(
     const int n,
     const float scale,
     const float sigma,
-    const int n_layers)
+    const int n_layers,
+    __local float* l_mem)
 {
-    const int tid_x = get_local_id(0);
-    const int tid_y = get_local_id(1);
-    const int bsz_x = get_local_size(0);
+    const int lid_x = get_local_id(0);
+    const int lid_y = get_local_id(1);
+    const int lsz_x = get_local_size(0);
 
     const int f = get_global_id(1);
 
-    const int histsz = 8;
-    __local float desc[128*8];
-    __local float accum[128];
+    __local float* desc = l_mem;
+    __local float* accum = l_mem + desc_len * histsz;
 
     if (f < total_feat) {
         const unsigned layer = layer_in[f];
@@ -708,14 +707,14 @@ __kernel void computeDescriptor(
 
         int len = radius*2+1;
         const int histlen = d*d*n;
-        const int hist_off = (tid_x % histsz) * 128;
+        const int hist_off = (lid_x % histsz) * desc_len;
 
-        for (int i = tid_x; i < histlen*histsz; i += bsz_x)
-            desc[tid_y*histlen+i] = 0.f;
+        for (int i = lid_x; i < histlen*histsz; i += lsz_x)
+            desc[lid_y*histlen+i] = 0.f;
         barrier(CLK_LOCAL_MEM_FENCE);
 
         // Calculate orientation histogram
-        for (int l = tid_x; l < len*len; l += bsz_x) {
+        for (int l = lid_x; l < len*len; l += lsz_x) {
             int i = l / len - radius;
             int j = l % len - radius;
 
@@ -761,7 +760,7 @@ __kernel void computeDescriptor(
 		                        for (int ol = 0; ol <= 1; ol++) {
 		                            int ob = (o0 + ol) % n;
 		                            float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
-		                            fatomic_add(&desc[hist_off + tid_y*128 + (yb*d + xb)*n + ob], v_o);
+		                            fatomic_add(&desc[hist_off + lid_y*desc_len + (yb*d + xb)*n + ob], v_o);
 		                        }
 		                    }
 	                    }
@@ -772,27 +771,27 @@ __kernel void computeDescriptor(
         barrier(CLK_LOCAL_MEM_FENCE);
 
         // Combine histograms (reduces previous atomicAdd overhead)
-        for (int l = tid_x; l < 128*4; l += bsz_x)
-            desc[l] += desc[l+4*128];
+        for (int l = lid_x; l < desc_len*4; l += lsz_x)
+            desc[l] += desc[l+4*desc_len];
         barrier(CLK_LOCAL_MEM_FENCE);
-        for (int l = tid_x; l < 128*2; l += bsz_x)
-            desc[l    ] += desc[l+2*128];
+        for (int l = lid_x; l < desc_len*2; l += lsz_x)
+            desc[l    ] += desc[l+2*desc_len];
         barrier(CLK_LOCAL_MEM_FENCE);
-        for (int l = tid_x; l < 128; l += bsz_x)
-            desc[l] += desc[l+128];
+        for (int l = lid_x; l < desc_len; l += lsz_x)
+            desc[l] += desc[l+desc_len];
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        normalizeDesc(desc, accum, histlen, tid_x, tid_y, bsz_x);
+        normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
 
-        for (int i = tid_x; i < d*d*n; i += bsz_x)
-            desc[tid_y*128+i] = min(desc[tid_y*128+i], DESCR_MAG_THR);
+        for (int i = lid_x; i < d*d*n; i += lsz_x)
+            desc[lid_y*desc_len+i] = min(desc[lid_y*desc_len+i], DESCR_MAG_THR);
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        normalizeDesc(desc, accum, histlen, tid_x, tid_y, bsz_x);
+        normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
 
         // Calculate final descriptor values
-        for (int k = tid_x; k < d*d*n; k += bsz_x) {
-            desc_out[f*desc_len+k] = round(min(255.f, desc[tid_y*128+k] * INT_DESCR_FCTR));
+        for (int k = lid_x; k < d*d*n; k += lsz_x) {
+            desc_out[f*desc_len+k] = round(min(255.f, desc[lid_y*desc_len+k] * INT_DESCR_FCTR));
         }
     }
 }
diff --git a/src/backend/opencl/kernel/sift.hpp b/src/backend/opencl/kernel/sift.hpp
index a826652..b8fc8bd 100644
--- a/src/backend/opencl/kernel/sift.hpp
+++ b/src/backend/opencl/kernel/sift.hpp
@@ -104,10 +104,13 @@ namespace kernel
 {
 
 static const int SIFT_THREADS   = 256;
-static const int SIFT_THREADS_X = 16;
-static const int SIFT_THREADS_Y = 16;
+static const int SIFT_THREADS_X = 32;
+static const int SIFT_THREADS_Y = 8;
 
+// assumed gaussian blur for input image
 static const float InitSigma = 0.5f;
+
+// width of border in which to ignore keypoints
 static const int ImgBorder = 5;
 
 // default width of descriptor histogram array
@@ -116,6 +119,9 @@ static const int DescrWidth = 4;
 // default number of bins per histogram in descriptor array
 static const int DescrHistBins = 8;
 
+// default number of bins in histogram for orientation assignment
+static const int OriHistBins = 36;
+
 static const float PI_VAL = 3.14159265358979323846f;
 
 template<typename T>
@@ -166,7 +172,7 @@ Param gaussFilter(float sigma)
 }
 
 template<typename T, typename convAccT>
-void convHelper(Param& dst, Param src, Param filter)
+void conv2HelperFull(Param& dst, Param src, Param filter)
 {
     Param tmp;
     tmp.info.offset = 0;
@@ -179,68 +185,8 @@ void convHelper(Param& dst, Param src, Param filter)
     tmp.data = bufferAlloc(src_el * sizeof(T));
 
     const dim_t fLen = filter.info.dims[0];
-    switch(fLen) {
-        case 3:
-            convolve2<T, convAccT, 0, false, 3 >(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 3 >(dst, tmp, filter);
-            break;
-        case 5:
-            convolve2<T, convAccT, 0, false, 5 >(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 5 >(dst, tmp, filter);
-            break;
-        case 7:
-            convolve2<T, convAccT, 0, false, 7 >(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 7 >(dst, tmp, filter);
-            break;
-        case 9:
-            convolve2<T, convAccT, 0, false, 9 >(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 9 >(dst, tmp, filter);
-            break;
-        case 11:
-            convolve2<T, convAccT, 0, false, 11>(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 11>(dst, tmp, filter);
-            break;
-        case 13:
-            convolve2<T, convAccT, 0, false, 13>(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 13>(dst, tmp, filter);
-            break;
-        case 15:
-            convolve2<T, convAccT, 0, false, 15>(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 15>(dst, tmp, filter);
-            break;
-        case 17:
-            convolve2<T, convAccT, 0, false, 17>(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 17>(dst, tmp, filter);
-            break;
-        case 19:
-            convolve2<T, convAccT, 0, false, 19>(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 19>(dst, tmp, filter);
-            break;
-        case 21:
-            convolve2<T, convAccT, 0, false, 21>(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 21>(dst, tmp, filter);
-            break;
-        case 23:
-            convolve2<T, convAccT, 0, false, 23>(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 23>(dst, tmp, filter);
-            break;
-        case 25:
-            convolve2<T, convAccT, 0, false, 25>(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 25>(dst, tmp, filter);
-            break;
-        case 27:
-            convolve2<T, convAccT, 0, false, 27>(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 27>(dst, tmp, filter);
-            break;
-        case 29:
-            convolve2<T, convAccT, 0, false, 29>(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 29>(dst, tmp, filter);
-            break;
-        case 31:
-            convolve2<T, convAccT, 0, false, 31>(tmp, src, filter);
-            convolve2<T, convAccT, 1, false, 31>(dst, tmp, filter);
-            break;
-    }
+    conv2Helper<T, convAccT, 0, false>(tmp, src, filter, fLen);
+    conv2Helper<T, convAccT, 1, false>(dst, tmp, filter, fLen);
 
     bufferFree(tmp.data);
 }
@@ -269,12 +215,12 @@ Param createInitialImage(
     float s = (double_input) ? sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4)
                              : sqrt(init_sigma * init_sigma - InitSigma * InitSigma);
 
-    Param filter = gaussFilter<convAccT>(s);
+    const Param filter = gaussFilter<convAccT>(s);
 
     if (double_input)
         resize<T, AF_INTERP_BILINEAR>(init_img, img);
 
-    convHelper<T, convAccT>(init_img, (double_input) ? init_img : img, filter);
+    conv2HelperFull<T, convAccT>(init_img, (double_input) ? init_img : img, filter);
 
     bufferFree(filter.data);
 
@@ -355,7 +301,7 @@ std::vector<Param> buildGaussPyr(
 
                 Param filter = gaussFilter<convAccT>(sig_layers[l]);
 
-                convHelper<T, convAccT>(tmp_pyr[idx], tmp_pyr[src_idx], filter);
+                conv2HelperFull<T, convAccT>(tmp_pyr[idx], tmp_pyr[src_idx], filter);
 
                 bufferFree(filter.data);
             }
@@ -530,19 +476,21 @@ void sift(unsigned* out_feat,
             int dim0 = dog_pyr[o].info.dims[0];
             int dim1 = dog_pyr[o].info.dims[1];
 
-            const int blk_x = divup(dim0-2*ImgBorder, 32);
-            const int blk_y = divup(dim1-2*ImgBorder, 8);
-            const NDRange local(32, 8);
-            const NDRange global(blk_x * 32, blk_y * 8);
+            const int blk_x = divup(dim0-2*ImgBorder, SIFT_THREADS_X);
+            const int blk_y = divup(dim1-2*ImgBorder, SIFT_THREADS_Y);
+            const NDRange local(SIFT_THREADS_X, SIFT_THREADS_Y);
+            const NDRange global(blk_x * SIFT_THREADS_X, blk_y * SIFT_THREADS_Y);
 
             float extrema_thr = 0.5f * contrast_thr / n_layers;
 
             auto deOp = make_kernel<Buffer, Buffer, Buffer, Buffer,
-                                    Buffer, KParam, unsigned, float> (*deKernel[device]);
+                                    Buffer, KParam, unsigned, float,
+                                    LocalSpaceArg> (*deKernel[device]);
 
             deOp(EnqueueArgs(getQueue(), global, local),
                  *d_extrema_x, *d_extrema_y, *d_extrema_layer, *d_count,
-                 *dog_pyr[o].data, dog_pyr[o].info, max_feat, extrema_thr);
+                 *dog_pyr[o].data, dog_pyr[o].info, max_feat, extrema_thr,
+                 cl::Local((SIFT_THREADS_X+2) * (SIFT_THREADS_Y+2) * 3 * sizeof(float)));
             CL_DEBUG_FINISH(getQueue());
 
             getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &extrema_feat);
@@ -671,20 +619,22 @@ void sift(unsigned* out_feat,
             cl::Buffer* d_oriented_size     = bufferAlloc(max_oriented_feat * sizeof(float));
             cl::Buffer* d_oriented_ori      = bufferAlloc(max_oriented_feat * sizeof(float));
 
-            const int blk_x_ori = divup(nodup_feat, 8);
-            const NDRange local_ori(32, 8);
-            const NDRange global_ori(32, blk_x_ori * 8);
+            const int blk_x_ori = divup(nodup_feat, SIFT_THREADS_Y);
+            const NDRange local_ori(SIFT_THREADS_X, SIFT_THREADS_Y);
+            const NDRange global_ori(SIFT_THREADS_X, blk_x_ori * SIFT_THREADS_Y);
 
             auto coOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer,
                                     Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
-                                    Buffer, KParam, unsigned, unsigned, int> (*coKernel[device]);
+                                    Buffer, KParam, unsigned, unsigned, int,
+                                    LocalSpaceArg> (*coKernel[device]);
 
             coOp(EnqueueArgs(getQueue(), global_ori, local_ori),
                  *d_oriented_x, *d_oriented_y, *d_oriented_layer,
                  *d_oriented_response, *d_oriented_size, *d_oriented_ori, *d_count,
                  *d_nodup_x, *d_nodup_y, *d_nodup_layer,
                  *d_nodup_response, *d_nodup_size, nodup_feat,
-                 *gauss_pyr[o].data, gauss_pyr[o].info, max_oriented_feat, o, (int)double_input);
+                 *gauss_pyr[o].data, gauss_pyr[o].info, max_oriented_feat, o, (int)double_input,
+                 cl::Local(OriHistBins * SIFT_THREADS_Y * 2 * sizeof(float)));
             CL_DEBUG_FINISH(getQueue());
 
             getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &oriented_feat);
@@ -709,15 +659,19 @@ void sift(unsigned* out_feat,
             const NDRange local_desc(SIFT_THREADS, 1);
             const NDRange global_desc(SIFT_THREADS, blk_x_desc);
 
-            auto cdOp = make_kernel<Buffer, unsigned,
+            const unsigned histsz = 8;
+
+            auto cdOp = make_kernel<Buffer, unsigned, unsigned,
                                     Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
-                                    Buffer, KParam, int, int, float, float, int> (*cdKernel[device]);
+                                    Buffer, KParam, int, int, float, float, int,
+                                    LocalSpaceArg> (*cdKernel[device]);
 
             cdOp(EnqueueArgs(getQueue(), global_desc, local_desc),
-                 *d_desc, desc_len,
+                 *d_desc, desc_len, histsz,
                  *d_oriented_x, *d_oriented_y, *d_oriented_layer,
                  *d_oriented_response, *d_oriented_size, *d_oriented_ori, oriented_feat,
-                 *gauss_pyr[o].data, gauss_pyr[o].info, d, n, scale, init_sigma, n_layers);
+                 *gauss_pyr[o].data, gauss_pyr[o].info, d, n, scale, init_sigma, n_layers,
+                 cl::Local(desc_len * (histsz+1) * sizeof(float)));
             CL_DEBUG_FINISH(getQueue());
 
             total_feat += oriented_feat;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git



More information about the debian-science-commits mailing list