[arrayfire] 175/408: Changed CUDA/OpenCL kernels to use zero leading dimension

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:50 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit e9535b9ce81a84d6051275c1caebb322fd55d1bb
Author: pradeep <pradeep at arrayfire.com>
Date:   Wed Jul 29 18:25:12 2015 -0400

    Changed CUDA/OpenCL kernels to use zero leading dimension
    
    Updated the following to reflect the algorithm change
    * SUSAN unit test data
    * SUSAN example
---
 examples/computer_vision/susan.cpp  | 12 ++++++------
 src/api/c/susan.cpp                 |  1 +
 src/backend/cpu/susan.cpp           | 28 ++++++++++++++--------------
 src/backend/cuda/kernel/susan.hpp   | 30 +++++++++++++++---------------
 src/backend/opencl/kernel/susan.cl  | 26 +++++++++++++-------------
 src/backend/opencl/kernel/susan.hpp |  8 ++++----
 test/data                           |  2 +-
 7 files changed, 54 insertions(+), 53 deletions(-)

diff --git a/examples/computer_vision/susan.cpp b/examples/computer_vision/susan.cpp
index 858ed5f..eebf79e 100644
--- a/examples/computer_vision/susan.cpp
+++ b/examples/computer_vision/susan.cpp
@@ -41,15 +41,15 @@ static void susan_demo(bool console)
     for (size_t f = 0; f < feat.getNumFeatures(); f++) {
         int x = h_x[f];
         int y = h_y[f];
-        img_color(y, seq(x-draw_len, x+draw_len), 0) = 0.f;
-        img_color(y, seq(x-draw_len, x+draw_len), 1) = 1.f;
-        img_color(y, seq(x-draw_len, x+draw_len), 2) = 0.f;
+        img_color(x, seq(y-draw_len, y+draw_len), 0) = 0.f;
+        img_color(x, seq(y-draw_len, y+draw_len), 1) = 1.f;
+        img_color(x, seq(y-draw_len, y+draw_len), 2) = 0.f;
 
         // Draw vertical line of (draw_len * 2 + 1) pixels centered on  the corner
         // Set only the first channel to 1 (green lines)
-        img_color(seq(y-draw_len, y+draw_len), x, 0) = 0.f;
-        img_color(seq(y-draw_len, y+draw_len), x, 1) = 1.f;
-        img_color(seq(y-draw_len, y+draw_len), x, 2) = 0.f;
+        img_color(seq(x-draw_len, x+draw_len), y, 0) = 0.f;
+        img_color(seq(x-draw_len, x+draw_len), y, 1) = 1.f;
+        img_color(seq(x-draw_len, x+draw_len), y, 2) = 0.f;
     }
 
     printf("Features found: %lu\n", feat.getNumFeatures());
diff --git a/src/api/c/susan.cpp b/src/api/c/susan.cpp
index e37ba91..e070df8 100644
--- a/src/api/c/susan.cpp
+++ b/src/api/c/susan.cpp
@@ -56,6 +56,7 @@ af_err af_susan(af_features* out, const af_array in,
 
         ARG_ASSERT(1, dims.ndims()==2);
         ARG_ASSERT(2, radius < 10);
+        ARG_ASSERT(2, radius<=edge);
         ARG_ASSERT(3, diff_thr > 0.0f);
         ARG_ASSERT(4, geom_thr > 0.0f);
         ARG_ASSERT(5, (feature_ratio > 0.0f && feature_ratio <= 1.0f));
diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp
index 770419c..ad5b702 100644
--- a/src/backend/cpu/susan.cpp
+++ b/src/backend/cpu/susan.cpp
@@ -26,9 +26,9 @@ void susan_responses(T* resp_out, const T* in,
     const unsigned r = border_len;
     const int rSqrd = radius*radius;
 
-    for (unsigned x = r; x < idim1 - r; ++x) {
-        for (unsigned y = r; y < idim0 - r; ++y) {
-            const unsigned idx = x * idim0 + y;
+    for (unsigned y = r; y < idim1 - r; ++y) {
+        for (unsigned x = r; x < idim0 - r; ++x) {
+            const unsigned idx = y * idim0 + x;
             T m_0 = in[idx];
             float nM = 0.0f;
 
@@ -37,7 +37,7 @@ void susan_responses(T* resp_out, const T* in,
                     if (i*i + j*j < rSqrd) {
                         int p = x + i;
                         int q = y + j;
-                        T m = in[p * idim0 + q];
+                        T m = in[p + idim0 * q];
                         float exp_pow = std::pow((m - m_0)/t, 6.0);
                         float cM = std::exp(-exp_pow);
                         nM += cM;
@@ -58,19 +58,19 @@ void non_maximal(float* x_out, float* y_out, float* resp_out,
     // Responses on the border don't have 8-neighbors to compare, discard them
     const unsigned r = border_len + 1;
 
-    for (unsigned x = r; x < idim1 - r; x++) {
-        for (unsigned y = r; y < idim0 - r; y++) {
-            const T v = resp_in[x * idim0 + y];
+    for (unsigned y = r; y < idim1 - r; y++) {
+        for (unsigned x = r; x < idim0 - r; x++) {
+            const T v = resp_in[y * idim0 + x];
 
             // Find maximum neighborhood response
             T max_v;
-            max_v = max(resp_in[(x-1) * idim0 + y-1], resp_in[x * idim0 + y-1]);
-            max_v = max(max_v, resp_in[(x+1) * idim0 + y-1]);
-            max_v = max(max_v, resp_in[(x-1) * idim0 + y  ]);
-            max_v = max(max_v, resp_in[(x+1) * idim0 + y  ]);
-            max_v = max(max_v, resp_in[(x-1) * idim0 + y+1]);
-            max_v = max(max_v, resp_in[(x)   * idim0 + y+1]);
-            max_v = max(max_v, resp_in[(x+1) * idim0 + y+1]);
+            max_v = max(resp_in[(y-1) * idim0 + x-1], resp_in[y * idim0 + x-1]);
+            max_v = max(max_v, resp_in[(y+1) * idim0 + x-1]);
+            max_v = max(max_v, resp_in[(y-1) * idim0 + x  ]);
+            max_v = max(max_v, resp_in[(y+1) * idim0 + x  ]);
+            max_v = max(max_v, resp_in[(y-1) * idim0 + x+1]);
+            max_v = max(max_v, resp_in[(y)   * idim0 + x+1]);
+            max_v = max(max_v, resp_in[(y+1) * idim0 + x+1]);
 
             // Stores corner to {x,y,resp}_out if it's response is maximum compared
             // to its 8-neighborhood and greater or equal minimum response
diff --git a/src/backend/cuda/kernel/susan.hpp b/src/backend/cuda/kernel/susan.hpp
index 882adcf..45d03f6 100644
--- a/src/backend/cuda/kernel/susan.hpp
+++ b/src/backend/cuda/kernel/susan.hpp
@@ -65,15 +65,15 @@ void susanKernel(T* out, const T* in,
 #pragma unroll
         for (int a=lx, gx2=gx; a<shrdLen; a+=BLOCK_X, gx2+=BLOCK_X) {
             int i = gx2-radius;
-            shrdMem[b*shrdLen+a] = in[i*idim0+j];
+            shrdMem[b*shrdLen+a] = in[j*idim0+i];
         }
     }
     __syncthreads();
 
-    if (gx < idim1 - edge && gy < idim0 - edge) {
-        unsigned idx = gx*idim0 + gy;
+    if (gx < idim0 - edge && gy < idim1 - edge) {
+        unsigned idx = gy*idim0 + gx;
         float nM  = 0.0f;
-        float m_0 = in[idx];
+        float m_0 = shrdMem[(ly+RADIUS)*shrdLen + lx+RADIUS];
 #pragma unroll
         for (int p=0; p<windLen; ++p) {
 #pragma unroll
@@ -101,7 +101,7 @@ void susan_responses(T* out, const T* in,
                      const unsigned edge)
 {
     dim3 threads(BLOCK_X, BLOCK_Y);
-    dim3 blocks(divup(idim1-edge*2, BLOCK_X), divup(idim0-edge*2, BLOCK_Y));
+    dim3 blocks(divup(idim0-edge*2, BLOCK_X), divup(idim1-edge*2, BLOCK_Y));
 
     switch (radius) {
         case 1: susanKernel<T, 1><<<blocks, threads>>>(out, in, idim0, idim1, t, g, edge); break;
@@ -130,18 +130,18 @@ void nonMaxKernel(float* x_out, float* y_out, float* resp_out, unsigned* count,
     const unsigned gx = blockDim.x * blockIdx.x + threadIdx.x + r;
     const unsigned gy = blockDim.y * blockIdx.y + threadIdx.y + r;
 
-    if (gx < idim1 - r && gy < idim0 - r) {
-        const T v = resp_in[gx * idim0 + gy];
+    if (gx < idim0 - r && gy < idim1 - r) {
+        const T v = resp_in[gy * idim0 + gx];
 
         // Find maximum neighborhood response
         T max_v;
-        max_v = max_val(resp_in[(gx-1) * idim0 + gy-1], resp_in[gx * idim0 + gy-1]);
-        max_v = max_val(max_v, resp_in[(gx+1) * idim0 + gy-1]);
-        max_v = max_val(max_v, resp_in[(gx-1) * idim0 + gy  ]);
-        max_v = max_val(max_v, resp_in[(gx+1) * idim0 + gy  ]);
-        max_v = max_val(max_v, resp_in[(gx-1) * idim0 + gy+1]);
-        max_v = max_val(max_v, resp_in[(gx)   * idim0 + gy+1]);
-        max_v = max_val(max_v, resp_in[(gx+1) * idim0 + gy+1]);
+        max_v = max_val(resp_in[(gy-1) * idim0 + gx-1], resp_in[gy * idim0 + gx-1]);
+        max_v = max_val(max_v, resp_in[(gy+1) * idim0 + gx-1]);
+        max_v = max_val(max_v, resp_in[(gy-1) * idim0 + gx  ]);
+        max_v = max_val(max_v, resp_in[(gy+1) * idim0 + gx  ]);
+        max_v = max_val(max_v, resp_in[(gy-1) * idim0 + gx+1]);
+        max_v = max_val(max_v, resp_in[(gy)   * idim0 + gx+1]);
+        max_v = max_val(max_v, resp_in[(gy+1) * idim0 + gx+1]);
 
         // Stores corner to {x,y,resp}_out if it's response is maximum compared
         // to its 8-neighborhood and greater or equal minimum response
@@ -162,7 +162,7 @@ void nonMaximal(float* x_out, float* y_out, float* resp_out,
                  const T * resp_in, const unsigned edge, const unsigned max_corners)
 {
     dim3 threads(BLOCK_X, BLOCK_Y);
-    dim3 blocks(divup(idim1-edge*2, BLOCK_X), divup(idim0-edge*2, BLOCK_Y));
+    dim3 blocks(divup(idim0-edge*2, BLOCK_X), divup(idim1-edge*2, BLOCK_Y));
 
     unsigned* d_corners_found = memAlloc<unsigned>(1);
     CUDA_CHECK(cudaMemset(d_corners_found, 0, sizeof(unsigned)));
diff --git a/src/backend/opencl/kernel/susan.cl b/src/backend/opencl/kernel/susan.cl
index 69131fd..90ef70f 100644
--- a/src/backend/opencl/kernel/susan.cl
+++ b/src/backend/opencl/kernel/susan.cl
@@ -32,15 +32,15 @@ void susan_responses(global T* out, global const T* in,
 #pragma unroll
         for (int a=lx, gx2=gx; a<shrdLen; a+=BLOCK_X, gx2+=BLOCK_X) {
             int i = gx2-RADIUS;
-            localMem[b*shrdLen+a] = in[i*idim0+j];
+            localMem[b*shrdLen+a] = in[i+idim0*j];
         }
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (gx < idim1 - edge && gy < idim0 - edge) {
-        unsigned idx = gx*idim0 + gy;
+    if (gx < idim0 - edge && gy < idim1 - edge) {
+        unsigned idx = gx + idim0 * gy;
         float nM  = 0.0f;
-        float m_0 = in[idx];
+        float m_0 = localMem[(ly+RADIUS)*shrdLen + lx+RADIUS];
 #pragma unroll
         for (int p=0; p<windLen; ++p) {
 #pragma unroll
@@ -75,18 +75,18 @@ void non_maximal(global float* x_out, global float* y_out,
     const unsigned gx = get_global_id(0) + r;
     const unsigned gy = get_global_id(1) + r;
 
-    if (gx < idim1 - r && gy < idim0 - r) {
-        const T v = resp_in[gx * idim0 + gy];
+    if (gx < idim0 - r && gy < idim1 - r) {
+        const T v = resp_in[gy * idim0 + gx];
 
         // Find maximum neighborhood response
         T max_v;
-        max_v = MAX_VAL(resp_in[(gx-1) * idim0 + gy-1], resp_in[gx * idim0 + gy-1]);
-        max_v = MAX_VAL(max_v, resp_in[(gx+1) * idim0 + gy-1]);
-        max_v = MAX_VAL(max_v, resp_in[(gx-1) * idim0 + gy  ]);
-        max_v = MAX_VAL(max_v, resp_in[(gx+1) * idim0 + gy  ]);
-        max_v = MAX_VAL(max_v, resp_in[(gx-1) * idim0 + gy+1]);
-        max_v = MAX_VAL(max_v, resp_in[(gx)   * idim0 + gy+1]);
-        max_v = MAX_VAL(max_v, resp_in[(gx+1) * idim0 + gy+1]);
+        max_v = MAX_VAL(resp_in[(gy-1) * idim0 + gx-1], resp_in[gy * idim0 + gx-1]);
+        max_v = MAX_VAL(max_v, resp_in[(gy+1) * idim0 + gx-1]);
+        max_v = MAX_VAL(max_v, resp_in[(gy-1) * idim0 + gx  ]);
+        max_v = MAX_VAL(max_v, resp_in[(gy+1) * idim0 + gx  ]);
+        max_v = MAX_VAL(max_v, resp_in[(gy-1) * idim0 + gx+1]);
+        max_v = MAX_VAL(max_v, resp_in[(gy)   * idim0 + gx+1]);
+        max_v = MAX_VAL(max_v, resp_in[(gy+1) * idim0 + gx+1]);
 
         // Stores corner to {x,y,resp}_out if it's response is maximum compared
         // to its 8-neighborhood and greater or equal minimum response
diff --git a/src/backend/opencl/kernel/susan.hpp b/src/backend/opencl/kernel/susan.hpp
index 9551dbc..814616e 100644
--- a/src/backend/opencl/kernel/susan.hpp
+++ b/src/backend/opencl/kernel/susan.hpp
@@ -73,8 +73,8 @@ void susan(cl::Buffer* out, const cl::Buffer* in,
                                    float, float, unsigned>(*suKernel[device]);
 
         NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
-        NDRange global(divup(idim1-2*edge, local[0])*local[0],
-                       divup(idim0-2*edge, local[1])*local[1]);
+        NDRange global(divup(idim0-2*edge, local[0])*local[0],
+                       divup(idim1-2*edge, local[1])*local[1]);
 
         susanOp(EnqueueArgs(getQueue(), global, local), *out, *in, idim0, idim1, t, g, edge);
 
@@ -122,8 +122,8 @@ unsigned nonMaximal(cl::Buffer* x_out, cl::Buffer* y_out, cl::Buffer* resp_out,
                                         unsigned, unsigned>(*nmKernel[device]);
 
         NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
-        NDRange global(divup(idim1-2*edge, local[0])*local[0],
-                       divup(idim0-2*edge, local[1])*local[1]);
+        NDRange global(divup(idim0-2*edge, local[0])*local[0],
+                       divup(idim1-2*edge, local[1])*local[1]);
 
         nonMaximalOp(EnqueueArgs(getQueue(), global, local),
                      *x_out, *y_out, *resp_out, *d_corners_found,
diff --git a/test/data b/test/data
index 79b0784..608ada5 160000
--- a/test/data
+++ b/test/data
@@ -1 +1 @@
-Subproject commit 79b0784a55044869ac32631560fae1e39aa53bbd
+Subproject commit 608ada5b36e4059a463a304ebc84991001094153

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git



More information about the debian-science-commits mailing list