[arrayfire] 177/408: SUSAN CUDA/OpenCL: Added bound checks to load shared/local Memory

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:51 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit 26df8b39af9b3929d3f9e150db1d66ee603c1e72
Author: pradeep <pradeep at arrayfire.com>
Date:   Thu Jul 30 13:56:25 2015 -0400

    SUSAN CUDA/OpenCL: Added bound checks to load shared/local Memory
---
 src/backend/cuda/kernel/susan.hpp  | 10 +++++++---
 src/backend/opencl/kernel/susan.cl | 16 +++++++++++++---
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/backend/cuda/kernel/susan.hpp b/src/backend/cuda/kernel/susan.hpp
index bd5fab8..22d9b97 100644
--- a/src/backend/cuda/kernel/susan.hpp
+++ b/src/backend/cuda/kernel/susan.hpp
@@ -59,13 +59,17 @@ void susanKernel(T* out, const T* in,
     const unsigned gx  = blockDim.x * blockIdx.x + lx + edge;
     const unsigned gy  = blockDim.y * blockIdx.y + ly + edge;
 
+    const unsigned nucleusIdx = (ly+radius)*shrdLen + lx+radius;
+    shrdMem[nucleusIdx] = gx<idim0 && gy<idim1 ? in[gy*idim0+gx] : 0;
+    T m_0 = shrdMem[nucleusIdx];
+
 #pragma unroll
     for (int b=ly, gy2=gy; b<shrdLen; b+=BLOCK_Y, gy2+=BLOCK_Y) {
         int j = gy2-radius;
 #pragma unroll
         for (int a=lx, gx2=gx; a<shrdLen; a+=BLOCK_X, gx2+=BLOCK_X) {
             int i = gx2-radius;
-            shrdMem[b*shrdLen+a] = in[j*idim0+i];
+            shrdMem[b*shrdLen+a] = (i<idim0 && j<idim1 ? in[j*idim0+i]: m_0);
         }
     }
     __syncthreads();
@@ -73,7 +77,6 @@ void susanKernel(T* out, const T* in,
     if (gx < idim0 - edge && gy < idim1 - edge) {
         unsigned idx = gy*idim0 + gx;
         float nM  = 0.0f;
-        float m_0 = shrdMem[(ly+radius)*shrdLen + lx+radius];
 #pragma unroll
         for (int p=0; p<windLen; ++p) {
 #pragma unroll
@@ -83,8 +86,9 @@ void susanKernel(T* out, const T* in,
                 int a = lx + radius + i;
                 int b = ly + radius + j;
                 if (i*i + j*j < rSqrd) {
+                    float c = m_0;
                     float m = shrdMem[b * shrdLen + a];
-                    float exp_pow = powf((m - m_0)/t, 6.0f);
+                    float exp_pow = powf((m - c)/t, 6.0f);
                     float cM = expf(-exp_pow);
                     nM += cM;
                 }
diff --git a/src/backend/opencl/kernel/susan.cl b/src/backend/opencl/kernel/susan.cl
index 90ef70f..5bfd094 100644
--- a/src/backend/opencl/kernel/susan.cl
+++ b/src/backend/opencl/kernel/susan.cl
@@ -26,13 +26,23 @@ void susan_responses(global T* out, global const T* in,
     const unsigned gx = get_global_id(0) + edge;
     const unsigned gy = get_global_id(1) + edge;
 
+    const unsigned nucleusIdx = (ly+RADIUS)*shrdLen + lx+RADIUS;
+    if (gx<idim0 && gy<idim1)
+        localMem[nucleusIdx] = in[gy*idim0+gx];
+    else
+        localMem[nucleusIdx] = 0;
+    T m_0 = localMem[nucleusIdx];
+
 #pragma unroll
     for (int b=ly, gy2=gy; b<shrdLen; b+=BLOCK_Y, gy2+=BLOCK_Y) {
         int j = gy2-RADIUS;
 #pragma unroll
         for (int a=lx, gx2=gx; a<shrdLen; a+=BLOCK_X, gx2+=BLOCK_X) {
             int i = gx2-RADIUS;
-            localMem[b*shrdLen+a] = in[i+idim0*j];
+            if (i<idim0 && j<idim1)
+                localMem[b*shrdLen+a] = in[i+idim0*j];
+            else
+                localMem[b*shrdLen+a] = m_0;
         }
     }
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -40,7 +50,6 @@ void susan_responses(global T* out, global const T* in,
     if (gx < idim0 - edge && gy < idim1 - edge) {
         unsigned idx = gx + idim0 * gy;
         float nM  = 0.0f;
-        float m_0 = localMem[(ly+RADIUS)*shrdLen + lx+RADIUS];
 #pragma unroll
         for (int p=0; p<windLen; ++p) {
 #pragma unroll
@@ -50,8 +59,9 @@ void susan_responses(global T* out, global const T* in,
                 int a = lx + RADIUS + i;
                 int b = ly + RADIUS + j;
                 if (i*i + j*j < rSqrd) {
+                    float c = m_0;
                     float m = localMem[b * shrdLen + a];
-                    float exp_pow = pow((m - m_0)/t, 6.0f);
+                    float exp_pow = pow((m - c)/t, 6.0f);
                     float cM = exp(-exp_pow);
                     nM += cM;
                 }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git



More information about the debian-science-commits mailing list