[arrayfire] 177/408: SUSAN CUDA/OpenCL: Added bound checks to load shared/local Memory
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:51 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid in repository arrayfire.
commit 26df8b39af9b3929d3f9e150db1d66ee603c1e72
Author: pradeep <pradeep at arrayfire.com>
Date: Thu Jul 30 13:56:25 2015 -0400
SUSAN CUDA/OpenCL: Added bound checks to load shared/local Memory
---
src/backend/cuda/kernel/susan.hpp | 10 +++++++---
src/backend/opencl/kernel/susan.cl | 16 +++++++++++++---
2 files changed, 20 insertions(+), 6 deletions(-)
diff --git a/src/backend/cuda/kernel/susan.hpp b/src/backend/cuda/kernel/susan.hpp
index bd5fab8..22d9b97 100644
--- a/src/backend/cuda/kernel/susan.hpp
+++ b/src/backend/cuda/kernel/susan.hpp
@@ -59,13 +59,17 @@ void susanKernel(T* out, const T* in,
const unsigned gx = blockDim.x * blockIdx.x + lx + edge;
const unsigned gy = blockDim.y * blockIdx.y + ly + edge;
+ const unsigned nucleusIdx = (ly+radius)*shrdLen + lx+radius;
+ shrdMem[nucleusIdx] = gx<idim0 && gy<idim1 ? in[gy*idim0+gx] : 0;
+ T m_0 = shrdMem[nucleusIdx];
+
#pragma unroll
for (int b=ly, gy2=gy; b<shrdLen; b+=BLOCK_Y, gy2+=BLOCK_Y) {
int j = gy2-radius;
#pragma unroll
for (int a=lx, gx2=gx; a<shrdLen; a+=BLOCK_X, gx2+=BLOCK_X) {
int i = gx2-radius;
- shrdMem[b*shrdLen+a] = in[j*idim0+i];
+ shrdMem[b*shrdLen+a] = (i<idim0 && j<idim1 ? in[j*idim0+i]: m_0);
}
}
__syncthreads();
@@ -73,7 +77,6 @@ void susanKernel(T* out, const T* in,
if (gx < idim0 - edge && gy < idim1 - edge) {
unsigned idx = gy*idim0 + gx;
float nM = 0.0f;
- float m_0 = shrdMem[(ly+radius)*shrdLen + lx+radius];
#pragma unroll
for (int p=0; p<windLen; ++p) {
#pragma unroll
@@ -83,8 +86,9 @@ void susanKernel(T* out, const T* in,
int a = lx + radius + i;
int b = ly + radius + j;
if (i*i + j*j < rSqrd) {
+ float c = m_0;
float m = shrdMem[b * shrdLen + a];
- float exp_pow = powf((m - m_0)/t, 6.0f);
+ float exp_pow = powf((m - c)/t, 6.0f);
float cM = expf(-exp_pow);
nM += cM;
}
diff --git a/src/backend/opencl/kernel/susan.cl b/src/backend/opencl/kernel/susan.cl
index 90ef70f..5bfd094 100644
--- a/src/backend/opencl/kernel/susan.cl
+++ b/src/backend/opencl/kernel/susan.cl
@@ -26,13 +26,23 @@ void susan_responses(global T* out, global const T* in,
const unsigned gx = get_global_id(0) + edge;
const unsigned gy = get_global_id(1) + edge;
+ const unsigned nucleusIdx = (ly+RADIUS)*shrdLen + lx+RADIUS;
+ if (gx<idim0 && gy<idim1)
+ localMem[nucleusIdx] = in[gy*idim0+gx];
+ else
+ localMem[nucleusIdx] = 0;
+ T m_0 = localMem[nucleusIdx];
+
#pragma unroll
for (int b=ly, gy2=gy; b<shrdLen; b+=BLOCK_Y, gy2+=BLOCK_Y) {
int j = gy2-RADIUS;
#pragma unroll
for (int a=lx, gx2=gx; a<shrdLen; a+=BLOCK_X, gx2+=BLOCK_X) {
int i = gx2-RADIUS;
- localMem[b*shrdLen+a] = in[i+idim0*j];
+ if (i<idim0 && j<idim1)
+ localMem[b*shrdLen+a] = in[i+idim0*j];
+ else
+ localMem[b*shrdLen+a] = m_0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -40,7 +50,6 @@ void susan_responses(global T* out, global const T* in,
if (gx < idim0 - edge && gy < idim1 - edge) {
unsigned idx = gx + idim0 * gy;
float nM = 0.0f;
- float m_0 = localMem[(ly+RADIUS)*shrdLen + lx+RADIUS];
#pragma unroll
for (int p=0; p<windLen; ++p) {
#pragma unroll
@@ -50,8 +59,9 @@ void susan_responses(global T* out, global const T* in,
int a = lx + RADIUS + i;
int b = ly + RADIUS + j;
if (i*i + j*j < rSqrd) {
+ float c = m_0;
float m = localMem[b * shrdLen + a];
- float exp_pow = pow((m - m_0)/t, 6.0f);
+ float exp_pow = pow((m - c)/t, 6.0f);
float cM = exp(-exp_pow);
nM += cM;
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits mailing list