[arrayfire] 175/408: Changed CUDA/OpenCL kernels to use zero leading dimension
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:50 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit e9535b9ce81a84d6051275c1caebb322fd55d1bb
Author: pradeep <pradeep at arrayfire.com>
Date: Wed Jul 29 18:25:12 2015 -0400
Changed CUDA/OpenCL kernels to use zero leading dimension
Updated the following to reflect the algorithm change
* SUSAN unit test data
* SUSAN example
---
examples/computer_vision/susan.cpp | 12 ++++++------
src/api/c/susan.cpp | 1 +
src/backend/cpu/susan.cpp | 28 ++++++++++++++--------------
src/backend/cuda/kernel/susan.hpp | 30 +++++++++++++++---------------
src/backend/opencl/kernel/susan.cl | 26 +++++++++++++-------------
src/backend/opencl/kernel/susan.hpp | 8 ++++----
test/data | 2 +-
7 files changed, 54 insertions(+), 53 deletions(-)
diff --git a/examples/computer_vision/susan.cpp b/examples/computer_vision/susan.cpp
index 858ed5f..eebf79e 100644
--- a/examples/computer_vision/susan.cpp
+++ b/examples/computer_vision/susan.cpp
@@ -41,15 +41,15 @@ static void susan_demo(bool console)
for (size_t f = 0; f < feat.getNumFeatures(); f++) {
int x = h_x[f];
int y = h_y[f];
- img_color(y, seq(x-draw_len, x+draw_len), 0) = 0.f;
- img_color(y, seq(x-draw_len, x+draw_len), 1) = 1.f;
- img_color(y, seq(x-draw_len, x+draw_len), 2) = 0.f;
+ img_color(x, seq(y-draw_len, y+draw_len), 0) = 0.f;
+ img_color(x, seq(y-draw_len, y+draw_len), 1) = 1.f;
+ img_color(x, seq(y-draw_len, y+draw_len), 2) = 0.f;
// Draw vertical line of (draw_len * 2 + 1) pixels centered on the corner
// Set only the first channel to 1 (green lines)
- img_color(seq(y-draw_len, y+draw_len), x, 0) = 0.f;
- img_color(seq(y-draw_len, y+draw_len), x, 1) = 1.f;
- img_color(seq(y-draw_len, y+draw_len), x, 2) = 0.f;
+ img_color(seq(x-draw_len, x+draw_len), y, 0) = 0.f;
+ img_color(seq(x-draw_len, x+draw_len), y, 1) = 1.f;
+ img_color(seq(x-draw_len, x+draw_len), y, 2) = 0.f;
}
printf("Features found: %lu\n", feat.getNumFeatures());
diff --git a/src/api/c/susan.cpp b/src/api/c/susan.cpp
index e37ba91..e070df8 100644
--- a/src/api/c/susan.cpp
+++ b/src/api/c/susan.cpp
@@ -56,6 +56,7 @@ af_err af_susan(af_features* out, const af_array in,
ARG_ASSERT(1, dims.ndims()==2);
ARG_ASSERT(2, radius < 10);
+ ARG_ASSERT(2, radius<=edge);
ARG_ASSERT(3, diff_thr > 0.0f);
ARG_ASSERT(4, geom_thr > 0.0f);
ARG_ASSERT(5, (feature_ratio > 0.0f && feature_ratio <= 1.0f));
diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp
index 770419c..ad5b702 100644
--- a/src/backend/cpu/susan.cpp
+++ b/src/backend/cpu/susan.cpp
@@ -26,9 +26,9 @@ void susan_responses(T* resp_out, const T* in,
const unsigned r = border_len;
const int rSqrd = radius*radius;
- for (unsigned x = r; x < idim1 - r; ++x) {
- for (unsigned y = r; y < idim0 - r; ++y) {
- const unsigned idx = x * idim0 + y;
+ for (unsigned y = r; y < idim1 - r; ++y) {
+ for (unsigned x = r; x < idim0 - r; ++x) {
+ const unsigned idx = y * idim0 + x;
T m_0 = in[idx];
float nM = 0.0f;
@@ -37,7 +37,7 @@ void susan_responses(T* resp_out, const T* in,
if (i*i + j*j < rSqrd) {
int p = x + i;
int q = y + j;
- T m = in[p * idim0 + q];
+ T m = in[p + idim0 * q];
float exp_pow = std::pow((m - m_0)/t, 6.0);
float cM = std::exp(-exp_pow);
nM += cM;
@@ -58,19 +58,19 @@ void non_maximal(float* x_out, float* y_out, float* resp_out,
// Responses on the border don't have 8-neighbors to compare, discard them
const unsigned r = border_len + 1;
- for (unsigned x = r; x < idim1 - r; x++) {
- for (unsigned y = r; y < idim0 - r; y++) {
- const T v = resp_in[x * idim0 + y];
+ for (unsigned y = r; y < idim1 - r; y++) {
+ for (unsigned x = r; x < idim0 - r; x++) {
+ const T v = resp_in[y * idim0 + x];
// Find maximum neighborhood response
T max_v;
- max_v = max(resp_in[(x-1) * idim0 + y-1], resp_in[x * idim0 + y-1]);
- max_v = max(max_v, resp_in[(x+1) * idim0 + y-1]);
- max_v = max(max_v, resp_in[(x-1) * idim0 + y ]);
- max_v = max(max_v, resp_in[(x+1) * idim0 + y ]);
- max_v = max(max_v, resp_in[(x-1) * idim0 + y+1]);
- max_v = max(max_v, resp_in[(x) * idim0 + y+1]);
- max_v = max(max_v, resp_in[(x+1) * idim0 + y+1]);
+ max_v = max(resp_in[(y-1) * idim0 + x-1], resp_in[y * idim0 + x-1]);
+ max_v = max(max_v, resp_in[(y+1) * idim0 + x-1]);
+ max_v = max(max_v, resp_in[(y-1) * idim0 + x ]);
+ max_v = max(max_v, resp_in[(y+1) * idim0 + x ]);
+ max_v = max(max_v, resp_in[(y-1) * idim0 + x+1]);
+ max_v = max(max_v, resp_in[(y) * idim0 + x+1]);
+ max_v = max(max_v, resp_in[(y+1) * idim0 + x+1]);
// Stores corner to {x,y,resp}_out if it's response is maximum compared
// to its 8-neighborhood and greater or equal minimum response
diff --git a/src/backend/cuda/kernel/susan.hpp b/src/backend/cuda/kernel/susan.hpp
index 882adcf..45d03f6 100644
--- a/src/backend/cuda/kernel/susan.hpp
+++ b/src/backend/cuda/kernel/susan.hpp
@@ -65,15 +65,15 @@ void susanKernel(T* out, const T* in,
#pragma unroll
for (int a=lx, gx2=gx; a<shrdLen; a+=BLOCK_X, gx2+=BLOCK_X) {
int i = gx2-radius;
- shrdMem[b*shrdLen+a] = in[i*idim0+j];
+ shrdMem[b*shrdLen+a] = in[j*idim0+i];
}
}
__syncthreads();
- if (gx < idim1 - edge && gy < idim0 - edge) {
- unsigned idx = gx*idim0 + gy;
+ if (gx < idim0 - edge && gy < idim1 - edge) {
+ unsigned idx = gy*idim0 + gx;
float nM = 0.0f;
- float m_0 = in[idx];
+ float m_0 = shrdMem[(ly+RADIUS)*shrdLen + lx+RADIUS];
#pragma unroll
for (int p=0; p<windLen; ++p) {
#pragma unroll
@@ -101,7 +101,7 @@ void susan_responses(T* out, const T* in,
const unsigned edge)
{
dim3 threads(BLOCK_X, BLOCK_Y);
- dim3 blocks(divup(idim1-edge*2, BLOCK_X), divup(idim0-edge*2, BLOCK_Y));
+ dim3 blocks(divup(idim0-edge*2, BLOCK_X), divup(idim1-edge*2, BLOCK_Y));
switch (radius) {
case 1: susanKernel<T, 1><<<blocks, threads>>>(out, in, idim0, idim1, t, g, edge); break;
@@ -130,18 +130,18 @@ void nonMaxKernel(float* x_out, float* y_out, float* resp_out, unsigned* count,
const unsigned gx = blockDim.x * blockIdx.x + threadIdx.x + r;
const unsigned gy = blockDim.y * blockIdx.y + threadIdx.y + r;
- if (gx < idim1 - r && gy < idim0 - r) {
- const T v = resp_in[gx * idim0 + gy];
+ if (gx < idim0 - r && gy < idim1 - r) {
+ const T v = resp_in[gy * idim0 + gx];
// Find maximum neighborhood response
T max_v;
- max_v = max_val(resp_in[(gx-1) * idim0 + gy-1], resp_in[gx * idim0 + gy-1]);
- max_v = max_val(max_v, resp_in[(gx+1) * idim0 + gy-1]);
- max_v = max_val(max_v, resp_in[(gx-1) * idim0 + gy ]);
- max_v = max_val(max_v, resp_in[(gx+1) * idim0 + gy ]);
- max_v = max_val(max_v, resp_in[(gx-1) * idim0 + gy+1]);
- max_v = max_val(max_v, resp_in[(gx) * idim0 + gy+1]);
- max_v = max_val(max_v, resp_in[(gx+1) * idim0 + gy+1]);
+ max_v = max_val(resp_in[(gy-1) * idim0 + gx-1], resp_in[gy * idim0 + gx-1]);
+ max_v = max_val(max_v, resp_in[(gy+1) * idim0 + gx-1]);
+ max_v = max_val(max_v, resp_in[(gy-1) * idim0 + gx ]);
+ max_v = max_val(max_v, resp_in[(gy+1) * idim0 + gx ]);
+ max_v = max_val(max_v, resp_in[(gy-1) * idim0 + gx+1]);
+ max_v = max_val(max_v, resp_in[(gy) * idim0 + gx+1]);
+ max_v = max_val(max_v, resp_in[(gy+1) * idim0 + gx+1]);
// Stores corner to {x,y,resp}_out if it's response is maximum compared
// to its 8-neighborhood and greater or equal minimum response
@@ -162,7 +162,7 @@ void nonMaximal(float* x_out, float* y_out, float* resp_out,
const T * resp_in, const unsigned edge, const unsigned max_corners)
{
dim3 threads(BLOCK_X, BLOCK_Y);
- dim3 blocks(divup(idim1-edge*2, BLOCK_X), divup(idim0-edge*2, BLOCK_Y));
+ dim3 blocks(divup(idim0-edge*2, BLOCK_X), divup(idim1-edge*2, BLOCK_Y));
unsigned* d_corners_found = memAlloc<unsigned>(1);
CUDA_CHECK(cudaMemset(d_corners_found, 0, sizeof(unsigned)));
diff --git a/src/backend/opencl/kernel/susan.cl b/src/backend/opencl/kernel/susan.cl
index 69131fd..90ef70f 100644
--- a/src/backend/opencl/kernel/susan.cl
+++ b/src/backend/opencl/kernel/susan.cl
@@ -32,15 +32,15 @@ void susan_responses(global T* out, global const T* in,
#pragma unroll
for (int a=lx, gx2=gx; a<shrdLen; a+=BLOCK_X, gx2+=BLOCK_X) {
int i = gx2-RADIUS;
- localMem[b*shrdLen+a] = in[i*idim0+j];
+ localMem[b*shrdLen+a] = in[i+idim0*j];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
- if (gx < idim1 - edge && gy < idim0 - edge) {
- unsigned idx = gx*idim0 + gy;
+ if (gx < idim0 - edge && gy < idim1 - edge) {
+ unsigned idx = gx + idim0 * gy;
float nM = 0.0f;
- float m_0 = in[idx];
+ float m_0 = localMem[(ly+RADIUS)*shrdLen + lx+RADIUS];
#pragma unroll
for (int p=0; p<windLen; ++p) {
#pragma unroll
@@ -75,18 +75,18 @@ void non_maximal(global float* x_out, global float* y_out,
const unsigned gx = get_global_id(0) + r;
const unsigned gy = get_global_id(1) + r;
- if (gx < idim1 - r && gy < idim0 - r) {
- const T v = resp_in[gx * idim0 + gy];
+ if (gx < idim0 - r && gy < idim1 - r) {
+ const T v = resp_in[gy * idim0 + gx];
// Find maximum neighborhood response
T max_v;
- max_v = MAX_VAL(resp_in[(gx-1) * idim0 + gy-1], resp_in[gx * idim0 + gy-1]);
- max_v = MAX_VAL(max_v, resp_in[(gx+1) * idim0 + gy-1]);
- max_v = MAX_VAL(max_v, resp_in[(gx-1) * idim0 + gy ]);
- max_v = MAX_VAL(max_v, resp_in[(gx+1) * idim0 + gy ]);
- max_v = MAX_VAL(max_v, resp_in[(gx-1) * idim0 + gy+1]);
- max_v = MAX_VAL(max_v, resp_in[(gx) * idim0 + gy+1]);
- max_v = MAX_VAL(max_v, resp_in[(gx+1) * idim0 + gy+1]);
+ max_v = MAX_VAL(resp_in[(gy-1) * idim0 + gx-1], resp_in[gy * idim0 + gx-1]);
+ max_v = MAX_VAL(max_v, resp_in[(gy+1) * idim0 + gx-1]);
+ max_v = MAX_VAL(max_v, resp_in[(gy-1) * idim0 + gx ]);
+ max_v = MAX_VAL(max_v, resp_in[(gy+1) * idim0 + gx ]);
+ max_v = MAX_VAL(max_v, resp_in[(gy-1) * idim0 + gx+1]);
+ max_v = MAX_VAL(max_v, resp_in[(gy) * idim0 + gx+1]);
+ max_v = MAX_VAL(max_v, resp_in[(gy+1) * idim0 + gx+1]);
// Stores corner to {x,y,resp}_out if it's response is maximum compared
// to its 8-neighborhood and greater or equal minimum response
diff --git a/src/backend/opencl/kernel/susan.hpp b/src/backend/opencl/kernel/susan.hpp
index 9551dbc..814616e 100644
--- a/src/backend/opencl/kernel/susan.hpp
+++ b/src/backend/opencl/kernel/susan.hpp
@@ -73,8 +73,8 @@ void susan(cl::Buffer* out, const cl::Buffer* in,
float, float, unsigned>(*suKernel[device]);
NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
- NDRange global(divup(idim1-2*edge, local[0])*local[0],
- divup(idim0-2*edge, local[1])*local[1]);
+ NDRange global(divup(idim0-2*edge, local[0])*local[0],
+ divup(idim1-2*edge, local[1])*local[1]);
susanOp(EnqueueArgs(getQueue(), global, local), *out, *in, idim0, idim1, t, g, edge);
@@ -122,8 +122,8 @@ unsigned nonMaximal(cl::Buffer* x_out, cl::Buffer* y_out, cl::Buffer* resp_out,
unsigned, unsigned>(*nmKernel[device]);
NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
- NDRange global(divup(idim1-2*edge, local[0])*local[0],
- divup(idim0-2*edge, local[1])*local[1]);
+ NDRange global(divup(idim0-2*edge, local[0])*local[0],
+ divup(idim1-2*edge, local[1])*local[1]);
nonMaximalOp(EnqueueArgs(getQueue(), global, local),
*x_out, *y_out, *resp_out, *d_corners_found,
diff --git a/test/data b/test/data
index 79b0784..608ada5 160000
--- a/test/data
+++ b/test/data
@@ -1 +1 @@
-Subproject commit 79b0784a55044869ac32631560fae1e39aa53bbd
+Subproject commit 608ada5b36e4059a463a304ebc84991001094153
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list