[arrayfire] 166/408: OpenCL backend for SUSAN dectector
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:47 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit e57ed916c2ab7be1d53c61bd9f5248db8e1d74eb
Author: pradeep <pradeep at arrayfire.com>
Date: Fri Jul 24 14:48:32 2015 -0400
OpenCL backend for SUSAN dectector
---
src/backend/opencl/kernel/susan.cl | 121 ++++++++++++++++++++++++
src/backend/opencl/kernel/susan.hpp | 180 ++++++++++++++++++++++++++++++++++++
src/backend/opencl/susan.cpp | 62 ++++++++++++-
3 files changed, 361 insertions(+), 2 deletions(-)
diff --git a/src/backend/opencl/kernel/susan.cl b/src/backend/opencl/kernel/susan.cl
new file mode 100644
index 0000000..b70538e
--- /dev/null
+++ b/src/backend/opencl/kernel/susan.cl
@@ -0,0 +1,121 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#define MAX_VAL(A,B) (A) < (B) ? (B) : (A)
+
+#ifdef RESPONSE
+kernel
+void susan_responses(global T* out, global const T* in,
+ const unsigned idim0, const unsigned idim1,
+ const float t, const float g,
+ const unsigned edge)
+{
+ const int rSqrd = RADIUS*RADIUS;
+ const int windLen = 2*RADIUS+1;
+ const int shrdLen = BLOCK_X + windLen-1;
+ local T localMem[LOCAL_MEM_SIZE];
+
+ const unsigned lx = get_local_id(0);
+ const unsigned ly = get_local_id(1);
+ const unsigned gx = get_global_id(0) + edge;
+ const unsigned gy = get_global_id(1) + edge;
+
+#pragma unroll
+ for (int b=ly, gy2=gy; b<shrdLen; b+=BLOCK_Y, gy2+=BLOCK_Y) {
+ int j = gy2-RADIUS;
+#pragma unroll
+ for (int a=lx, gx2=gx; a<shrdLen; a+=BLOCK_X, gx2+=BLOCK_X) {
+ int i = gx2-RADIUS;
+ localMem[b*shrdLen+a] = in[i*idim0+j];
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (gx < idim1 - edge && gy < idim0 - edge) {
+ unsigned idx = gx*idim0 + gy;
+ float nM = 0.0f;
+ float m_0 = in[idx];
+#pragma unroll
+ for (int p=0; p<windLen; ++p) {
+#pragma unroll
+ for (int q=0; q<windLen; ++q) {
+ int i = p - RADIUS;
+ int j = q - RADIUS;
+ int a = lx + RADIUS + i;
+ int b = ly + RADIUS + j;
+ if (i*i + j*j < rSqrd) {
+ float m = localMem[b * shrdLen + a];
+ float exp_pow = pow((m - m_0)/t, 6.0f);
+ float cM = exp(-exp_pow);
+ nM += cM;
+ }
+ }
+ }
+ out[idx] = nM < g ? g - nM : (T)0;
+ }
+}
+#endif
+
+#ifdef NONMAX
+kernel
+void non_maximal(global float* x_out, global float* y_out,
+ global float* resp_out, global unsigned* count,
+ const unsigned idim0, const unsigned idim1, global const T* resp_in,
+ const unsigned edge, const unsigned max_corners)
+{
+ // Responses on the border don't have 8-neighbors to compare, discard them
+ const unsigned r = edge + 1;
+
+ const unsigned gx = get_global_id(0) + r;
+ const unsigned gy = get_global_id(1) + r;
+
+ if (gx < idim1 - r && gy < idim0 - r) {
+ const T v = resp_in[gx * idim0 + gy];
+
+ // Find maximum neighborhood response
+ T max_v;
+ max_v = MAX_VAL(resp_in[(gx-1) * idim0 + gy-1], resp_in[gx * idim0 + gy-1]);
+ max_v = MAX_VAL(max_v, resp_in[(gx+1) * idim0 + gy-1]);
+ max_v = MAX_VAL(max_v, resp_in[(gx-1) * idim0 + gy ]);
+ max_v = MAX_VAL(max_v, resp_in[(gx+1) * idim0 + gy ]);
+ max_v = MAX_VAL(max_v, resp_in[(gx-1) * idim0 + gy+1]);
+ max_v = MAX_VAL(max_v, resp_in[(gx) * idim0 + gy+1]);
+ max_v = MAX_VAL(max_v, resp_in[(gx+1) * idim0 + gy+1]);
+
+ // Stores corner to {x,y,resp}_out if it's response is maximum compared
+ // to its 8-neighborhood and greater or equal minimum response
+ if (v > max_v) {
+ const unsigned idx = atomic_inc(count);
+ if (idx < max_corners) {
+ x_out[idx] = (float)gx;
+ y_out[idx] = (float)gy;
+ resp_out[idx] = (float)v;
+ }
+ }
+ }
+}
+#endif
+
+#ifdef KEEP_CORNERS
+kernel
+void keep_corners(global float* x_out, global float* y_out, global float* resp_out,
+ global const float* x_in, global const float* y_in,
+ global const float* resp_in, global const unsigned* resp_idx,
+ const unsigned n_corners)
+{
+ const unsigned f = get_global_id(0);
+
+ // Keep only the first n_feat features
+ if (f < n_corners) {
+ x_out[f] = x_in[(unsigned)resp_idx[f]];
+ y_out[f] = y_in[(unsigned)resp_idx[f]];
+ resp_out[f] = resp_in[f];
+ }
+}
+#endif
diff --git a/src/backend/opencl/kernel/susan.hpp b/src/backend/opencl/kernel/susan.hpp
new file mode 100644
index 0000000..ba3bcda
--- /dev/null
+++ b/src/backend/opencl/kernel/susan.hpp
@@ -0,0 +1,180 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/defines.h>
+#include <program.hpp>
+#include <dispatch.hpp>
+#include <err_opencl.hpp>
+#include <debug_opencl.hpp>
+#include <kernel_headers/susan.hpp>
+#include <memory.hpp>
+#include <map>
+#include "config.hpp"
+
+using cl::Buffer;
+using cl::Program;
+using cl::Kernel;
+using cl::EnqueueArgs;
+using cl::LocalSpaceArg;
+using cl::NDRange;
+
+namespace opencl
+{
+
+namespace kernel
+{
+
+static const unsigned THREADS_PER_BLOCK = 256;
+static const unsigned SUSAN_THREADS_X = 16;
+static const unsigned SUSAN_THREADS_Y = 16;
+
+template<typename T, unsigned radius>
+void susan(cl::Buffer* out, const cl::Buffer* in,
+ const unsigned idim0, const unsigned idim1,
+ const float t, const float g, const unsigned edge)
+{
+ try {
+ static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+ static std::map<int, Program*> suProg;
+ static std::map<int, Kernel*> suKernel;
+
+ int device = getActiveDeviceId();
+
+ std::call_once( compileFlags[device], [device] () {
+
+ const size_t LOCAL_MEM_SIZE = (SUSAN_THREADS_X+2*radius)*(SUSAN_THREADS_Y+2*radius);
+ std::ostringstream options;
+ options << " -D T=" << dtype_traits<T>::getName()
+ << " -D LOCAL_MEM_SIZE=" << LOCAL_MEM_SIZE
+ << " -D BLOCK_X="<< SUSAN_THREADS_X
+ << " -D BLOCK_Y="<< SUSAN_THREADS_Y
+ << " -D RADIUS="<< radius
+ << " -D RESPONSE";
+
+ if (std::is_same<T, double>::value ||
+ std::is_same<T, cdouble>::value) {
+ options << " -D USE_DOUBLE";
+ }
+
+ cl::Program prog;
+ buildProgram(prog, susan_cl, susan_cl_len, options.str());
+ suProg[device] = new Program(prog);
+ suKernel[device] = new Kernel(*suProg[device], "susan_responses");
+ });
+
+ auto susanOp = make_kernel<Buffer, Buffer,
+ unsigned, unsigned,
+ float, float, unsigned>(*suKernel[device]);
+
+ NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
+ NDRange global(divup(idim1-2*edge, local[0])*local[0],
+ divup(idim0-2*edge, local[1])*local[1]);
+
+ susanOp(EnqueueArgs(getQueue(), global, local), *out, *in, idim0, idim1, t, g, edge);
+
+ } catch (cl::Error err) {
+ CL_TO_AF_ERROR(err);
+ throw;
+ }
+}
+
+template<typename T>
+unsigned nonMaximal(cl::Buffer* x_out, cl::Buffer* y_out, cl::Buffer* resp_out,
+ const unsigned idim0, const unsigned idim1, const cl::Buffer* resp_in,
+ const unsigned edge, const unsigned max_corners)
+{
+ unsigned corners_found = 0;
+ try {
+ static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+ static std::map<int, Program*> nmProg;
+ static std::map<int, Kernel*> nmKernel;
+
+ int device = getActiveDeviceId();
+
+ std::call_once( compileFlags[device], [device] () {
+
+ std::ostringstream options;
+ options << " -D T=" << dtype_traits<T>::getName()
+ << " -D NONMAX";
+
+ if (std::is_same<T, double>::value ||
+ std::is_same<T, cdouble>::value) {
+ options << " -D USE_DOUBLE";
+ }
+
+ cl::Program prog;
+ buildProgram(prog, susan_cl, susan_cl_len, options.str());
+ nmProg[device] = new Program(prog);
+ nmKernel[device] = new Kernel(*nmProg[device], "non_maximal");
+ });
+
+ cl::Buffer *d_corners_found = bufferAlloc(sizeof(unsigned));
+ getQueue().enqueueWriteBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found);
+
+ auto nonMaximalOp = make_kernel<Buffer, Buffer, Buffer, Buffer,
+ unsigned, unsigned, Buffer,
+ unsigned, unsigned>(*nmKernel[device]);
+
+ NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
+ NDRange global(divup(idim1-2*edge, local[0])*local[0],
+ divup(idim0-2*edge, local[1])*local[1]);
+
+ nonMaximalOp(EnqueueArgs(getQueue(), global, local),
+ *x_out, *y_out, *resp_out, *d_corners_found,
+ idim0, idim1, *resp_in, edge, max_corners);
+
+ getQueue().enqueueReadBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found);
+ bufferFree(d_corners_found);
+ } catch (cl::Error err) {
+ CL_TO_AF_ERROR(err);
+ throw;
+ }
+ return corners_found;
+}
+
+void keepCorners(cl::Buffer* x_out, cl::Buffer* y_out, cl::Buffer* resp_out,
+ const cl::Buffer* x_in, const cl::Buffer* y_in,
+ const cl::Buffer* resp_in, const cl::Buffer* resp_idx,
+ const unsigned n_corners)
+{
+ try {
+ static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+ static std::map<int, Program*> kcProg;
+ static std::map<int, Kernel*> kcKernel;
+
+ int device = getActiveDeviceId();
+
+ std::call_once( compileFlags[device], [device] () {
+ std::ostringstream options;
+ options << " -D KEEP_CORNERS";
+ cl::Program prog;
+ buildProgram(prog, susan_cl, susan_cl_len, options.str());
+ kcProg[device] = new Program(prog);
+ kcKernel[device] = new Kernel(*kcProg[device], "keep_corners");
+ });
+
+ auto kcOp = make_kernel<Buffer, Buffer, Buffer,
+ Buffer, Buffer,
+ Buffer, Buffer,
+ unsigned>(*kcKernel[device]);
+
+ NDRange local(THREADS_PER_BLOCK, 1);
+ NDRange global(divup(n_corners, local[0]), 1);
+
+ kcOp(EnqueueArgs(getQueue(), global, local),
+ *x_out, *y_out, *resp_out, *x_in, *y_in, *resp_in, *resp_idx, n_corners);
+ } catch (cl::Error err) {
+ CL_TO_AF_ERROR(err);
+ throw;
+ }
+}
+
+}
+
+}
diff --git a/src/backend/opencl/susan.cpp b/src/backend/opencl/susan.cpp
index 1c3db1c..15308d9 100644
--- a/src/backend/opencl/susan.cpp
+++ b/src/backend/opencl/susan.cpp
@@ -10,6 +10,9 @@
#include <af/features.h>
#include <Array.hpp>
#include <err_opencl.hpp>
+#include <kernel/susan.hpp>
+#include <cmath>
+#include <sort_index.hpp>
using af::features;
@@ -17,12 +20,67 @@ namespace opencl
{
template<typename T>
-unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
+unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
const Array<T> &in,
const unsigned radius, const float diff_thr, const float geom_thr,
const float feature_ratio, const unsigned edge)
{
- OPENCL_NOT_SUPPORTED();
+ dim4 idims = in.dims();
+
+ const unsigned corner_lim = in.elements() * feature_ratio;
+ cl::Buffer* x_corners = bufferAlloc(corner_lim * sizeof(float));
+ cl::Buffer* y_corners = bufferAlloc(corner_lim * sizeof(float));
+ cl::Buffer* resp_corners = bufferAlloc(corner_lim * sizeof(float));
+
+ cl::Buffer* resp = bufferAlloc(in.elements()*sizeof(float));
+
+ switch(radius) {
+ case 1: kernel::susan<T, 1>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 2: kernel::susan<T, 2>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 3: kernel::susan<T, 3>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 4: kernel::susan<T, 4>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 5: kernel::susan<T, 5>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 6: kernel::susan<T, 6>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 7: kernel::susan<T, 7>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 8: kernel::susan<T, 8>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 9: kernel::susan<T, 9>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ }
+
+ unsigned corners_found = kernel::nonMaximal<T>(x_corners, y_corners, resp_corners,
+ idims[0], idims[1], resp, edge, corner_lim);
+ bufferFree(resp);
+
+ const unsigned corners_out = std::min(corners_found, corner_lim);
+ if (corners_out == 0)
+ return 0;
+
+ if (corners_found > corners_out) {
+ Array<float> susan_responses = createDeviceDataArray<float>(dim4(corners_found),
+ (void*)((*resp_corners)()));
+ Array<float> susan_sorted = createEmptyArray<float>(dim4(corners_found));
+ Array<unsigned> susan_idx = createEmptyArray<unsigned>(dim4(corners_found));
+
+ // Sort susan responses
+ sort_index<float, false>(susan_sorted, susan_idx, susan_responses, 0);
+
+ x_out = createEmptyArray<float>(dim4(corners_out));
+ y_out = createEmptyArray<float>(dim4(corners_out));
+ resp_out = createEmptyArray<float>(dim4(corners_out));
+
+ // Keep only the corners with higher SUSAN responses
+ kernel::keepCorners(x_out.get(), y_out.get(), resp_out.get(),
+ x_corners, y_corners, susan_sorted.get(), susan_idx.get(),
+ corners_out);
+
+ bufferFree(x_corners);
+ bufferFree(y_corners);
+ } else {
+ x_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*x_corners)()));
+ y_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*y_corners)()));
+ resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*resp_corners)()));
+ }
+
+ return corners_out;
}
#define INSTANTIATE(T) \
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list