[arrayfire] 166/408: OpenCL backend for SUSAN dectector

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:47 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit e57ed916c2ab7be1d53c61bd9f5248db8e1d74eb
Author: pradeep <pradeep at arrayfire.com>
Date:   Fri Jul 24 14:48:32 2015 -0400

    OpenCL backend for SUSAN dectector
---
 src/backend/opencl/kernel/susan.cl  | 121 ++++++++++++++++++++++++
 src/backend/opencl/kernel/susan.hpp | 180 ++++++++++++++++++++++++++++++++++++
 src/backend/opencl/susan.cpp        |  62 ++++++++++++-
 3 files changed, 361 insertions(+), 2 deletions(-)

diff --git a/src/backend/opencl/kernel/susan.cl b/src/backend/opencl/kernel/susan.cl
new file mode 100644
index 0000000..b70538e
--- /dev/null
+++ b/src/backend/opencl/kernel/susan.cl
@@ -0,0 +1,121 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#define MAX_VAL(A,B) (A) < (B) ? (B) : (A)
+
+#ifdef RESPONSE
+kernel
+void susan_responses(global T* out, global const T* in,
+                     const unsigned idim0, const unsigned idim1,
+                     const float t, const float g,
+                     const unsigned edge)
+{
+    const int rSqrd   = RADIUS*RADIUS;
+    const int windLen = 2*RADIUS+1;
+    const int shrdLen = BLOCK_X + windLen-1;
+    local T localMem[LOCAL_MEM_SIZE];
+
+    const unsigned lx = get_local_id(0);
+    const unsigned ly = get_local_id(1);
+    const unsigned gx = get_global_id(0) + edge;
+    const unsigned gy = get_global_id(1) + edge;
+
+#pragma unroll
+    for (int b=ly, gy2=gy; b<shrdLen; b+=BLOCK_Y, gy2+=BLOCK_Y) {
+        int j = gy2-RADIUS;
+#pragma unroll
+        for (int a=lx, gx2=gx; a<shrdLen; a+=BLOCK_X, gx2+=BLOCK_X) {
+            int i = gx2-RADIUS;
+            localMem[b*shrdLen+a] = in[i*idim0+j];
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (gx < idim1 - edge && gy < idim0 - edge) {
+        unsigned idx = gx*idim0 + gy;
+        float nM  = 0.0f;
+        float m_0 = in[idx];
+#pragma unroll
+        for (int p=0; p<windLen; ++p) {
+#pragma unroll
+            for (int q=0; q<windLen; ++q) {
+                int i = p - RADIUS;
+                int j = q - RADIUS;
+                int a = lx + RADIUS + i;
+                int b = ly + RADIUS + j;
+                if (i*i + j*j < rSqrd) {
+                    float m = localMem[b * shrdLen + a];
+                    float exp_pow = pow((m - m_0)/t, 6.0f);
+                    float cM = exp(-exp_pow);
+                    nM += cM;
+                }
+            }
+        }
+        out[idx] = nM < g ? g - nM : (T)0;
+    }
+}
+#endif
+
+#ifdef NONMAX
+kernel
+void non_maximal(global float* x_out, global float* y_out,
+                 global float* resp_out, global unsigned* count,
+                 const unsigned idim0, const unsigned idim1, global const T* resp_in,
+                 const unsigned edge, const unsigned max_corners)
+{
+    // Responses on the border don't have 8-neighbors to compare, discard them
+    const unsigned r = edge + 1;
+
+    const unsigned gx = get_global_id(0) + r;
+    const unsigned gy = get_global_id(1) + r;
+
+    if (gx < idim1 - r && gy < idim0 - r) {
+        const T v = resp_in[gx * idim0 + gy];
+
+        // Find maximum neighborhood response
+        T max_v;
+        max_v = MAX_VAL(resp_in[(gx-1) * idim0 + gy-1], resp_in[gx * idim0 + gy-1]);
+        max_v = MAX_VAL(max_v, resp_in[(gx+1) * idim0 + gy-1]);
+        max_v = MAX_VAL(max_v, resp_in[(gx-1) * idim0 + gy  ]);
+        max_v = MAX_VAL(max_v, resp_in[(gx+1) * idim0 + gy  ]);
+        max_v = MAX_VAL(max_v, resp_in[(gx-1) * idim0 + gy+1]);
+        max_v = MAX_VAL(max_v, resp_in[(gx)   * idim0 + gy+1]);
+        max_v = MAX_VAL(max_v, resp_in[(gx+1) * idim0 + gy+1]);
+
+        // Stores corner to {x,y,resp}_out if it's response is maximum compared
+        // to its 8-neighborhood and greater or equal minimum response
+        if (v > max_v) {
+            const unsigned idx = atomic_inc(count);
+            if (idx < max_corners) {
+                x_out[idx]    = (float)gx;
+                y_out[idx]    = (float)gy;
+                resp_out[idx] = (float)v;
+            }
+        }
+    }
+}
+#endif
+
+#ifdef KEEP_CORNERS
+kernel
+void keep_corners(global float* x_out, global float* y_out, global float* resp_out,
+                  global const float* x_in, global const float* y_in,
+                  global const float* resp_in, global const unsigned* resp_idx,
+                  const unsigned n_corners)
+{
+    const unsigned f = get_global_id(0);
+
+    // Keep only the first n_feat features
+    if (f < n_corners) {
+        x_out[f] = x_in[(unsigned)resp_idx[f]];
+        y_out[f] = y_in[(unsigned)resp_idx[f]];
+        resp_out[f] = resp_in[f];
+    }
+}
+#endif
diff --git a/src/backend/opencl/kernel/susan.hpp b/src/backend/opencl/kernel/susan.hpp
new file mode 100644
index 0000000..ba3bcda
--- /dev/null
+++ b/src/backend/opencl/kernel/susan.hpp
@@ -0,0 +1,180 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/defines.h>
+#include <program.hpp>
+#include <dispatch.hpp>
+#include <err_opencl.hpp>
+#include <debug_opencl.hpp>
+#include <kernel_headers/susan.hpp>
+#include <memory.hpp>
+#include <map>
+#include "config.hpp"
+
+using cl::Buffer;
+using cl::Program;
+using cl::Kernel;
+using cl::EnqueueArgs;
+using cl::LocalSpaceArg;
+using cl::NDRange;
+
+namespace opencl
+{
+
+namespace kernel
+{
+
+static const unsigned THREADS_PER_BLOCK = 256;
+static const unsigned SUSAN_THREADS_X = 16;
+static const unsigned SUSAN_THREADS_Y = 16;
+
+template<typename T, unsigned radius>
+void susan(cl::Buffer* out, const cl::Buffer* in,
+           const unsigned idim0, const unsigned idim1,
+           const float t, const float g, const unsigned edge)
+{
+    try {
+        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+        static std::map<int, Program*> suProg;
+        static std::map<int, Kernel*>  suKernel;
+
+        int device = getActiveDeviceId();
+
+        std::call_once( compileFlags[device], [device] () {
+
+                const size_t LOCAL_MEM_SIZE = (SUSAN_THREADS_X+2*radius)*(SUSAN_THREADS_Y+2*radius);
+                std::ostringstream options;
+                options << " -D T=" << dtype_traits<T>::getName()
+                        << " -D LOCAL_MEM_SIZE=" << LOCAL_MEM_SIZE
+                        << " -D BLOCK_X="<< SUSAN_THREADS_X
+                        << " -D BLOCK_Y="<< SUSAN_THREADS_Y
+                        << " -D RADIUS="<< radius
+                        << " -D RESPONSE";
+
+                if (std::is_same<T, double>::value ||
+                    std::is_same<T, cdouble>::value) {
+                    options << " -D USE_DOUBLE";
+                }
+
+                cl::Program prog;
+                buildProgram(prog, susan_cl, susan_cl_len, options.str());
+                suProg[device]   = new Program(prog);
+                suKernel[device] = new Kernel(*suProg[device], "susan_responses");
+            });
+
+        auto susanOp = make_kernel<Buffer, Buffer,
+                                   unsigned, unsigned,
+                                   float, float, unsigned>(*suKernel[device]);
+
+        NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
+        NDRange global(divup(idim1-2*edge, local[0])*local[0],
+                       divup(idim0-2*edge, local[1])*local[1]);
+
+        susanOp(EnqueueArgs(getQueue(), global, local), *out, *in, idim0, idim1, t, g, edge);
+
+    } catch (cl::Error err) {
+        CL_TO_AF_ERROR(err);
+        throw;
+    }
+}
+
+template<typename T>
+unsigned nonMaximal(cl::Buffer* x_out, cl::Buffer* y_out, cl::Buffer* resp_out,
+                    const unsigned idim0, const unsigned idim1, const cl::Buffer* resp_in,
+                    const unsigned edge, const unsigned max_corners)
+{
+    unsigned corners_found = 0;
+    try {
+        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+        static std::map<int, Program*> nmProg;
+        static std::map<int, Kernel*>  nmKernel;
+
+        int device = getActiveDeviceId();
+
+        std::call_once( compileFlags[device], [device] () {
+
+                std::ostringstream options;
+                options << " -D T=" << dtype_traits<T>::getName()
+                        << " -D NONMAX";
+
+                if (std::is_same<T, double>::value ||
+                    std::is_same<T, cdouble>::value) {
+                    options << " -D USE_DOUBLE";
+                }
+
+                cl::Program prog;
+                buildProgram(prog, susan_cl, susan_cl_len, options.str());
+                nmProg[device]   = new Program(prog);
+                nmKernel[device] = new Kernel(*nmProg[device], "non_maximal");
+            });
+
+        cl::Buffer *d_corners_found = bufferAlloc(sizeof(unsigned));
+        getQueue().enqueueWriteBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found);
+
+        auto nonMaximalOp = make_kernel<Buffer, Buffer, Buffer, Buffer,
+                                        unsigned, unsigned, Buffer,
+                                        unsigned, unsigned>(*nmKernel[device]);
+
+        NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
+        NDRange global(divup(idim1-2*edge, local[0])*local[0],
+                       divup(idim0-2*edge, local[1])*local[1]);
+
+        nonMaximalOp(EnqueueArgs(getQueue(), global, local),
+                     *x_out, *y_out, *resp_out, *d_corners_found,
+                     idim0, idim1, *resp_in, edge, max_corners);
+
+        getQueue().enqueueReadBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found);
+        bufferFree(d_corners_found);
+    } catch (cl::Error err) {
+        CL_TO_AF_ERROR(err);
+        throw;
+    }
+    return corners_found;
+}
+
+void keepCorners(cl::Buffer* x_out, cl::Buffer* y_out, cl::Buffer* resp_out,
+                 const cl::Buffer* x_in, const cl::Buffer* y_in,
+                 const cl::Buffer* resp_in, const cl::Buffer* resp_idx,
+                 const unsigned n_corners)
+{
+    try {
+        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+        static std::map<int, Program*> kcProg;
+        static std::map<int, Kernel*>  kcKernel;
+
+        int device = getActiveDeviceId();
+
+        std::call_once( compileFlags[device], [device] () {
+                std::ostringstream options;
+                options << " -D KEEP_CORNERS";
+                cl::Program prog;
+                buildProgram(prog, susan_cl, susan_cl_len, options.str());
+                kcProg[device]   = new Program(prog);
+                kcKernel[device] = new Kernel(*kcProg[device], "keep_corners");
+            });
+
+        auto kcOp = make_kernel<Buffer, Buffer, Buffer,
+                                Buffer, Buffer,
+                                Buffer, Buffer,
+                                unsigned>(*kcKernel[device]);
+
+        NDRange local(THREADS_PER_BLOCK, 1);
+        NDRange global(divup(n_corners, local[0]), 1);
+
+        kcOp(EnqueueArgs(getQueue(), global, local),
+             *x_out, *y_out, *resp_out, *x_in, *y_in, *resp_in, *resp_idx, n_corners);
+    } catch (cl::Error err) {
+        CL_TO_AF_ERROR(err);
+        throw;
+    }
+}
+
+}
+
+}
diff --git a/src/backend/opencl/susan.cpp b/src/backend/opencl/susan.cpp
index 1c3db1c..15308d9 100644
--- a/src/backend/opencl/susan.cpp
+++ b/src/backend/opencl/susan.cpp
@@ -10,6 +10,9 @@
 #include <af/features.h>
 #include <Array.hpp>
 #include <err_opencl.hpp>
+#include <kernel/susan.hpp>
+#include <cmath>
+#include <sort_index.hpp>
 
 using af::features;
 
@@ -17,12 +20,67 @@ namespace opencl
 {
 
 template<typename T>
-unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
+unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
                const Array<T> &in,
                const unsigned radius, const float diff_thr, const float geom_thr,
                const float feature_ratio, const unsigned edge)
 {
-    OPENCL_NOT_SUPPORTED();
+    dim4 idims = in.dims();
+
+    const unsigned corner_lim = in.elements() * feature_ratio;
+    cl::Buffer* x_corners     = bufferAlloc(corner_lim * sizeof(float));
+    cl::Buffer* y_corners     = bufferAlloc(corner_lim * sizeof(float));
+    cl::Buffer* resp_corners  = bufferAlloc(corner_lim * sizeof(float));
+
+    cl::Buffer* resp = bufferAlloc(in.elements()*sizeof(float));
+
+    switch(radius) {
+        case 1: kernel::susan<T, 1>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+        case 2: kernel::susan<T, 2>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+        case 3: kernel::susan<T, 3>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+        case 4: kernel::susan<T, 4>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+        case 5: kernel::susan<T, 5>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+        case 6: kernel::susan<T, 6>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+        case 7: kernel::susan<T, 7>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+        case 8: kernel::susan<T, 8>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+        case 9: kernel::susan<T, 9>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+    }
+
+    unsigned corners_found = kernel::nonMaximal<T>(x_corners, y_corners, resp_corners,
+                                                   idims[0], idims[1], resp, edge, corner_lim);
+    bufferFree(resp);
+
+    const unsigned corners_out = std::min(corners_found, corner_lim);
+    if (corners_out == 0)
+        return 0;
+
+    if (corners_found > corners_out) {
+        Array<float> susan_responses = createDeviceDataArray<float>(dim4(corners_found),
+                                                                    (void*)((*resp_corners)()));
+        Array<float> susan_sorted = createEmptyArray<float>(dim4(corners_found));
+        Array<unsigned> susan_idx = createEmptyArray<unsigned>(dim4(corners_found));
+
+        // Sort susan responses
+        sort_index<float, false>(susan_sorted, susan_idx, susan_responses, 0);
+
+        x_out = createEmptyArray<float>(dim4(corners_out));
+        y_out = createEmptyArray<float>(dim4(corners_out));
+        resp_out = createEmptyArray<float>(dim4(corners_out));
+
+        // Keep only the corners with higher SUSAN responses
+        kernel::keepCorners(x_out.get(), y_out.get(), resp_out.get(),
+                             x_corners, y_corners, susan_sorted.get(), susan_idx.get(),
+                             corners_out);
+
+        bufferFree(x_corners);
+        bufferFree(y_corners);
+    } else {
+        x_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*x_corners)()));
+        y_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*y_corners)()));
+        resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*resp_corners)()));
+    }
+
+    return corners_out;
 }
 
 #define INSTANTIATE(T) \

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git



More information about the debian-science-commits mailing list