[arrayfire] 78/408: Added OpenCL backend for Harris corner detector

Mon Sep 21 19:11:21 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit d1837321bcc323357ee6fca7c08974ddda6eba8e
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date:   Thu Jul 2 13:39:04 2015 -0400

    Added OpenCL backend for Harris corner detector
---
 src/backend/opencl/harris.cpp        |  57 ++++++
 src/backend/opencl/harris.hpp        |  23 +++
 src/backend/opencl/kernel/harris.cl  | 117 +++++++++++
 src/backend/opencl/kernel/harris.hpp | 374 +++++++++++++++++++++++++++++++++++
 4 files changed, 571 insertions(+)

diff --git a/src/backend/opencl/harris.cpp b/src/backend/opencl/harris.cpp
new file mode 100644
index 0000000..cc9a384
--- /dev/null
+++ b/src/backend/opencl/harris.cpp
@@ -0,0 +1,57 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <af/features.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_opencl.hpp>
+#include <handle.hpp>
+#include <kernel/harris.hpp>
+
+using af::dim4;
+using af::features;
+
+namespace opencl
+{
+
+template<typename T, typename convAccT>
+unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
+                const Array<T> &in, const unsigned max_corners, const float min_response,
+                const float sigma, const unsigned filter_len, const float k_thr)
+{
+    unsigned nfeat;
+
+    Param x;
+    Param y;
+    Param score;
+
+    kernel::harris<T, convAccT>(&nfeat, x, y, score, in,
+                                max_corners, min_response,
+                                sigma, filter_len, k_thr);
+
+    if (nfeat > 0) {
+        x_out = createParamArray<float>(x);
+        y_out = createParamArray<float>(y);
+        score_out = createParamArray<float>(score);
+    }
+
+    return nfeat;
+}
+
+#define INSTANTIATE(T, convAccT)                                                                                    \
+    template unsigned harris<T, convAccT>(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,        \
+                                          const Array<T> &in, const unsigned max_corners, const float min_response, \
+                                          const float sigma, const unsigned filter_len, const float k_thr);
+
+INSTANTIATE(double, double)
+INSTANTIATE(float , float)
+
+}
diff --git a/src/backend/opencl/harris.hpp b/src/backend/opencl/harris.hpp
new file mode 100644
index 0000000..e65053a
--- /dev/null
+++ b/src/backend/opencl/harris.hpp
@@ -0,0 +1,23 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/features.h>
+#include <Array.hpp>
+
+using af::features;
+
+namespace opencl
+{
+
+template<typename T, typename convAccT>
+unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
+                const Array<T> &in, const unsigned max_corners, const float min_response,
+                const float sigma, const unsigned filter_len, const float k_thr);
+
+}
diff --git a/src/backend/opencl/kernel/harris.cl b/src/backend/opencl/kernel/harris.cl
new file mode 100644
index 0000000..2582b14
--- /dev/null
+++ b/src/backend/opencl/kernel/harris.cl
@@ -0,0 +1,117 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#define MAX_VAL(A,B) (A) < (B) ? (B) : (A)
+
+__kernel void second_order_deriv(
+    __global T* ixx_out,
+    __global T* ixy_out,
+    __global T* iyy_out,
+    const unsigned in_len,
+    __global const T* ix_in,
+    __global const T* iy_in)
+{
+    const unsigned x = get_global_id(0);
+
+    if (x < in_len) {
+        ixx_out[x] = ix_in[x] * ix_in[x];
+        ixy_out[x] = ix_in[x] * iy_in[x];
+        iyy_out[x] = iy_in[x] * iy_in[x];
+    }
+}
+
+__kernel void harris_responses(
+    __global T* resp_out,
+    const unsigned idim0,
+    const unsigned idim1,
+    __global const T* ixx_in,
+    __global const T* ixy_in,
+    __global const T* iyy_in,
+    const float k_thr,
+    const unsigned border_len)
+{
+    const unsigned r = border_len;
+
+    const unsigned x = get_global_id(0) + r;
+    const unsigned y = get_global_id(1) + r;
+
+    if (x < idim1 - r && y < idim0 - r) {
+        const unsigned idx = x * idim0 + y;
+
+        // Calculates matrix trace and determinant
+        T tr = ixx_in[idx] + iyy_in[idx];
+        T det = ixx_in[idx] * iyy_in[idx] - ixy_in[idx] * ixy_in[idx];
+
+        // Calculates local Harris response
+        resp_out[idx] = det - k_thr * (tr*tr);
+    }
+}
+
+__kernel void non_maximal(
+    __global float* x_out,
+    __global float* y_out,
+    __global float* resp_out,
+    __global unsigned* count,
+    __global const T* resp_in,
+    const unsigned idim0,
+    const unsigned idim1,
+    const float min_resp,
+    const unsigned border_len,
+    const unsigned max_corners)
+{
+    // Responses on the border don't have 8-neighbors to compare, discard them
+    const unsigned r = border_len + 1;
+
+    const unsigned x = get_global_id(0) + r;
+    const unsigned y = get_global_id(1) + r;
+
+    if (x < idim1 - r && y < idim0 - r) {
+        const T v = resp_in[x * idim0 + y];
+
+        // Find maximum neighborhood response
+        T max_v;
+        max_v = MAX_VAL(resp_in[(x-1) * idim0 + y-1], resp_in[x * idim0 + y-1]);
+        max_v = MAX_VAL(max_v, resp_in[(x+1) * idim0 + y-1]);
+        max_v = MAX_VAL(max_v, resp_in[(x-1) * idim0 + y  ]);
+        max_v = MAX_VAL(max_v, resp_in[(x+1) * idim0 + y  ]);
+        max_v = MAX_VAL(max_v, resp_in[(x-1) * idim0 + y+1]);
+        max_v = MAX_VAL(max_v, resp_in[(x)   * idim0 + y+1]);
+        max_v = MAX_VAL(max_v, resp_in[(x+1) * idim0 + y+1]);
+
+        // Stores corner to {x,y,resp}_out if it's response is maximum compared
+        // to its 8-neighborhood and greater or equal minimum response
+        if (v > max_v && v >= min_resp) {
+            const unsigned idx = atomic_inc(count);
+            if (idx < max_corners) {
+                x_out[idx]    = (float)x;
+                y_out[idx]    = (float)y;
+                resp_out[idx] = (float)v;
+            }
+        }
+    }
+}
+
+__kernel void keep_corners(
+    __global float* x_out,
+    __global float* y_out,
+    __global float* score_out,
+    __global const float* x_in,
+    __global const float* y_in,
+    __global const float* score_in,
+    __global const unsigned* score_idx,
+    const unsigned n_feat)
+{
+    unsigned f = get_global_id(0);
+
+    if (f < n_feat) {
+        x_out[f] = x_in[score_idx[f]];
+        y_out[f] = y_in[score_idx[f]];
+        score_out[f] = score_in[f];
+    }
+}
diff --git a/src/backend/opencl/kernel/harris.hpp b/src/backend/opencl/kernel/harris.hpp
new file mode 100644
index 0000000..eeeddd4
--- /dev/null
+++ b/src/backend/opencl/kernel/harris.hpp
@@ -0,0 +1,374 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/defines.h>
+#include <af/constants.h>
+#include <program.hpp>
+#include <dispatch.hpp>
+#include <err_opencl.hpp>
+#include <debug_opencl.hpp>
+#include <convolve_common.hpp>
+#include <kernel/convolve_separable.hpp>
+#include <kernel/gradient.hpp>
+#include <kernel/sort_index.hpp>
+#include <kernel_headers/harris.hpp>
+#include <memory.hpp>
+#include <map>
+
+using cl::Buffer;
+using cl::Program;
+using cl::Kernel;
+using cl::EnqueueArgs;
+using cl::LocalSpaceArg;
+using cl::NDRange;
+
+namespace opencl
+{
+
+namespace kernel
+{
+
+static const unsigned HARRIS_THREADS_PER_GROUP = 256;
+static const unsigned HARRIS_THREADS_X = 16;
+static const unsigned HARRIS_THREADS_Y = HARRIS_THREADS_PER_GROUP / HARRIS_THREADS_X;
+
+template<typename T>
+void gaussian1D(T* out, const int dim, double sigma=0.0)
+{
+    if(!(sigma>0)) sigma = 0.25*dim;
+
+    T sum = (T)0;
+    for(int i=0;i<dim;i++)
+    {
+        int x = i-(dim-1)/2;
+        T el = 1. / sqrt(2 * af::Pi * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
+        out[i] = el;
+        sum   += el;
+    }
+
+    for(int k=0;k<dim;k++)
+        out[k] /= sum;
+}
+
+template<typename T, typename convAccT, unsigned fLen>
+void conv_helper(Param &ixx, Param &ixy, Param &iyy, Param &filter)
+{
+    Param ixx_tmp, ixy_tmp, iyy_tmp;
+    ixx_tmp.info.offset = ixy_tmp.info.offset = iyy_tmp.info.offset = 0;
+    for (dim_t i = 0; i < 4; i++) {
+        ixx_tmp.info.dims[i] = ixx.info.dims[i];
+        ixy_tmp.info.dims[i] = ixy.info.dims[i];
+        iyy_tmp.info.dims[i] = iyy.info.dims[i];
+        ixx_tmp.info.strides[i] = ixx.info.strides[i];
+        ixy_tmp.info.strides[i] = ixy.info.strides[i];
+        iyy_tmp.info.strides[i] = iyy.info.strides[i];
+    }
+    ixx_tmp.data = bufferAlloc(ixx_tmp.info.dims[3] * ixx_tmp.info.strides[3] * sizeof(convAccT));
+    ixy_tmp.data = bufferAlloc(ixy_tmp.info.dims[3] * ixy_tmp.info.strides[3] * sizeof(convAccT));
+    iyy_tmp.data = bufferAlloc(iyy_tmp.info.dims[3] * iyy_tmp.info.strides[3] * sizeof(convAccT));
+
+    convolve2<T, convAccT, 0, false, fLen>(ixx_tmp, ixx, filter);
+    convolve2<T, convAccT, 1, false, fLen>(ixx, ixx_tmp, filter);
+    convolve2<T, convAccT, 0, false, fLen>(ixy_tmp, ixy, filter);
+    convolve2<T, convAccT, 1, false, fLen>(ixy, ixy_tmp, filter);
+    convolve2<T, convAccT, 0, false, fLen>(iyy_tmp, iyy, filter);
+    convolve2<T, convAccT, 1, false, fLen>(iyy, iyy_tmp, filter);
+
+    bufferFree(ixx_tmp.data);
+    bufferFree(ixy_tmp.data);
+    bufferFree(iyy_tmp.data);
+}
+
+template<typename T, typename convAccT>
+void harris(unsigned* corners_out,
+            Param &x_out,
+            Param &y_out,
+            Param &resp_out,
+            Param in,
+            const unsigned max_corners,
+            const float min_response,
+            const float sigma,
+            const unsigned filter_len,
+            const float k_thr)
+{
+    try {
+        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+        static std::map<int, Program*> harrisProgs;
+        static std::map<int, Kernel*>  soKernel;
+        static std::map<int, Kernel*>  kcKernel;
+        static std::map<int, Kernel*>  hrKernel;
+        static std::map<int, Kernel*>  nmKernel;
+
+        int device = getActiveDeviceId();
+
+        std::call_once( compileFlags[device], [device] () {
+
+                std::ostringstream options;
+                options << " -D T=" << dtype_traits<T>::getName();
+
+                if (std::is_same<T, double>::value ||
+                    std::is_same<T, cdouble>::value) {
+                    options << " -D USE_DOUBLE";
+                }
+
+                cl::Program prog;
+                buildProgram(prog, harris_cl, harris_cl_len, options.str());
+                harrisProgs[device] = new Program(prog);
+
+                soKernel[device] = new Kernel(*harrisProgs[device], "second_order_deriv");
+                kcKernel[device] = new Kernel(*harrisProgs[device], "keep_corners");
+                hrKernel[device] = new Kernel(*harrisProgs[device], "harris_responses");
+                nmKernel[device] = new Kernel(*harrisProgs[device], "non_maximal");
+            });
+
+        // Window filter
+        convAccT* h_filter = new convAccT[filter_len];
+        // Decide between rectangular or circular filter
+        if (sigma < 0.5f) {
+            for (unsigned i = 0; i < filter_len; i++)
+                h_filter[i] = (T)1.f / (filter_len);
+        }
+        else {
+            gaussian1D<convAccT>(h_filter, (int)filter_len, sigma);
+        }
+
+        const unsigned border_len = filter_len / 2 + 1;
+
+        // Copy filter to device object
+        Param filter;
+        filter.info.dims[0] = filter_len;
+        filter.info.strides[0] = 1;
+        filter.info.offset = 0;
+
+        for (int k = 1; k < 4; k++) {
+            filter.info.dims[k] = 1;
+            filter.info.strides[k] = filter.info.dims[k - 1] * filter.info.strides[k - 1];
+        }
+
+        int filter_elem = filter.info.strides[3] * filter.info.dims[3];
+        filter.data = bufferAlloc(filter_elem * sizeof(convAccT));
+        getQueue().enqueueWriteBuffer(*filter.data, CL_TRUE, 0, filter_elem * sizeof(convAccT), h_filter);
+
+        Param ix, iy;
+        ix.info.offset = iy.info.offset = 0;
+        for (dim_t i = 0; i < 4; i++) {
+            ix.info.dims[i] = iy.info.dims[i] = in.info.dims[i];
+            ix.info.strides[i] = iy.info.strides[i] = in.info.strides[i];
+        }
+        ix.data = bufferAlloc(ix.info.dims[3] * ix.info.strides[3] * sizeof(T));
+        iy.data = bufferAlloc(iy.info.dims[3] * iy.info.strides[3] * sizeof(T));
+
+        // Compute first-order derivatives as gradients
+        gradient<T>(iy, ix, in);
+
+        Param ixx, ixy, iyy;
+        ixx.info.offset = ixy.info.offset = iyy.info.offset = 0;
+        for (dim_t i = 0; i < 4; i++) {
+            ixx.info.dims[i] = ixy.info.dims[i] = iyy.info.dims[i] = in.info.dims[i];
+            ixx.info.strides[i] = ixy.info.strides[i] = iyy.info.strides[i] = in.info.strides[i];
+        }
+        ixx.data = bufferAlloc(ixx.info.dims[3] * ixx.info.strides[3] * sizeof(T));
+        ixy.data = bufferAlloc(ixy.info.dims[3] * ixy.info.strides[3] * sizeof(T));
+        iyy.data = bufferAlloc(iyy.info.dims[3] * iyy.info.strides[3] * sizeof(T));
+
+        // Second order-derivatives kernel sizes
+        const unsigned blk_x_so = divup(in.info.dims[3] * in.info.strides[3], HARRIS_THREADS_PER_GROUP);
+        const NDRange local_so(HARRIS_THREADS_PER_GROUP, 1);
+        const NDRange global_so(blk_x_so * HARRIS_THREADS_PER_GROUP, 1);
+
+        auto soOp = make_kernel<Buffer, Buffer, Buffer,
+                                unsigned, Buffer, Buffer> (*soKernel[device]);
+
+        // Compute second-order derivatives
+        soOp(EnqueueArgs(getQueue(), global_so, local_so),
+             *ixx.data, *ixy.data, *iyy.data,
+             in.info.dims[3] * in.info.strides[3], *ix.data, *iy.data);
+        CL_DEBUG_FINISH(getQueue());
+
+        bufferFree(ix.data);
+        bufferFree(iy.data);
+
+        // Convolve second order derivatives with proper window filter
+        switch (filter_len) {
+            case 3:  conv_helper<T, convAccT, 3 >(ixx, ixy, iyy, filter); break;
+            case 4:  conv_helper<T, convAccT, 4 >(ixx, ixy, iyy, filter); break;
+            case 5:  conv_helper<T, convAccT, 5 >(ixx, ixy, iyy, filter); break;
+            case 6:  conv_helper<T, convAccT, 6 >(ixx, ixy, iyy, filter); break;
+            case 7:  conv_helper<T, convAccT, 7 >(ixx, ixy, iyy, filter); break;
+            case 8:  conv_helper<T, convAccT, 8 >(ixx, ixy, iyy, filter); break;
+            case 9:  conv_helper<T, convAccT, 9 >(ixx, ixy, iyy, filter); break;
+            case 10: conv_helper<T, convAccT, 10>(ixx, ixy, iyy, filter); break;
+            case 11: conv_helper<T, convAccT, 11>(ixx, ixy, iyy, filter); break;
+            case 12: conv_helper<T, convAccT, 12>(ixx, ixy, iyy, filter); break;
+            case 13: conv_helper<T, convAccT, 13>(ixx, ixy, iyy, filter); break;
+            case 14: conv_helper<T, convAccT, 14>(ixx, ixy, iyy, filter); break;
+            case 15: conv_helper<T, convAccT, 15>(ixx, ixy, iyy, filter); break;
+            case 16: conv_helper<T, convAccT, 16>(ixx, ixy, iyy, filter); break;
+            case 17: conv_helper<T, convAccT, 17>(ixx, ixy, iyy, filter); break;
+            case 18: conv_helper<T, convAccT, 18>(ixx, ixy, iyy, filter); break;
+            case 19: conv_helper<T, convAccT, 19>(ixx, ixy, iyy, filter); break;
+            case 20: conv_helper<T, convAccT, 20>(ixx, ixy, iyy, filter); break;
+            case 21: conv_helper<T, convAccT, 21>(ixx, ixy, iyy, filter); break;
+            case 22: conv_helper<T, convAccT, 22>(ixx, ixy, iyy, filter); break;
+            case 23: conv_helper<T, convAccT, 23>(ixx, ixy, iyy, filter); break;
+            case 24: conv_helper<T, convAccT, 24>(ixx, ixy, iyy, filter); break;
+            case 25: conv_helper<T, convAccT, 25>(ixx, ixy, iyy, filter); break;
+            case 26: conv_helper<T, convAccT, 26>(ixx, ixy, iyy, filter); break;
+            case 27: conv_helper<T, convAccT, 27>(ixx, ixy, iyy, filter); break;
+            case 28: conv_helper<T, convAccT, 28>(ixx, ixy, iyy, filter); break;
+            case 29: conv_helper<T, convAccT, 29>(ixx, ixy, iyy, filter); break;
+            case 30: conv_helper<T, convAccT, 30>(ixx, ixy, iyy, filter); break;
+            case 31: conv_helper<T, convAccT, 31>(ixx, ixy, iyy, filter); break;
+        }
+
+        bufferFree(filter.data);
+
+        cl::Buffer *d_responses = bufferAlloc(in.info.dims[3] * in.info.strides[3] * sizeof(T));
+
+        // Harris responses kernel sizes
+        unsigned blk_x_hr = divup(in.info.dims[0] - border_len*2, HARRIS_THREADS_X);
+        unsigned blk_y_hr = divup(in.info.dims[1] - border_len*2, HARRIS_THREADS_Y);
+        const NDRange local_hr(HARRIS_THREADS_X, HARRIS_THREADS_Y);
+        const NDRange global_hr(blk_x_hr * HARRIS_THREADS_X, blk_y_hr * HARRIS_THREADS_Y);
+
+        auto hrOp = make_kernel<Buffer, unsigned, unsigned,
+                                Buffer, Buffer, Buffer,
+                                float, unsigned> (*hrKernel[device]);
+
+        // Calculate Harris responses for all pixels
+        hrOp(EnqueueArgs(getQueue(), global_hr, local_hr),
+             *d_responses, in.info.dims[0], in.info.dims[1],
+             *ixx.data, *ixy.data, *iyy.data, k_thr, border_len);
+        CL_DEBUG_FINISH(getQueue());
+
+        bufferFree(ixx.data);
+        bufferFree(ixy.data);
+        bufferFree(iyy.data);
+
+        // Number of corners is not known a priori, limit maximum number of corners
+        // according to image dimensions
+        unsigned corner_lim = in.info.dims[3] * in.info.strides[3] * 0.2f;
+
+        unsigned corners_found = 0;
+        cl::Buffer *d_corners_found = bufferAlloc(sizeof(unsigned));
+        getQueue().enqueueWriteBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found);
+
+        cl::Buffer *d_x_corners = bufferAlloc(corner_lim * sizeof(float));
+        cl::Buffer *d_y_corners = bufferAlloc(corner_lim * sizeof(float));
+        cl::Buffer *d_resp_corners = bufferAlloc(corner_lim * sizeof(float));
+
+        const float min_r = (max_corners > 0) ? 0.f : min_response;
+
+        auto nmOp = make_kernel<Buffer, Buffer, Buffer, Buffer,
+                                Buffer, unsigned, unsigned,
+                                float, unsigned, unsigned> (*nmKernel[device]);
+
+        // Perform non-maximal suppression
+        nmOp(EnqueueArgs(getQueue(), global_hr, local_hr),
+             *d_x_corners, *d_y_corners, *d_resp_corners, *d_corners_found,
+             *d_responses, in.info.dims[0], in.info.dims[1],
+             min_r, border_len, corner_lim);
+        CL_DEBUG_FINISH(getQueue());
+
+        getQueue().enqueueReadBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found);
+
+        bufferFree(d_responses);
+        bufferFree(d_corners_found);
+
+        *corners_out = (max_corners > 0) ?
+                       min(corners_found, max_corners) :
+                       min(corners_found, corner_lim);
+
+        if (*corners_out == 0)
+            return;
+
+        // Set output Param info
+        x_out.info.dims[0] = y_out.info.dims[0] = resp_out.info.dims[0] = *corners_out;
+        x_out.info.strides[0] = y_out.info.strides[0] = resp_out.info.strides[0] = 1;
+        x_out.info.offset = y_out.info.offset = resp_out.info.offset = 0;
+        for (int k = 1; k < 4; k++) {
+            x_out.info.dims[k] = y_out.info.dims[k] = resp_out.info.dims[k] =  1;
+            x_out.info.strides[k] = x_out.info.dims[k - 1] * x_out.info.strides[k - 1];
+            y_out.info.strides[k] = y_out.info.dims[k - 1] * y_out.info.strides[k - 1];
+            resp_out.info.strides[k] = resp_out.info.dims[k - 1] * resp_out.info.strides[k - 1];
+        }
+
+        if (max_corners > 0 && corners_found > *corners_out) {
+            Param harris_resp;
+            Param harris_idx;
+
+            harris_resp.info.dims[0] = harris_idx.info.dims[0] = corners_found;
+            harris_resp.info.strides[0] = harris_idx.info.strides[0] = 1;
+
+            for (int k = 1; k < 4; k++) {
+                harris_resp.info.dims[k] = 1;
+                harris_resp.info.strides[k] = harris_resp.info.dims[k - 1] * harris_resp.info.strides[k - 1];
+                harris_idx.info.dims[k] = 1;
+                harris_idx.info.strides[k] = harris_idx.info.dims[k - 1] * harris_idx.info.strides[k - 1];
+            }
+
+            int sort_elem = harris_resp.info.strides[3] * harris_resp.info.dims[3];
+            harris_resp.data = d_resp_corners;
+            harris_idx.data = bufferAlloc(sort_elem * sizeof(unsigned));
+
+            // Sort Harris responses
+            sort0_index<float, false>(harris_resp, harris_idx);
+
+            x_out.data = bufferAlloc(*corners_out * sizeof(float));
+            y_out.data = bufferAlloc(*corners_out * sizeof(float));
+            resp_out.data = bufferAlloc(*corners_out * sizeof(float));
+
+            // Keep corners kernel sizes
+            const unsigned blk_x_kc = divup(*corners_out, HARRIS_THREADS_PER_GROUP);
+            const NDRange local_kc(blk_x_kc * HARRIS_THREADS_PER_GROUP, 1);
+            const NDRange global_kc(blk_x_kc * HARRIS_THREADS_PER_GROUP, 1);
+
+            auto kcOp = make_kernel<Buffer, Buffer, Buffer,
+                                    Buffer, Buffer, Buffer, Buffer,
+                                    unsigned> (*kcKernel[device]);
+
+            // Keep only the first corners_to_keep corners with higher Harris
+            // responses
+            kcOp(EnqueueArgs(getQueue(), global_kc, local_kc),
+                 *x_out.data, *y_out.data, *resp_out.data,
+                 *d_x_corners, *d_y_corners, *harris_resp.data, *harris_idx.data,
+                 *corners_out);
+            CL_DEBUG_FINISH(getQueue());
+
+            bufferFree(d_x_corners);
+            bufferFree(d_y_corners);
+            bufferFree(harris_resp.data);
+            bufferFree(harris_idx.data);
+        }
+        else if (max_corners == 0 && corners_found < corner_lim) {
+            x_out.data = bufferAlloc(*corners_out * sizeof(float));
+            y_out.data = bufferAlloc(*corners_out * sizeof(float));
+            resp_out.data = bufferAlloc(*corners_out * sizeof(float));
+            getQueue().enqueueCopyBuffer(*d_x_corners, *x_out.data, 0, 0, *corners_out * sizeof(float));
+            getQueue().enqueueCopyBuffer(*d_y_corners, *y_out.data, 0, 0, *corners_out * sizeof(float));
+            getQueue().enqueueCopyBuffer(*d_resp_corners, *resp_out.data, 0, 0, *corners_out * sizeof(float));
+
+            bufferFree(d_x_corners);
+            bufferFree(d_y_corners);
+            bufferFree(d_resp_corners);
+        }
+        else {
+            x_out.data = d_x_corners;
+            y_out.data = d_y_corners;
+            resp_out.data = d_resp_corners;
+        }
+    } catch (cl::Error err) {
+        CL_TO_AF_ERROR(err);
+        throw;
+    }
+}
+
+} //namespace kernel
+
+} //namespace opencl

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git