[arrayfire] 77/408: Added CUDA backend for Harris corner detector
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:21 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit e9e8474b52c6961ce2fb4afc51fa035fa2e0e138
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date: Thu Jul 2 13:38:39 2015 -0400
Added CUDA backend for Harris corner detector
---
src/backend/cuda/harris.cu | 59 ++++++
src/backend/cuda/harris.hpp | 23 +++
src/backend/cuda/kernel/harris.hpp | 380 +++++++++++++++++++++++++++++++++++++
3 files changed, 462 insertions(+)
diff --git a/src/backend/cuda/harris.cu b/src/backend/cuda/harris.cu
new file mode 100644
index 0000000..2a5f272
--- /dev/null
+++ b/src/backend/cuda/harris.cu
@@ -0,0 +1,59 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <af/features.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_cuda.hpp>
+#include <handle.hpp>
+#include <kernel/harris.hpp>
+
+using af::dim4;
+using af::features;
+
+namespace cuda
+{
+
+template<typename T, typename convAccT>
+unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
+ const Array<T> &in, const unsigned max_corners, const float min_response,
+ const float sigma, const unsigned filter_len, const float k_thr)
+{
+ const dim4 dims = in.dims();
+
+ unsigned nfeat;
+ float *d_x_out;
+ float *d_y_out;
+ float *d_score_out;
+
+ kernel::harris<T, convAccT>(&nfeat, &d_x_out, &d_y_out, &d_score_out, in,
+ max_corners, min_response, sigma, filter_len, k_thr);
+
+ if (nfeat > 0) {
+ const dim4 out_dims(nfeat);
+
+ x_out = createDeviceDataArray<float>(out_dims, d_x_out);
+ y_out = createDeviceDataArray<float>(out_dims, d_y_out);
+ score_out = createDeviceDataArray<float>(out_dims, d_score_out);
+ }
+
+ return nfeat;
+}
+
+#define INSTANTIATE(T, convAccT) \
+ template unsigned harris<T, convAccT>(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out, \
+ const Array<T> &in, const unsigned max_corners, const float min_response, \
+ const float sigma, const unsigned filter_len, const float k_thr);
+
+INSTANTIATE(double, double)
+INSTANTIATE(float , float)
+
+}
diff --git a/src/backend/cuda/harris.hpp b/src/backend/cuda/harris.hpp
new file mode 100644
index 0000000..6bc3ace
--- /dev/null
+++ b/src/backend/cuda/harris.hpp
@@ -0,0 +1,23 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/features.h>
+#include <Array.hpp>
+
+using af::features;
+
+namespace cuda
+{
+
+template<typename T, typename convAccT>
+unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
+ const Array<T> &in, const unsigned max_corners, const float min_response,
+ const float sigma, const unsigned filter_len, const float k_thr);
+
+}
diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp
new file mode 100644
index 0000000..cae58ad
--- /dev/null
+++ b/src/backend/cuda/kernel/harris.hpp
@@ -0,0 +1,380 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/defines.h>
+#include <af/constants.h>
+#include <dispatch.hpp>
+#include <err_cuda.hpp>
+#include <debug_cuda.hpp>
+#include <memory.hpp>
+
+#include "config.hpp"
+#include <convolve_common.hpp>
+#include "convolve.hpp"
+#include "gradient.hpp"
+#include "sort_index.hpp"
+
+namespace cuda
+{
+
+namespace kernel
+{
+
+static const unsigned BLOCK_SIZE = 16;
+
+template<typename T>
+void gaussian1D(T* out, const int dim, double sigma=0.0)
+{
+ if(!(sigma>0)) sigma = 0.25*dim;
+
+ T sum = (T)0;
+ for(int i=0;i<dim;i++)
+ {
+ int x = i-(dim-1)/2;
+ T el = 1. / sqrt(2 * af::Pi * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
+ out[i] = el;
+ sum += el;
+ }
+
+ for(int k=0;k<dim;k++)
+ out[k] /= sum;
+}
+
+// max_val()
+// Returns max of x and y
+inline __device__ int max_val(const int x, const int y)
+{
+ return max(x, y);
+}
+inline __device__ unsigned max_val(const unsigned x, const unsigned y)
+{
+ return max(x, y);
+}
+inline __device__ float max_val(const float x, const float y)
+{
+ return fmax(x, y);
+}
+inline __device__ double max_val(const double x, const double y)
+{
+ return fmax(x, y);
+}
+
+template<typename T>
+__global__ void second_order_deriv(
+ T* ixx_out,
+ T* ixy_out,
+ T* iyy_out,
+ const unsigned in_len,
+ const T* ix_in,
+ const T* iy_in)
+{
+ const unsigned x = blockDim.x * blockIdx.x + threadIdx.x;
+
+ if (x < in_len) {
+ ixx_out[x] = ix_in[x] * ix_in[x];
+ ixy_out[x] = ix_in[x] * iy_in[x];
+ iyy_out[x] = iy_in[x] * iy_in[x];
+ }
+}
+
+template<typename T>
+__global__ void harris_responses(
+ T* resp_out,
+ const unsigned idim0,
+ const unsigned idim1,
+ const T* ixx_in,
+ const T* ixy_in,
+ const T* iyy_in,
+ const float k_thr,
+ const unsigned border_len)
+{
+ const unsigned r = border_len;
+
+ const unsigned x = blockDim.x * blockIdx.x + threadIdx.x + r;
+ const unsigned y = blockDim.y * blockIdx.y + threadIdx.y + r;
+
+ if (x < idim1 - r && y < idim0 - r) {
+ const unsigned idx = x * idim0 + y;
+
+ // Calculates matrix trace and determinant
+ T tr = ixx_in[idx] + iyy_in[idx];
+ T det = ixx_in[idx] * iyy_in[idx] - ixy_in[idx] * ixy_in[idx];
+
+ // Calculates local Harris response
+ resp_out[idx] = det - k_thr * (tr*tr);
+ }
+}
+
+template<typename T>
+__global__ void non_maximal(
+ float* x_out,
+ float* y_out,
+ float* resp_out,
+ unsigned* count,
+ const unsigned idim0,
+ const unsigned idim1,
+ const T* resp_in,
+ const float min_resp,
+ const unsigned border_len,
+ const unsigned max_corners)
+{
+ // Responses on the border don't have 8-neighbors to compare, discard them
+ const unsigned r = border_len + 1;
+
+ const unsigned x = blockDim.x * blockIdx.x + threadIdx.x + r;
+ const unsigned y = blockDim.y * blockIdx.y + threadIdx.y + r;
+
+ if (x < idim1 - r && y < idim0 - r) {
+ const T v = resp_in[x * idim0 + y];
+
+ // Find maximum neighborhood response
+ T max_v;
+ max_v = max_val(resp_in[(x-1) * idim0 + y-1], resp_in[x * idim0 + y-1]);
+ max_v = max_val(max_v, resp_in[(x+1) * idim0 + y-1]);
+ max_v = max_val(max_v, resp_in[(x-1) * idim0 + y ]);
+ max_v = max_val(max_v, resp_in[(x+1) * idim0 + y ]);
+ max_v = max_val(max_v, resp_in[(x-1) * idim0 + y+1]);
+ max_v = max_val(max_v, resp_in[(x) * idim0 + y+1]);
+ max_v = max_val(max_v, resp_in[(x+1) * idim0 + y+1]);
+
+ // Stores corner to {x,y,resp}_out if it's response is maximum compared
+ // to its 8-neighborhood and greater or equal minimum response
+ if (v > max_v && v >= (T)min_resp) {
+ unsigned idx = atomicAdd(count, 1u);
+ if (idx < max_corners) {
+ x_out[idx] = (float)x;
+ y_out[idx] = (float)y;
+ resp_out[idx] = (float)v;
+ }
+ }
+ }
+}
+
+__global__ void keep_corners(
+ float* x_out,
+ float* y_out,
+ float* resp_out,
+ const float* x_in,
+ const float* y_in,
+ const float* resp_in,
+ const unsigned* resp_idx,
+ const unsigned n_corners)
+{
+ const unsigned f = blockDim.x * blockIdx.x + threadIdx.x;
+
+ // Keep only the first n_feat features
+ if (f < n_corners) {
+ x_out[f] = x_in[(unsigned)resp_idx[f]];
+ y_out[f] = y_in[(unsigned)resp_idx[f]];
+ resp_out[f] = resp_in[f];
+ }
+}
+
+int compare(const void* a, const void* b)
+{
+ return *(float*)a > *(float*)b;
+}
+
+template<typename T, typename convAccT>
+void harris(unsigned* corners_out,
+ float** x_out,
+ float** y_out,
+ float** resp_out,
+ CParam<T> in,
+ const unsigned max_corners,
+ const float min_response,
+ const float sigma,
+ const unsigned filter_len,
+ const float k_thr)
+{
+ // Window filter
+ convAccT *h_filter = new convAccT[filter_len];
+ // Decide between rectangular or circular filter
+ if (sigma < 0.5f) {
+ for (unsigned i = 0; i < filter_len; i++)
+ h_filter[i] = (T)1.f / (filter_len);
+ }
+ else {
+ gaussian1D<convAccT>(h_filter, (int)filter_len, sigma);
+ }
+
+ // Copy filter to device object
+ Param<convAccT> filter;
+ filter.dims[0] = filter_len;
+ filter.strides[0] = 1;
+
+ for (int k = 1; k < 4; k++) {
+ filter.dims[k] = 1;
+ filter.strides[k] = filter.dims[k - 1] * filter.strides[k - 1];
+ }
+
+ int filter_elem = filter.strides[3] * filter.dims[3];
+ filter.ptr = memAlloc<convAccT>(filter_elem);
+ CUDA_CHECK(cudaMemcpy(filter.ptr, h_filter, filter_elem * sizeof(convAccT), cudaMemcpyHostToDevice));
+
+ delete[] h_filter;
+
+ const unsigned border_len = filter_len / 2 + 1;
+
+ Param<T> ix, iy;
+ for (dim_t i = 0; i < 4; i++) {
+ ix.dims[i] = iy.dims[i] = in.dims[i];
+ ix.strides[i] = iy.strides[i] = in.strides[i];
+ }
+ ix.ptr = memAlloc<T>(ix.dims[3] * ix.strides[3]);
+ iy.ptr = memAlloc<T>(iy.dims[3] * iy.strides[3]);
+
+ // Compute first-order derivatives as gradients
+ gradient<T>(iy, ix, in);
+
+ Param<T> ixx, ixy, iyy;
+ Param<T> ixx_tmp, ixy_tmp, iyy_tmp;
+ for (dim_t i = 0; i < 4; i++) {
+ ixx.dims[i] = ixy.dims[i] = iyy.dims[i] = in.dims[i];
+ ixx_tmp.dims[i] = ixy_tmp.dims[i] = iyy_tmp.dims[i] = in.dims[i];
+ ixx.strides[i] = ixy.strides[i] = iyy.strides[i] = in.strides[i];
+ ixx_tmp.strides[i] = ixy_tmp.strides[i] = iyy_tmp.strides[i] = in.strides[i];
+ }
+ ixx.ptr = memAlloc<T>(ixx.dims[3] * ixx.strides[3]);
+ ixy.ptr = memAlloc<T>(ixy.dims[3] * ixy.strides[3]);
+ iyy.ptr = memAlloc<T>(iyy.dims[3] * iyy.strides[3]);
+
+ // Compute second-order derivatives
+ dim3 threads(THREADS_PER_BLOCK, 1);
+ dim3 blocks(divup(in.dims[3] * in.strides[3], threads.x), 1);
+ second_order_deriv<T><<<blocks, threads>>>(ixx.ptr, ixy.ptr, iyy.ptr,
+ in.dims[3] * in.strides[3], ix.ptr, iy.ptr);
+
+ memFree(ix.ptr);
+ memFree(iy.ptr);
+
+ ixx_tmp.ptr = memAlloc<T>(ixx_tmp.dims[3] * ixx_tmp.strides[3]);
+ ixy_tmp.ptr = memAlloc<T>(ixy_tmp.dims[3] * ixy_tmp.strides[3]);
+ iyy_tmp.ptr = memAlloc<T>(iyy_tmp.dims[3] * iyy_tmp.strides[3]);
+
+ // Convolve second-order derivatives with proper window filter
+ convolve2<T, convAccT, 0, false>(ixx_tmp, CParam<T>(ixx), filter);
+ convolve2<T, convAccT, 1, false>(ixx, CParam<T>(ixx_tmp), filter);
+ convolve2<T, convAccT, 0, false>(ixy_tmp, CParam<T>(ixy), filter);
+ convolve2<T, convAccT, 1, false>(ixy, CParam<T>(ixy_tmp), filter);
+ convolve2<T, convAccT, 0, false>(iyy_tmp, CParam<T>(iyy), filter);
+ convolve2<T, convAccT, 1, false>(iyy, CParam<T>(iyy_tmp), filter);
+
+ memFree(ixx_tmp.ptr);
+ memFree(ixy_tmp.ptr);
+ memFree(iyy_tmp.ptr);
+
+ // Number of corners is not known a priori, limit maximum number of corners
+ // according to image dimensions
+ unsigned corner_lim = in.dims[3] * in.strides[3] * 0.2f;
+
+ unsigned* d_corners_found = memAlloc<unsigned>(1);
+ CUDA_CHECK(cudaMemset(d_corners_found, 0, sizeof(unsigned)));
+
+ float* d_x_corners = memAlloc<float>(corner_lim);
+ float* d_y_corners = memAlloc<float>(corner_lim);
+ float* d_resp_corners = memAlloc<float>(corner_lim);
+
+ T* d_responses = memAlloc<T>(in.dims[3] * in.strides[3]);
+
+ // Calculate Harris responses for all pixels
+ threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
+ blocks = dim3(divup(in.dims[1] - border_len*2, threads.x),
+ divup(in.dims[0] - border_len*2, threads.y));
+ harris_responses<T><<<blocks, threads>>>(d_responses,
+ in.dims[0], in.dims[1],
+ ixx.ptr, ixy.ptr, iyy.ptr,
+ k_thr, border_len);
+
+ memFree(ixx.ptr);
+ memFree(ixy.ptr);
+ memFree(iyy.ptr);
+
+ const float min_r = (max_corners > 0) ? 0.f : min_response;
+
+ // Perform non-maximal suppression
+ non_maximal<T><<<blocks, threads>>>(d_x_corners, d_y_corners,
+ d_resp_corners, d_corners_found,
+ in.dims[0], in.dims[1], d_responses,
+ min_r, border_len, corner_lim);
+
+ unsigned corners_found = 0;
+ CUDA_CHECK(cudaMemcpy(&corners_found, d_corners_found, sizeof(unsigned), cudaMemcpyDeviceToHost));
+
+ memFree(d_responses);
+ memFree(d_corners_found);
+
+ *corners_out = (max_corners > 0) ?
+ min(corners_found, max_corners) :
+ min(corners_found, corner_lim);
+
+ if (*corners_out == 0)
+ return;
+
+ if (max_corners > 0 && corners_found > *corners_out) {
+ Param<float> harris_responses;
+ Param<unsigned> harris_idx;
+
+ harris_responses.dims[0] = harris_idx.dims[0] = corners_found;
+ harris_responses.strides[0] = harris_idx.strides[0] = 1;
+
+ for (int k = 1; k < 4; k++) {
+ harris_responses.dims[k] = 1;
+ harris_responses.strides[k] = harris_responses.dims[k - 1] * harris_responses.strides[k - 1];
+ harris_idx.dims[k] = 1;
+ harris_idx.strides[k] = harris_idx.dims[k - 1] * harris_idx.strides[k - 1];
+ }
+
+ int sort_elem = harris_responses.strides[3] * harris_responses.dims[3];
+ harris_responses.ptr = d_resp_corners;
+ harris_idx.ptr = memAlloc<unsigned>(sort_elem);
+
+ // Sort Harris responses
+ sort0_index<float, false>(harris_responses, harris_idx);
+
+ *x_out = memAlloc<float>(*corners_out);
+ *y_out = memAlloc<float>(*corners_out);
+ *resp_out = memAlloc<float>(*corners_out);
+
+ // Keep only the first corners_to_keep corners with higher Harris
+ // responses
+ threads = dim3(THREADS_PER_BLOCK, 1);
+ blocks = dim3(divup(*corners_out, threads.x), 1);
+ keep_corners<<<blocks, threads>>>(*x_out, *y_out, *resp_out,
+ d_x_corners, d_y_corners,
+ harris_responses.ptr, harris_idx.ptr,
+ *corners_out);
+
+ memFree(d_x_corners);
+ memFree(d_y_corners);
+ memFree(harris_responses.ptr);
+ memFree(harris_idx.ptr);
+ }
+ else if (max_corners == 0 && corners_found < corner_lim) {
+ *x_out = memAlloc<float>(*corners_out);
+ *y_out = memAlloc<float>(*corners_out);
+ *resp_out = memAlloc<float>(*corners_out);
+ CUDA_CHECK(cudaMemcpy(*x_out, d_x_corners, *corners_out * sizeof(float), cudaMemcpyDeviceToDevice));
+ CUDA_CHECK(cudaMemcpy(*y_out, d_y_corners, *corners_out * sizeof(float), cudaMemcpyDeviceToDevice));
+ CUDA_CHECK(cudaMemcpy(*resp_out, d_resp_corners, *corners_out * sizeof(float), cudaMemcpyDeviceToDevice));
+
+ memFree(d_x_corners);
+ memFree(d_y_corners);
+ memFree(d_resp_corners);
+ }
+ else {
+ *x_out = d_x_corners;
+ *y_out = d_y_corners;
+ *resp_out = d_resp_corners;
+ }
+}
+
+} // namespace kernel
+
+} // namespace cuda
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list