[arrayfire] 395/408: dfsg clean
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:38 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 54749215e4def3bce369bc09ab6cdddb6a3828d5
Author: Ghislain Antony Vaillant <ghisvail at gmail.com>
Date: Tue Sep 15 01:39:02 2015 +0100
dfsg clean
find -name "*nonfree*" -type f -delete
---
src/backend/cpu/sift_nonfree.hpp | 1033 ---------------------
src/backend/cuda/kernel/sift_nonfree.hpp | 1370 ----------------------------
src/backend/opencl/kernel/sift_nonfree.cl | 806 ----------------
src/backend/opencl/kernel/sift_nonfree.hpp | 784 ----------------
test/sift_nonfree.cpp | 336 -------
5 files changed, 4329 deletions(-)
diff --git a/src/backend/cpu/sift_nonfree.hpp b/src/backend/cpu/sift_nonfree.hpp
deleted file mode 100644
index b331e4c..0000000
--- a/src/backend/cpu/sift_nonfree.hpp
+++ /dev/null
@@ -1,1033 +0,0 @@
-/*******************************************************
- * Copyright (c) 2015, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-// The source code contained in this file is based on the original code by
-// Rob Hess. Please note that SIFT is an algorithm patented and protected
-// by US law, before using this code or any binary forms generated from it,
-// verify that you have permission to do so. The original license by Rob Hess
-// can be read below:
-//
-// Copyright (c) 2006-2012, Rob Hess <rob at iqengines.com>
-// All rights reserved.
-//
-// The following patent has been issued for methods embodied in this
-// software: "Method and apparatus for identifying scale invariant features
-// in an image and use of same for locating an object in an image," David
-// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
-// filed March 8, 1999. Asignee: The University of British Columbia. For
-// further details, contact David Lowe (lowe at cs.ubc.ca) or the
-// University-Industry Liaison Office of the University of British
-// Columbia.
-//
-// Note that restrictions imposed by this patent (and possibly others)
-// exist independently of and may be in conflict with the freedoms granted
-// in this license, which refers to copyright of the program, not patents
-// for any methods that it implements. Both copyright and patent law must
-// be obeyed to legally use and redistribute this program and it is not the
-// purpose of this license to induce you to infringe any patents or other
-// property right claims or to contest validity of any such claims. If you
-// redistribute or use the program, then this license merely protects you
-// from committing copyright infringement. It does not protect you from
-// committing patent infringement. So, before you do anything with this
-// program, make sure that you have permission to do so not merely in terms
-// of copyright, but also in terms of patent law.
-//
-// Please note that this license is not to be understood as a guarantee
-// either. If you use the program according to this license, but in
-// conflict with patent law, it does not mean that the licensor will refund
-// you for any losses that you incur if you are sued for your patent
-// infringement.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-// * Redistributions of source code must retain the above copyright and
-// patent notices, this list of conditions and the following
-// disclaimer.
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in
-// the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Oregon State University nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-using af::dim4;
-
-namespace cpu
-{
-
- static const float PI_VAL = 3.14159265358979323846f;
-
-// default width of descriptor histogram array
- static const int DescrWidth = 4;
-
-// default number of bins per histogram in descriptor array
- static const int DescrHistBins = 8;
-
-// assumed gaussian blur for input image
- static const float InitSigma = 0.5f;
-
-// width of border in which to ignore keypoints
- static const int ImgBorder = 5;
-
-// maximum steps of keypoint interpolation before failure
- static const int MaxInterpSteps = 5;
-
-// default number of bins in histogram for orientation assignment
- static const int OriHistBins = 36;
-
-// determines gaussian sigma for orientation assignment
- static const float OriSigFctr = 1.5f;
-
-// determines the radius of the region used in orientation assignment */
- static const float OriRadius = 3.0f * OriSigFctr;
-
-// number of passes of orientation histogram smoothing
- static const int SmoothOriPasses = 2;
-
-// orientation magnitude relative to max that results in new feature
- static const float OriPeakRatio = 0.8f;
-
-// determines the size of a single descriptor orientation histogram
- static const float DescrSclFctr = 3.f;
-
-// threshold on magnitude of elements of descriptor vector
- static const float DescrMagThr = 0.2f;
-
-// factor used to convert floating-point descriptor to unsigned char
- static const float IntDescrFctr = 512.f;
-
- typedef struct
- {
- float f[4];
- unsigned l;
- } feat_t;
-
- bool feat_cmp(feat_t i, feat_t j)
- {
- for (int k = 0; k < 4; k++)
- if (i.f[k] != j.f[k])
- return (i.f[k] < j.f[k]);
- if (i.l != j.l)
- return (i.l < j.l);
-
- return true;
- }
-
- void array_to_feat(std::vector<feat_t>& feat, float *x, float *y, unsigned *layer, float *resp, float *size, unsigned nfeat)
- {
- feat.resize(nfeat);
- for (unsigned i = 0; i < feat.size(); i++) {
- feat[i].f[0] = x[i];
- feat[i].f[1] = y[i];
- feat[i].f[2] = resp[i];
- feat[i].f[3] = size[i];
- feat[i].l = layer[i];
- }
- }
-
- template<typename T>
- void gaussian1D(T* out, const int dim, double sigma=0.0)
- {
- if(!(sigma>0)) sigma = 0.25*dim;
-
- T sum = (T)0;
- for(int i=0;i<dim;i++)
- {
- int x = i-(dim-1)/2;
- T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
- out[i] = el;
- sum += el;
- }
-
- for(int k=0;k<dim;k++)
- out[k] /= sum;
- }
-
- template<typename T>
- Array<T> gauss_filter(float sigma)
- {
- // Using 6-sigma rule
- unsigned gauss_len = (unsigned)round(sigma * 6 + 1) | 1;
-
- Array<T> filter = createEmptyArray<T>(gauss_len);
- gaussian1D((T*)getDevicePtr(filter), gauss_len, sigma);
-
- return filter;
- }
-
- template<int N>
- void gaussianElimination(float* A, float* b, float* x)
- {
- // forward elimination
- for (int i = 0; i < N-1; i++) {
- for (int j = i+1; j < N; j++) {
- float s = A[j*N+i] / A[i*N+i];
-
- for (int k = i; k < N; k++)
- A[j*N+k] -= s * A[i*N+k];
-
- b[j] -= s * b[i];
- }
- }
-
- for (int i = 0; i < N; i++)
- x[i] = 0;
-
- // backward substitution
- float sum = 0;
- for (int i = 0; i <= N-2; i++) {
- sum = b[i];
- for (int j = i+1; j < N; j++)
- sum -= A[i*N+j] * x[j];
- x[i] = sum / A[i*N+i];
- }
- }
-
- template<typename T>
- void sub(
- Array<T>& out,
- const Array<T>& in1,
- const Array<T>& in2)
- {
- size_t nel = in1.elements();
- T* out_ptr = out.get();
- const T* in1_ptr = in1.get();
- const T* in2_ptr = in2.get();
-
- for (size_t i = 0; i < nel; i++) {
- out_ptr[i] = in1_ptr[i] - in2_ptr[i];
- }
- }
-
-#define CPTR(Y, X) (center_ptr[(Y) * idims[0] + (X)])
-#define PPTR(Y, X) (prev_ptr[(Y) * idims[0] + (X)])
-#define NPTR(Y, X) (next_ptr[(Y) * idims[0] + (X)])
-
-// Determines whether a pixel is a scale-space extremum by comparing it to its
-// 3x3x3 pixel neighborhood.
- template<typename T>
- void detectExtrema(
- float* x_out,
- float* y_out,
- unsigned* layer_out,
- unsigned* counter,
- const Array<T>& prev,
- const Array<T>& center,
- const Array<T>& next,
- const unsigned layer,
- const unsigned max_feat,
- const float threshold)
- {
- const af::dim4 idims = center.dims();
- const T* prev_ptr = prev.get();
- const T* center_ptr = center.get();
- const T* next_ptr = next.get();
-
- for (int y = ImgBorder; y < idims[1]-ImgBorder; y++) {
- for (int x = ImgBorder; x < idims[0]-ImgBorder; x++) {
- float p = center_ptr[y*idims[0] + x];
-
- // Find extrema
- if (abs((float)p) > threshold &&
- ((p > 0 && p > CPTR(y-1, x-1) && p > CPTR(y-1, x) &&
- p > CPTR(y-1, x+1) && p > CPTR(y, x-1) && p > CPTR(y, x+1) &&
- p > CPTR(y+1, x-1) && p > CPTR(y+1, x) && p > CPTR(y+1, x+1) &&
- p > PPTR(y-1, x-1) && p > PPTR(y-1, x) && p > PPTR(y-1, x+1) &&
- p > PPTR(y, x-1) && p > PPTR(y , x) && p > PPTR(y, x+1) &&
- p > PPTR(y+1, x-1) && p > PPTR(y+1, x) && p > PPTR(y+1, x+1) &&
- p > NPTR(y-1, x-1) && p > NPTR(y-1, x) && p > NPTR(y-1, x+1) &&
- p > NPTR(y, x-1) && p > NPTR(y , x) && p > NPTR(y, x+1) &&
- p > NPTR(y+1, x-1) && p > NPTR(y+1, x) && p > NPTR(y+1, x+1)) ||
- (p < 0 && p < CPTR(y-1, x-1) && p < CPTR(y-1, x) &&
- p < CPTR(y-1, x+1) && p < CPTR(y, x-1) && p < CPTR(y, x+1) &&
- p < CPTR(y+1, x-1) && p < CPTR(y+1, x) && p < CPTR(y+1, x+1) &&
- p < PPTR(y-1, x-1) && p < PPTR(y-1, x) && p < PPTR(y-1, x+1) &&
- p < PPTR(y, x-1) && p < PPTR(y , x) && p < PPTR(y, x+1) &&
- p < PPTR(y+1, x-1) && p < PPTR(y+1, x) && p < PPTR(y+1, x+1) &&
- p < NPTR(y-1, x-1) && p < NPTR(y-1, x) && p < NPTR(y-1, x+1) &&
- p < NPTR(y, x-1) && p < NPTR(y , x) && p < NPTR(y, x+1) &&
- p < NPTR(y+1, x-1) && p < NPTR(y+1, x) && p < NPTR(y+1, x+1)))) {
-
- if (*counter < max_feat)
- {
- x_out[*counter] = (float)y;
- y_out[*counter] = (float)x;
- layer_out[*counter] = layer;
- (*counter)++;
- }
- }
- }
- }
- }
-
-// Interpolates a scale-space extremum's location and scale to subpixel
-// accuracy to form an image feature. Rejects features with low contrast.
-// Based on Section 4 of Lowe's paper.
- template<typename T>
- void interpolateExtrema(
- float* x_out,
- float* y_out,
- unsigned* layer_out,
- float* response_out,
- float* size_out,
- unsigned* counter,
- const float* x_in,
- const float* y_in,
- const unsigned* layer_in,
- const unsigned extrema_feat,
- std::vector< Array<T> >& dog_pyr,
- const unsigned max_feat,
- const unsigned octave,
- const unsigned n_layers,
- const float contrast_thr,
- const float edge_thr,
- const float sigma,
- const float img_scale)
- {
- for (int f = 0; f < (int)extrema_feat; f++) {
- const float first_deriv_scale = img_scale*0.5f;
- const float second_deriv_scale = img_scale;
- const float cross_deriv_scale = img_scale*0.25f;
-
- float xl = 0, xy = 0, xx = 0, contr = 0;
- int i = 0;
-
- unsigned x = x_in[f];
- unsigned y = y_in[f];
- unsigned layer = layer_in[f];
-
- const T* prev_ptr = dog_pyr[octave*(n_layers+2) + layer-1].get();
- const T* center_ptr = dog_pyr[octave*(n_layers+2) + layer].get();
- const T* next_ptr = dog_pyr[octave*(n_layers+2) + layer+1].get();
-
- af::dim4 idims = dog_pyr[octave*(n_layers+2)].dims();
-
- bool converges = true;
-
- for (i = 0; i < MaxInterpSteps; i++) {
- float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
- (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
- (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale};
-
- float d2 = CPTR(x, y) * 2.f;
- float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
- float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
- float dss = (NPTR(x, y ) + PPTR(x, y ) - d2) * second_deriv_scale;
- float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
- CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
- float dxs = (NPTR(x+1, y) - NPTR(x-1, y) -
- PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale;
- float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) -
- PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale;
-
- float H[9] = {dxx, dxy, dxs,
- dxy, dyy, dys,
- dxs, dys, dss};
-
- float X[3];
- gaussianElimination<3>(H, dD, X);
-
- xl = -X[2];
- xy = -X[1];
- xx = -X[0];
-
- if (fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f)
- break;
-
- x += round(xx);
- y += round(xy);
- layer += round(xl);
-
- if (layer < 1 || layer > n_layers ||
- x < ImgBorder || x >= idims[1] - ImgBorder ||
- y < ImgBorder || y >= idims[0] - ImgBorder) {
- converges = false;
- break;
- }
- }
-
- // ensure convergence of interpolation
- if (i >= MaxInterpSteps || !converges)
- continue;
-
- float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
- (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
- (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale};
- float X[3] = {xx, xy, xl};
-
- float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
-
- contr = center_ptr[x*idims[0]+y]*img_scale + P * 0.5f;
- if(abs(contr) < (contrast_thr / n_layers))
- continue;
-
- // principal curvatures are computed using the trace and det of Hessian
- float d2 = CPTR(x, y) * 2.f;
- float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
- float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
- float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
- CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
-
- float tr = dxx + dyy;
- float det = dxx * dyy - dxy * dxy;
-
- // add FLT_EPSILON for double-precision compatibility
- if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
- continue;
-
- if (*counter < max_feat)
- {
- x_out[*counter] = (x + xx) * (1 << octave);
- y_out[*counter] = (y + xy) * (1 << octave);
- layer_out[*counter] = layer;
- response_out[*counter] = abs(contr);
- size_out[*counter] = sigma*pow(2.f, octave + (layer + xl) / n_layers);
- (*counter)++;
- }
- }
- }
-
-#undef CPTR
-#undef PPTR
-#undef NPTR
-
-// Remove duplicate keypoints
- void removeDuplicates(
- float* x_out,
- float* y_out,
- unsigned* layer_out,
- float* response_out,
- float* size_out,
- unsigned* counter,
- const std::vector<feat_t>& sorted_feat)
- {
- size_t nfeat = sorted_feat.size();
-
- for (size_t f = 0; f < nfeat; f++) {
- float prec_fctr = 1e4f;
-
- if (f < nfeat-1) {
- if (round(sorted_feat[f].f[0]*prec_fctr) == round(sorted_feat[f+1].f[0]*prec_fctr) &&
- round(sorted_feat[f].f[1]*prec_fctr) == round(sorted_feat[f+1].f[1]*prec_fctr) &&
- round(sorted_feat[f].f[2]*prec_fctr) == round(sorted_feat[f+1].f[2]*prec_fctr) &&
- round(sorted_feat[f].f[3]*prec_fctr) == round(sorted_feat[f+1].f[3]*prec_fctr) &&
- sorted_feat[f].l == sorted_feat[f+1].l)
- continue;
- }
-
- x_out[*counter] = sorted_feat[f].f[0];
- y_out[*counter] = sorted_feat[f].f[1];
- response_out[*counter] = sorted_feat[f].f[2];
- size_out[*counter] = sorted_feat[f].f[3];
- layer_out[*counter] = sorted_feat[f].l;
- (*counter)++;
- }
- }
-
-#define IPTR(Y, X) (img_ptr[(Y) * idims[0] + (X)])
-
-// Computes a canonical orientation for each image feature in an array. Based
-// on Section 5 of Lowe's paper. This function adds features to the array when
-// there is more than one dominant orientation at a given feature location.
- template<typename T>
- void calcOrientation(
- float* x_out,
- float* y_out,
- unsigned* layer_out,
- float* response_out,
- float* size_out,
- float* ori_out,
- unsigned* counter,
- const float* x_in,
- const float* y_in,
- const unsigned* layer_in,
- const float* response_in,
- const float* size_in,
- const unsigned total_feat,
- const std::vector< Array<T> >& gauss_pyr,
- const unsigned max_feat,
- const unsigned octave,
- const unsigned n_layers,
- const bool double_input)
- {
- const int n = OriHistBins;
-
- float hist[OriHistBins];
- float temphist[OriHistBins];
-
- for (unsigned f = 0; f < total_feat; f++) {
- // Load keypoint information
- const float real_x = x_in[f];
- const float real_y = y_in[f];
- const unsigned layer = layer_in[f];
- const float response = response_in[f];
- const float size = size_in[f];
-
- const int pt_x = (int)round(real_x / (1 << octave));
- const int pt_y = (int)round(real_y / (1 << octave));
-
- // Calculate auxiliary parameters
- const float scl_octv = size*0.5f / (1 << octave);
- const int radius = (int)round(OriRadius * scl_octv);
- const float sigma = OriSigFctr * scl_octv;
- const int len = (radius*2+1);
- const float exp_denom = 2.f * sigma * sigma;
-
- // Points img to correct Gaussian pyramid layer
- const Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
- const T* img_ptr = img.get();
-
- for (int i = 0; i < OriHistBins; i++)
- hist[i] = 0.f;
-
- af::dim4 idims = img.dims();
-
- // Calculate orientation histogram
- for (int l = 0; l < len*len; l++) {
- int i = l / len - radius;
- int j = l % len - radius;
-
- int y = pt_y + i;
- int x = pt_x + j;
- if (y < 1 || y >= idims[0] - 1 ||
- x < 1 || x >= idims[1] - 1)
- continue;
-
- float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
- float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
- float mag = sqrt(dx*dx+dy*dy);
- float ori = atan2(dy,dx);
- float w = exp(-(i*i + j*j)/exp_denom);
-
- int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
- bin = bin < n ? bin : 0;
-
- hist[bin] += w*mag;
- }
-
- for (int i = 0; i < SmoothOriPasses; i++) {
- for (int j = 0; j < n; j++) {
- temphist[j] = hist[j];
- }
- for (int j = 0; j < n; j++) {
- float prev = (j == 0) ? temphist[n-1] : temphist[j-1];
- float next = (j+1 == n) ? temphist[0] : temphist[j+1];
- hist[j] = 0.25f * prev + 0.5f * temphist[j] + 0.25f * next;
- }
- }
-
- float omax = hist[0];
- for (int i = 1; i < n; i++)
- omax = max(omax, hist[i]);
-
- float mag_thr = (float)(omax * OriPeakRatio);
- int l, r;
- for (int j = 0; j < n; j++) {
- l = (j == 0) ? n - 1 : j - 1;
- r = (j + 1) % n;
- if (hist[j] > hist[l] &&
- hist[j] > hist[r] &&
- hist[j] >= mag_thr) {
- if (*counter < max_feat) {
- float bin = j + 0.5f * (hist[l] - hist[r]) /
- (hist[l] - 2.0f*hist[j] + hist[r]);
- bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
- float ori = 360.f - ((360.f/n) * bin);
-
- float new_real_x = real_x;
- float new_real_y = real_y;
- float new_size = size;
-
- if (double_input) {
- float scale = 0.5f;
- new_real_x *= scale;
- new_real_y *= scale;
- new_size *= scale;
- }
-
- x_out[*counter] = new_real_x;
- y_out[*counter] = new_real_y;
- layer_out[*counter] = layer;
- response_out[*counter] = response;
- size_out[*counter] = new_size;
- ori_out[*counter] = ori;
- (*counter)++;
- }
- }
- }
- }
- }
-
- void normalizeDesc(
- float* desc,
- const int histlen)
- {
- float len_sq = 0.0f;
-
- for (int i = 0; i < histlen; i++)
- len_sq += desc[i] * desc[i];
-
- float len_inv = 1.0f / sqrt(len_sq);
-
- for (int i = 0; i < histlen; i++) {
- desc[i] *= len_inv;
- }
- }
-
-// Computes feature descriptors for features in an array. Based on Section 6
-// of Lowe's paper.
- template<typename T>
- void computeDescriptor(
- float* desc_out,
- const unsigned desc_len,
- const float* x_in,
- const float* y_in,
- const unsigned* layer_in,
- const float* response_in,
- const float* size_in,
- const float* ori_in,
- const unsigned total_feat,
- const std::vector< Array<T> >& gauss_pyr,
- const int d,
- const int n,
- const float scale,
- const float sigma,
- const unsigned octave,
- const unsigned n_layers)
- {
- float desc[128];
-
- for (unsigned f = 0; f < total_feat; f++) {
- const unsigned layer = layer_in[f];
- float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
- ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
- //const float size = size_in[f];
- const int fx = round(x_in[f] * scale);
- const int fy = round(y_in[f] * scale);
-
- // Points img to correct Gaussian pyramid layer
- Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
- const T* img_ptr = img.get();
- af::dim4 idims = img.dims();
-
- float cos_t = cos(ori);
- float sin_t = sin(ori);
- float bins_per_rad = n / (PI_VAL * 2.f);
- float exp_denom = d * d * 0.5f;
- float hist_width = DescrSclFctr * sigma * powf(2.f, layer/n_layers);
- int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
-
- int len = radius*2+1;
- const int histlen = d*d*n;
-
- for (int i = 0; i < histlen; i++)
- desc[i] = 0.f;
-
- // Calculate orientation histogram
- for (int l = 0; l < len*len; l++) {
- int i = l / len - radius;
- int j = l % len - radius;
-
- int y = fy + i;
- int x = fx + j;
-
- float x_rot = (j * cos_t - i * sin_t) / hist_width;
- float y_rot = (j * sin_t + i * cos_t) / hist_width;
- float xbin = x_rot + d/2 - 0.5f;
- float ybin = y_rot + d/2 - 0.5f;
-
- if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d &&
- y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
- float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
- float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
- float grad_mag = sqrt(dx*dx + dy*dy);
- float grad_ori = atan2(dy, dx) - ori;
- while (grad_ori < 0.0f)
- grad_ori += PI_VAL*2;
- while (grad_ori >= PI_VAL*2)
- grad_ori -= PI_VAL*2;
-
- float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
- float obin = grad_ori * bins_per_rad;
- float mag = grad_mag*w;
-
- int x0 = floor(xbin);
- int y0 = floor(ybin);
- int o0 = floor(obin);
- xbin -= x0;
- ybin -= y0;
- obin -= o0;
-
- for (int yl = 0; yl <= 1; yl++) {
- int yb = y0 + yl;
- if (yb >= 0 && yb < d) {
- float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
- for (int xl = 0; xl <= 1; xl++) {
- int xb = x0 + xl;
- if (xb >= 0 && xb < d) {
- float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
- for (int ol = 0; ol <= 1; ol++) {
- int ob = (o0 + ol) % n;
- float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
- desc[(yb*d + xb)*n + ob] += v_o;
- }
- }
- }
- }
- }
- }
- }
-
- normalizeDesc(desc, histlen);
-
- for (int i = 0; i < d*d*n; i++)
- desc[i] = min(desc[i], DescrMagThr);
-
- normalizeDesc(desc, histlen);
-
- // Calculate final descriptor values
- for (int k = 0; k < d*d*n; k++) {
- desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
- }
- }
- }
-
-#undef IPTR
-
- template<typename T, typename convAccT>
- Array<T> createInitialImage(
- const Array<T>& img,
- const float init_sigma,
- const bool double_input)
- {
- af::dim4 idims = img.dims();
-
- Array<T> init_img = createEmptyArray<T>(af::dim4());
-
- float s = (double_input) ? sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4)
- : sqrt(init_sigma * init_sigma - InitSigma * InitSigma);
-
- Array<T> filter = gauss_filter<T>(s);
-
- if (double_input) {
- Array<T> double_img = resize<T>(img, idims[0] * 2, idims[1] * 2, AF_INTERP_BILINEAR);
- init_img = convolve2<T, convAccT, false>(double_img, filter, filter);
- }
- else {
- init_img = convolve2<T, convAccT, false>(init_img, filter, filter);
- }
-
- return init_img;
- }
-
- template<typename T, typename convAccT>
- std::vector< Array<T> > buildGaussPyr(
- const Array<T>& init_img,
- const unsigned n_octaves,
- const unsigned n_layers,
- const float init_sigma)
- {
- // Precompute Gaussian sigmas using the following formula:
- // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
- std::vector<float> sig_layers(n_layers + 3);
- sig_layers[0] = init_sigma;
- float k = std::pow(2.0f, 1.0f / n_layers);
- for (unsigned i = 1; i < n_layers + 3; i++) {
- float sig_prev = std::pow(k, i-1) * init_sigma;
- float sig_total = sig_prev * k;
- sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
- }
-
- // Gaussian Pyramid
- std::vector< Array<T> > gauss_pyr(n_octaves * (n_layers+3), createEmptyArray<T>(af::dim4()));
- for (unsigned o = 0; o < n_octaves; o++) {
- for (unsigned l = 0; l < n_layers+3; l++) {
- unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1;
- unsigned idx = o*(n_layers+3) + l;
-
- if (o == 0 && l == 0) {
- gauss_pyr[idx] = init_img;
- }
- else if (l == 0) {
- af::dim4 sdims = gauss_pyr[src_idx].dims();
- gauss_pyr[idx] = resize<T>(gauss_pyr[src_idx], sdims[0] / 2, sdims[1] / 2, AF_INTERP_BILINEAR);
- }
- else {
- Array<T> filter = gauss_filter<T>(sig_layers[l]);
-
- gauss_pyr[idx] = convolve2<T, convAccT, false>(gauss_pyr[src_idx], filter, filter);
- }
- }
- }
-
- return gauss_pyr;
- }
-
- template<typename T>
- std::vector< Array<T> > buildDoGPyr(
- std::vector< Array<T> >& gauss_pyr,
- const unsigned n_octaves,
- const unsigned n_layers)
- {
- // DoG Pyramid
- std::vector< Array<T> > dog_pyr(n_octaves * (n_layers+2), createEmptyArray<T>(af::dim4()));
- for (unsigned o = 0; o < n_octaves; o++) {
- for (unsigned l = 0; l < n_layers+2; l++) {
- unsigned idx = o*(n_layers+2) + l;
- unsigned bottom = o*(n_layers+3) + l;
- unsigned top = o*(n_layers+3) + l+1;
-
- dog_pyr[idx] = createEmptyArray<T>(gauss_pyr[bottom].dims());
-
- sub<T>(dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]);
- }
- }
-
- return dog_pyr;
- }
-
-
- template<typename T, typename convAccT>
- unsigned sift_impl(Array<float>& x, Array<float>& y, Array<float>& score,
- Array<float>& ori, Array<float>& size, Array<float>& desc,
- const Array<T>& in, const unsigned n_layers,
- const float contrast_thr, const float edge_thr,
- const float init_sigma, const bool double_input,
- const float img_scale, const float feature_ratio)
- {
- af::dim4 idims = in.dims();
-
- const unsigned min_dim = (double_input) ? min(idims[0]*2, idims[1]*2)
- : min(idims[0], idims[1]);
- const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
-
- Array<T> init_img = createInitialImage<T, convAccT>(in, init_sigma, double_input);
-
- std::vector< Array<T> > gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
-
- std::vector< Array<T> > dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers);
-
- std::vector<float*> x_pyr(n_octaves, NULL);
- std::vector<float*> y_pyr(n_octaves, NULL);
- std::vector<float*> response_pyr(n_octaves, NULL);
- std::vector<float*> size_pyr(n_octaves, NULL);
- std::vector<float*> ori_pyr(n_octaves, NULL);
- std::vector<float*> desc_pyr(n_octaves, NULL);
- std::vector<unsigned> feat_pyr(n_octaves, 0);
- unsigned total_feat = 0;
-
- const unsigned d = DescrWidth;
- const unsigned n = DescrHistBins;
- const unsigned desc_len = d*d*n;
-
- for (unsigned i = 0; i < n_octaves; i++) {
- af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims();
- if (ddims[0]-2*ImgBorder < 1 ||
- ddims[1]-2*ImgBorder < 1)
- continue;
-
- const unsigned imel = ddims[0] * ddims[1];
- const unsigned max_feat = ceil(imel * feature_ratio);
-
- float* extrema_x = memAlloc<float>(max_feat);
- float* extrema_y = memAlloc<float>(max_feat);
- unsigned* extrema_layer = memAlloc<unsigned>(max_feat);
- unsigned extrema_feat = 0;
-
- for (unsigned j = 1; j <= n_layers; j++) {
- unsigned prev = i*(n_layers+2) + j-1;
- unsigned center = i*(n_layers+2) + j;
- unsigned next = i*(n_layers+2) + j+1;
-
- unsigned layer = j;
-
- float extrema_thr = 0.5f * contrast_thr / n_layers;
- detectExtrema<T>(extrema_x, extrema_y, extrema_layer, &extrema_feat,
- dog_pyr[prev], dog_pyr[center], dog_pyr[next],
- layer, max_feat, extrema_thr);
- }
-
- extrema_feat = min(extrema_feat, max_feat);
-
- if (extrema_feat == 0) {
- memFree(extrema_x);
- memFree(extrema_y);
- memFree(extrema_layer);
-
- continue;
- }
-
- unsigned interp_feat = 0;
-
- float* interp_x = memAlloc<float>(extrema_feat);
- float* interp_y = memAlloc<float>(extrema_feat);
- unsigned* interp_layer = memAlloc<unsigned>(extrema_feat);
- float* interp_response = memAlloc<float>(extrema_feat);
- float* interp_size = memAlloc<float>(extrema_feat);
-
- interpolateExtrema<T>(interp_x, interp_y, interp_layer,
- interp_response, interp_size, &interp_feat,
- extrema_x, extrema_y, extrema_layer, extrema_feat,
- dog_pyr, max_feat, i, n_layers,
- contrast_thr, edge_thr, init_sigma, img_scale);
-
- interp_feat = min(interp_feat, max_feat);
-
- if (interp_feat == 0) {
- memFree(interp_x);
- memFree(interp_y);
- memFree(interp_layer);
- memFree(interp_response);
- memFree(interp_size);
-
- continue;
- }
-
- std::vector<feat_t> sorted_feat;
- array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat);
- std::sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp);
-
- memFree(interp_x);
- memFree(interp_y);
- memFree(interp_layer);
- memFree(interp_response);
- memFree(interp_size);
-
- unsigned nodup_feat = 0;
-
- float* nodup_x = memAlloc<float>(interp_feat);
- float* nodup_y = memAlloc<float>(interp_feat);
- unsigned* nodup_layer = memAlloc<unsigned>(interp_feat);
- float* nodup_response = memAlloc<float>(interp_feat);
- float* nodup_size = memAlloc<float>(interp_feat);
-
- removeDuplicates(nodup_x, nodup_y, nodup_layer,
- nodup_response, nodup_size, &nodup_feat,
- sorted_feat);
-
- const unsigned max_oriented_feat = nodup_feat * 3;
-
- float* oriented_x = memAlloc<float>(max_oriented_feat);
- float* oriented_y = memAlloc<float>(max_oriented_feat);
- unsigned* oriented_layer = memAlloc<unsigned>(max_oriented_feat);
- float* oriented_response = memAlloc<float>(max_oriented_feat);
- float* oriented_size = memAlloc<float>(max_oriented_feat);
- float* oriented_ori = memAlloc<float>(max_oriented_feat);
-
- unsigned oriented_feat = 0;
-
- calcOrientation<T>(oriented_x, oriented_y, oriented_layer,
- oriented_response, oriented_size, oriented_ori, &oriented_feat,
- nodup_x, nodup_y, nodup_layer,
- nodup_response, nodup_size, nodup_feat,
- gauss_pyr, max_oriented_feat, i, n_layers, double_input);
-
- memFree(nodup_x);
- memFree(nodup_y);
- memFree(nodup_layer);
- memFree(nodup_response);
- memFree(nodup_size);
-
- if (oriented_feat == 0) {
- memFree(oriented_x);
- memFree(oriented_y);
- memFree(oriented_layer);
- memFree(oriented_response);
- memFree(oriented_size);
- memFree(oriented_ori);
-
- continue;
- }
-
- float* desc = memAlloc<float>(oriented_feat * desc_len);
-
- float scale = 1.f/(1 << i);
- if (double_input) scale *= 2.f;
-
- computeDescriptor<T>(desc, desc_len,
- oriented_x, oriented_y, oriented_layer,
- oriented_response, oriented_size, oriented_ori,
- oriented_feat, gauss_pyr, d, n, scale, init_sigma, i, n_layers);
-
- total_feat += oriented_feat;
- feat_pyr[i] = oriented_feat;
-
- if (oriented_feat > 0) {
- x_pyr[i] = oriented_x;
- y_pyr[i] = oriented_y;
- response_pyr[i] = oriented_response;
- ori_pyr[i] = oriented_ori;
- size_pyr[i] = oriented_size;
- desc_pyr[i] = desc;
- }
- }
-
- if (total_feat > 0) {
- const af::dim4 total_feat_dims(total_feat);
- const af::dim4 desc_dims(desc_len, total_feat);
-
- // Allocate output memory
- x = createEmptyArray<float>(total_feat_dims);
- y = createEmptyArray<float>(total_feat_dims);
- score = createEmptyArray<float>(total_feat_dims);
- ori = createEmptyArray<float>(total_feat_dims);
- size = createEmptyArray<float>(total_feat_dims);
- desc = createEmptyArray<float>(desc_dims);
-
- float* x_ptr = x.get();
- float* y_ptr = y.get();
- float* score_ptr = score.get();
- float* ori_ptr = ori.get();
- float* size_ptr = size.get();
- float* desc_ptr = desc.get();
-
- unsigned offset = 0;
- for (unsigned i = 0; i < n_octaves; i++) {
- if (feat_pyr[i] == 0)
- continue;
-
- memcpy(x_ptr+offset, x_pyr[i], feat_pyr[i] * sizeof(float));
- memcpy(y_ptr+offset, y_pyr[i], feat_pyr[i] * sizeof(float));
- memcpy(score_ptr+offset, response_pyr[i], feat_pyr[i] * sizeof(float));
- memcpy(ori_ptr+offset, ori_pyr[i], feat_pyr[i] * sizeof(float));
- memcpy(size_ptr+offset, size_pyr[i], feat_pyr[i] * sizeof(float));
-
- memcpy(desc_ptr+(offset*desc_len), desc_pyr[i], feat_pyr[i] * desc_len * sizeof(float));
-
- memFree(x_pyr[i]);
- memFree(y_pyr[i]);
- memFree(response_pyr[i]);
- memFree(ori_pyr[i]);
- memFree(size_pyr[i]);
- memFree(desc_pyr[i]);
-
- offset += feat_pyr[i];
- }
- }
-
- return total_feat;
- }
-}
diff --git a/src/backend/cuda/kernel/sift_nonfree.hpp b/src/backend/cuda/kernel/sift_nonfree.hpp
deleted file mode 100644
index c8de759..0000000
--- a/src/backend/cuda/kernel/sift_nonfree.hpp
+++ /dev/null
@@ -1,1370 +0,0 @@
-/*******************************************************
- * Copyright (c) 2015, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-// The source code contained in this file is based on the original code by
-// Rob Hess. Please note that SIFT is an algorithm patented and protected
-// by US law, before using this code or any binary forms generated from it,
-// verify that you have permission to do so. The original license by Rob Hess
-// can be read below:
-//
-// Copyright (c) 2006-2012, Rob Hess <rob at iqengines.com>
-// All rights reserved.
-//
-// The following patent has been issued for methods embodied in this
-// software: "Method and apparatus for identifying scale invariant features
-// in an image and use of same for locating an object in an image," David
-// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
-// filed March 8, 1999. Asignee: The University of British Columbia. For
-// further details, contact David Lowe (lowe at cs.ubc.ca) or the
-// University-Industry Liaison Office of the University of British
-// Columbia.
-//
-// Note that restrictions imposed by this patent (and possibly others)
-// exist independently of and may be in conflict with the freedoms granted
-// in this license, which refers to copyright of the program, not patents
-// for any methods that it implements. Both copyright and patent law must
-// be obeyed to legally use and redistribute this program and it is not the
-// purpose of this license to induce you to infringe any patents or other
-// property right claims or to contest validity of any such claims. If you
-// redistribute or use the program, then this license merely protects you
-// from committing copyright infringement. It does not protect you from
-// committing patent infringement. So, before you do anything with this
-// program, make sure that you have permission to do so not merely in terms
-// of copyright, but also in terms of patent law.
-//
-// Please note that this license is not to be understood as a guarantee
-// either. If you use the program according to this license, but in
-// conflict with patent law, it does not mean that the licensor will refund
-// you for any losses that you incur if you are sued for your patent
-// infringement.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-// * Redistributions of source code must retain the above copyright and
-// patent notices, this list of conditions and the following
-// disclaimer.
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in
-// the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Oregon State University nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include <af/defines.h>
-#include <dispatch.hpp>
-#include <err_cuda.hpp>
-#include <debug_cuda.hpp>
-#include <memory.hpp>
-#include "shared.hpp"
-
-#include <convolve_common.hpp>
-#include "convolve.hpp"
-#include "resize.hpp"
-
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
-#include <thrust/generate.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/gather.h>
-#include <thrust/random.h>
-
-#include <cfloat>
-
-namespace cuda
-{
-
-namespace kernel
-{
-
-static const dim_t SIFT_THREADS = 256;
-static const dim_t SIFT_THREADS_X = 32;
-static const dim_t SIFT_THREADS_Y = 8;
-
-#define PI_VAL 3.14159265358979323846f
-
-// default width of descriptor histogram array
-#define DESCR_WIDTH 4
-
-// default number of bins per histogram in descriptor array
-#define DESCR_HIST_BINS 8
-
-// assumed gaussian blur for input image
-#define INIT_SIGMA 0.5f
-
-// width of border in which to ignore keypoints
-#define IMG_BORDER 5
-
-// maximum steps of keypointerpolation before failure
-#define MAX_INTERP_STEPS 5
-
-// default number of bins in histogram for orientation assignment
-#define ORI_HIST_BINS 36
-
-// determines gaussian sigma for orientation assignment
-#define ORI_SIG_FCTR 1.5f
-
-// determines the radius of the region used in orientation assignment */
-#define ORI_RADIUS (3.0f * ORI_SIG_FCTR)
-
-// number of passes of orientation histogram smoothing
-#define SMOOTH_ORI_PASSES 2
-
-// orientation magnitude relative to max that results in new feature
-#define ORI_PEAK_RATIO 0.8f
-
-// determines the size of a single descriptor orientation histogram
-#define DESCR_SCL_FCTR 3.f
-
-// threshold on magnitude of elements of descriptor vector
-#define DESC_MAG_THR 0.2f
-
-// factor used to convert floating-podescriptor to unsigned char
-#define INT_DESCR_FCTR 512.f
-
-template<typename T>
-void gaussian1D(T* out, const int dim, double sigma=0.0)
-{
- if(!(sigma>0)) sigma = 0.25*dim;
-
- T sum = (T)0;
- for(int i=0;i<dim;i++)
- {
- int x = i-(dim-1)/2;
- T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
- out[i] = el;
- sum += el;
- }
-
- for(int k=0;k<dim;k++)
- out[k] /= sum;
-}
-
-template<typename T>
-Param<T> gauss_filter(float sigma)
-{
- // Using 6-sigma rule
- unsigned gauss_len = (unsigned)round(sigma * 6 + 1) | 1;
-
- T* h_gauss = new T[gauss_len];
- gaussian1D(h_gauss, gauss_len, sigma);
- Param<T> gauss_filter;
- gauss_filter.dims[0] = gauss_len;
- gauss_filter.strides[0] = 1;
-
- for (int k = 1; k < 4; k++) {
- gauss_filter.dims[k] = 1;
- gauss_filter.strides[k] = gauss_filter.dims[k-1] * gauss_filter.strides[k-1];
- }
-
- dim_t gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];
- gauss_filter.ptr = memAlloc<T>(gauss_elem);
- CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(T), cudaMemcpyHostToDevice));
-
- delete[] h_gauss;
-
- return gauss_filter;
-}
-
-template<int N>
-__inline__ __device__ void gaussianElimination(float* A, float* b, float* x)
-{
- // forward elimination
- #pragma unroll
- for (int i = 0; i < N-1; i++) {
- #pragma unroll
- for (int j = i+1; j < N; j++) {
- float s = A[j*N+i] / A[i*N+i];
-
- #pragma unroll
- for (int k = i; k < N; k++)
- A[j*N+k] -= s * A[i*N+k];
-
- b[j] -= s * b[i];
- }
- }
-
- #pragma unroll
- for (int i = 0; i < N; i++)
- x[i] = 0;
-
- // backward substitution
- float sum = 0;
- #pragma unroll
- for (int i = 0; i <= N-2; i++) {
- sum = b[i];
- #pragma unroll
- for (int j = i+1; j < N; j++)
- sum -= A[i*N+j] * x[j];
- x[i] = sum / A[i*N+i];
- }
-}
-
-__inline__ __device__ void normalizeDesc(
- float* desc,
- float* accum,
- const int histlen)
-{
- int tid_x = threadIdx.x;
- int tid_y = threadIdx.y;
- int bsz_x = blockDim.x;
-
- for (int i = tid_x; i < histlen; i += bsz_x)
- accum[tid_x] = desc[tid_y*histlen+i]*desc[tid_y*histlen+i];
- __syncthreads();
-
- if (tid_x < 64)
- accum[tid_x] += accum[tid_x+64];
- __syncthreads();
- if (tid_x < 32)
- accum[tid_x] += accum[tid_x+32];
- __syncthreads();
- if (tid_x < 16)
- accum[tid_x] += accum[tid_x+16];
- __syncthreads();
- if (tid_x < 8)
- accum[tid_x] += accum[tid_x+8];
- __syncthreads();
- if (tid_x < 4)
- accum[tid_x] += accum[tid_x+4];
- __syncthreads();
- if (tid_x < 2)
- accum[tid_x] += accum[tid_x+2];
- __syncthreads();
- if (tid_x < 1)
- accum[tid_x] += accum[tid_x+1];
- __syncthreads();
-
- float len_sq = accum[0];
- float len_inv = 1.0f / sqrtf(len_sq);
-
- for (int i = tid_x; i < histlen; i += bsz_x) {
- desc[tid_y*histlen+i] *= len_inv;
- }
- __syncthreads();
-}
-
-template<typename T>
-__global__ void sub(
- Param<T> out,
- CParam<T> in,
- const unsigned nel,
- const unsigned n_layers)
-{
- unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
-
- for (unsigned l = 0; l < n_layers; l++)
- out.ptr[l*nel + i] = in.ptr[(l+1)*nel + i] - in.ptr[l*nel + i];
-}
-
-#define SCPTR(Y, X) (s_center[(Y) * s_i + (X)])
-#define SPPTR(Y, X) (s_prev[(Y) * s_i + (X)])
-#define SNPTR(Y, X) (s_next[(Y) * s_i + (X)])
-#define DPTR(Z, Y, X) (dog.ptr[(Z) * imel + (Y) * dim0 + (X)])
-
-// Determines whether a pixel is a scale-space extremum by comparing it to its
-// 3x3x3 pixel neighborhood.
-template<typename T>
-__global__ void detectExtrema(
- float* x_out,
- float* y_out,
- unsigned* layer_out,
- unsigned* counter,
- CParam<T> dog,
- const unsigned max_feat,
- const float threshold)
-{
- const int dim0 = dog.dims[0];
- const int dim1 = dog.dims[1];
- const int imel = dim0 * dim1;
-
- const int tid_i = threadIdx.x;
- const int tid_j = threadIdx.y;
- const int bsz_i = blockDim.x;
- const int bsz_j = blockDim.y;
- const int i = blockIdx.x * bsz_i + tid_i+IMG_BORDER;
- const int j = blockIdx.y * bsz_j + tid_j+IMG_BORDER;
-
- const int x = tid_i+1;
- const int y = tid_j+1;
-
- // One pixel border for each side
- const int s_i = bsz_i+2;
- const int s_j = bsz_j+2;
-
- SharedMemory<float> shared;
- float* shrdMem = shared.getPointer();
- float* s_next = shrdMem;
- float* s_center = shrdMem + s_i * s_j;
- float* s_prev = shrdMem + s_i * s_j * 2;
-
- for (int l = 1; l < dog.dims[2]-1; l++) {
- const int s_i_half = s_i/2;
- const int s_j_half = s_j/2;
- if (tid_i < s_i_half && tid_j < s_j_half && i < dim0-IMG_BORDER+1 && j < dim1-IMG_BORDER+1) {
- SNPTR(tid_j, tid_i) = DPTR(l+1, j-1, i-1);
- SCPTR(tid_j, tid_i) = DPTR(l, j-1, i-1);
- SPPTR(tid_j, tid_i) = DPTR(l-1, j-1, i-1);
-
- SNPTR(tid_j, tid_i+s_i_half) = DPTR((l+1), j-1, i-1+s_i_half);
- SCPTR(tid_j, tid_i+s_i_half) = DPTR((l ), j-1, i-1+s_i_half);
- SPPTR(tid_j, tid_i+s_i_half) = DPTR((l-1), j-1, i-1+s_i_half);
-
- SNPTR(tid_j+s_j_half, tid_i) = DPTR(l+1, j-1+s_j_half, i-1);
- SCPTR(tid_j+s_j_half, tid_i) = DPTR(l, j-1+s_j_half, i-1);
- SPPTR(tid_j+s_j_half, tid_i) = DPTR(l-1, j-1+s_j_half, i-1);
-
- SNPTR(tid_j+s_j_half, tid_i+s_i_half) = DPTR(l+1, j-1+s_j_half, i-1+s_i_half);
- SCPTR(tid_j+s_j_half, tid_i+s_i_half) = DPTR(l, j-1+s_j_half, i-1+s_i_half);
- SPPTR(tid_j+s_j_half, tid_i+s_i_half) = DPTR(l-1, j-1+s_j_half, i-1+s_i_half);
- }
- __syncthreads();
-
- float p = SCPTR(y, x);
-
- if (abs(p) > threshold && i < dim0-IMG_BORDER && j < dim1-IMG_BORDER &&
- ((p > 0 && p > SCPTR(y-1, x-1) && p > SCPTR(y-1, x) &&
- p > SCPTR(y-1, x+1) && p > SCPTR(y, x-1) && p > SCPTR(y, x+1) &&
- p > SCPTR(y+1, x-1) && p > SCPTR(y+1, x) && p > SCPTR(y+1, x+1) &&
- p > SPPTR(y-1, x-1) && p > SPPTR(y-1, x) && p > SPPTR(y-1, x+1) &&
- p > SPPTR(y, x-1) && p > SPPTR(y , x) && p > SPPTR(y, x+1) &&
- p > SPPTR(y+1, x-1) && p > SPPTR(y+1, x) && p > SPPTR(y+1, x+1) &&
- p > SNPTR(y-1, x-1) && p > SNPTR(y-1, x) && p > SNPTR(y-1, x+1) &&
- p > SNPTR(y, x-1) && p > SNPTR(y , x) && p > SNPTR(y, x+1) &&
- p > SNPTR(y+1, x-1) && p > SNPTR(y+1, x) && p > SNPTR(y+1, x+1)) ||
- (p < 0 && p < SCPTR(y-1, x-1) && p < SCPTR(y-1, x) &&
- p < SCPTR(y-1, x+1) && p < SCPTR(y, x-1) && p < SCPTR(y, x+1) &&
- p < SCPTR(y+1, x-1) && p < SCPTR(y+1, x) && p < SCPTR(y+1, x+1) &&
- p < SPPTR(y-1, x-1) && p < SPPTR(y-1, x) && p < SPPTR(y-1, x+1) &&
- p < SPPTR(y, x-1) && p < SPPTR(y , x) && p < SPPTR(y, x+1) &&
- p < SPPTR(y+1, x-1) && p < SPPTR(y+1, x) && p < SPPTR(y+1, x+1) &&
- p < SNPTR(y-1, x-1) && p < SNPTR(y-1, x) && p < SNPTR(y-1, x+1) &&
- p < SNPTR(y, x-1) && p < SNPTR(y , x) && p < SNPTR(y, x+1) &&
- p < SNPTR(y+1, x-1) && p < SNPTR(y+1, x) && p < SNPTR(y+1, x+1)))) {
-
- unsigned idx = atomicAdd(counter, 1u);
- if (idx < max_feat)
- {
- x_out[idx] = (float)j;
- y_out[idx] = (float)i;
- layer_out[idx] = l;
- }
- }
- __syncthreads();
- }
-}
-
-#undef SCPTR
-#undef SPPTR
-#undef SNPTR
-#define CPTR(Y, X) (center_ptr[(Y) * dim0 + (X)])
-#define PPTR(Y, X) (prev_ptr[(Y) * dim0 + (X)])
-#define NPTR(Y, X) (next_ptr[(Y) * dim0 + (X)])
-
-// Interpolates a scale-space extremum's location and scale to subpixel
-// accuracy to form an image feature. Rejects features with low contrast.
-// Based on Section 4 of Lowe's paper.
-template<typename T>
-__global__ void interpolateExtrema(
- float* x_out,
- float* y_out,
- unsigned* layer_out,
- float* response_out,
- float* size_out,
- unsigned* counter,
- const float* x_in,
- const float* y_in,
- const unsigned* layer_in,
- const unsigned extrema_feat,
- const CParam<T> dog_octave,
- const unsigned max_feat,
- const unsigned octave,
- const unsigned n_layers,
- const float contrast_thr,
- const float edge_thr,
- const float sigma,
- const float img_scale)
-{
- const unsigned f = blockIdx.x * blockDim.x + threadIdx.x;
-
- if (f < extrema_feat) {
- const float first_deriv_scale = img_scale*0.5f;
- const float second_deriv_scale = img_scale;
- const float cross_deriv_scale = img_scale*0.25f;
-
- float xl = 0, xy = 0, xx = 0, contr = 0;
- int i = 0;
-
- unsigned x = x_in[f];
- unsigned y = y_in[f];
- unsigned layer = layer_in[f];
-
- const int dim0 = dog_octave.dims[0];
- const int dim1 = dog_octave.dims[1];
- const int imel = dim0 * dim1;
-
- const T* prev_ptr = dog_octave.ptr + (layer-1) * imel;
- const T* center_ptr = dog_octave.ptr + (layer) * imel;
- const T* next_ptr = dog_octave.ptr + (layer+1) * imel;
-
- for(i = 0; i < MAX_INTERP_STEPS; i++) {
- float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
- (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
- (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale};
-
- float d2 = CPTR(x, y) * 2.f;
- float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
- float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
- float dss = (NPTR(x, y ) + PPTR(x, y ) - d2) * second_deriv_scale;
- float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
- CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
- float dxs = (NPTR(x+1, y) - NPTR(x-1, y) -
- PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale;
- float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) -
- PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale;
-
- float H[9] = {dxx, dxy, dxs,
- dxy, dyy, dys,
- dxs, dys, dss};
-
- float X[3];
- gaussianElimination<3>(H, dD, X);
-
- xl = -X[2];
- xy = -X[1];
- xx = -X[0];
-
- if (abs(xl) < 0.5f && abs(xy) < 0.5f && abs(xx) < 0.5f)
- break;
-
- x += round(xx);
- y += round(xy);
- layer += round(xl);
-
- if (layer < 1 || layer > n_layers ||
- x < IMG_BORDER || x >= dim1 - IMG_BORDER ||
- y < IMG_BORDER || y >= dim0 - IMG_BORDER)
- return;
- }
-
- // ensure convergence of interpolation
- if (i >= MAX_INTERP_STEPS)
- return;
-
- float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
- (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
- (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale};
- float X[3] = {xx, xy, xl};
-
- float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
-
- contr = CPTR(x, y)*img_scale + P * 0.5f;
- if (abs(contr) < (contrast_thr / n_layers))
- return;
-
- // principal curvatures are computed using the trace and det of Hessian
- float d2 = CPTR(x, y) * 2.f;
- float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
- float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
- float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
- CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
-
- float tr = dxx + dyy;
- float det = dxx * dyy - dxy * dxy;
-
- // add FLT_EPSILON for double-precision compatibility
- if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
- return;
-
- unsigned ridx = atomicAdd(counter, 1u);
-
- if (ridx < max_feat)
- {
- x_out[ridx] = (x + xx) * (1 << octave);
- y_out[ridx] = (y + xy) * (1 << octave);
- layer_out[ridx] = layer;
- response_out[ridx] = abs(contr);
- size_out[ridx] = sigma*pow(2.f, octave + (layer + xl) / n_layers);
- }
- }
-}
-
-#undef CPTR
-#undef PPTR
-#undef NPTR
-
-// Remove duplicate keypoints
-__global__ void removeDuplicates(
- float* x_out,
- float* y_out,
- unsigned* layer_out,
- float* response_out,
- float* size_out,
- unsigned* counter,
- const float* x_in,
- const float* y_in,
- const unsigned* layer_in,
- const float* response_in,
- const float* size_in,
- const unsigned total_feat)
-{
- const unsigned f = blockIdx.x * blockDim.x + threadIdx.x;
-
- if (f >= total_feat)
- return;
-
- float prec_fctr = 1e4f;
-
- if (f < total_feat-1) {
- if (round(x_in[f]*prec_fctr) == round(x_in[f+1]*prec_fctr) &&
- round(y_in[f]*prec_fctr) == round(y_in[f+1]*prec_fctr) &&
- layer_in[f] == layer_in[f+1] &&
- round(response_in[f]*prec_fctr) == round(response_in[f+1]*prec_fctr) &&
- round(size_in[f]*prec_fctr) == round(size_in[f+1]*prec_fctr))
- return;
- }
-
- unsigned idx = atomicAdd(counter, 1);
-
- x_out[idx] = x_in[f];
- y_out[idx] = y_in[f];
- layer_out[idx] = layer_in[f];
- response_out[idx] = response_in[f];
- size_out[idx] = size_in[f];
-}
-
-#define IPTR(Y, X) (img_ptr[(Y) * dim0 + (X)])
-
-// Computes a canonical orientation for each image feature in an array. Based
-// on Section 5 of Lowe's paper. This function adds features to the array when
-// there is more than one dominant orientation at a given feature location.
-template<typename T>
-__global__ void calcOrientation(
- float* x_out,
- float* y_out,
- unsigned* layer_out,
- float* response_out,
- float* size_out,
- float* ori_out,
- unsigned* counter,
- const float* x_in,
- const float* y_in,
- const unsigned* layer_in,
- const float* response_in,
- const float* size_in,
- const unsigned total_feat,
- const CParam<T> gauss_octave,
- const unsigned max_feat,
- const unsigned octave,
- const bool double_input)
-{
- const int tid_x = threadIdx.x;
- const int tid_y = threadIdx.y;
- const int bsz_x = blockDim.x;
- const int bsz_y = blockDim.y;
-
- const unsigned f = blockIdx.y * bsz_y + tid_y;
-
- const int n = ORI_HIST_BINS;
-
- SharedMemory<float> shared;
- float* shrdMem = shared.getPointer();
- float* hist = shrdMem;
- float* temphist = shrdMem + n*8;
-
- // Initialize temporary histogram
- for (int i = tid_x; i < ORI_HIST_BINS; i += bsz_x)
- hist[tid_y*n + i] = 0.f;
- __syncthreads();
-
- float real_x, real_y, response, size;
- unsigned layer;
-
- if (f < total_feat) {
- // Load keypoint information
- real_x = x_in[f];
- real_y = y_in[f];
- layer = layer_in[f];
- response = response_in[f];
- size = size_in[f];
-
- const int pt_x = (int)round(real_x / (1 << octave));
- const int pt_y = (int)round(real_y / (1 << octave));
-
- // Calculate auxiliary parameters
- const float scl_octv = size*0.5f / (1 << octave);
- const int radius = (int)round(ORI_RADIUS * scl_octv);
- const float sigma = ORI_SIG_FCTR * scl_octv;
- const int len = (radius*2+1);
- const float exp_denom = 2.f * sigma * sigma;
-
- const int dim0 = gauss_octave.dims[0];
- const int dim1 = gauss_octave.dims[1];
- const int imel = dim0 * dim1;
-
- // Points img to correct Gaussian pyramid layer
- const T* img_ptr = gauss_octave.ptr + layer * imel;
-
- // Calculate orientation histogram
- for (int l = tid_x; l < len*len; l += bsz_x) {
- int i = l / len - radius;
- int j = l % len - radius;
-
- int y = pt_y + i;
- int x = pt_x + j;
- if (y < 1 || y >= dim0 - 1 ||
- x < 1 || x >= dim1 - 1)
- continue;
-
- float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
- float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
- float mag = sqrt(dx*dx+dy*dy);
- float ori = atan2(dy,dx);
- float w = exp(-(i*i + j*j)/exp_denom);
-
- int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
- bin = bin < n ? bin : 0;
-
- atomicAdd(&hist[tid_y*n+bin], w*mag);
- }
- }
- __syncthreads();
-
- for (int i = 0; i < SMOOTH_ORI_PASSES; i++) {
- for (int j = tid_x; j < n; j += bsz_x) {
- temphist[tid_y*n+j] = hist[tid_y*n+j];
- }
- __syncthreads();
- for (int j = tid_x; j < n; j += bsz_x) {
- float prev = (j == 0) ? temphist[tid_y*n+n-1] : temphist[tid_y*n+j-1];
- float next = (j+1 == n) ? temphist[tid_y*n] : temphist[tid_y*n+j+1];
- hist[tid_y*n+j] = 0.25f * prev + 0.5f * temphist[tid_y*n+j] + 0.25f * next;
- }
- __syncthreads();
- }
-
- for (int i = tid_x; i < n; i += bsz_x)
- temphist[tid_y*n+i] = hist[tid_y*n+i];
- __syncthreads();
-
- if (tid_x < 16)
- temphist[tid_y*n+tid_x] = fmax(hist[tid_y*n+tid_x], hist[tid_y*n+tid_x+16]);
- __syncthreads();
- if (tid_x < 8)
- temphist[tid_y*n+tid_x] = fmax(temphist[tid_y*n+tid_x], temphist[tid_y*n+tid_x+8]);
- __syncthreads();
- if (tid_x < 4) {
- temphist[tid_y*n+tid_x] = fmax(temphist[tid_y*n+tid_x], hist[tid_y*n+tid_x+32]);
- temphist[tid_y*n+tid_x] = fmax(temphist[tid_y*n+tid_x], temphist[tid_y*n+tid_x+4]);
- }
- __syncthreads();
- if (tid_x < 2)
- temphist[tid_y*n+tid_x] = fmax(temphist[tid_y*n+tid_x], temphist[tid_y*n+tid_x+2]);
- __syncthreads();
- if (tid_x < 1)
- temphist[tid_y*n+tid_x] = fmax(temphist[tid_y*n+tid_x], temphist[tid_y*n+tid_x+1]);
- __syncthreads();
- float omax = temphist[tid_y*n];
-
- if (f < total_feat) {
- float mag_thr = (float)(omax * ORI_PEAK_RATIO);
- int l, r;
- for (int j = tid_x; j < n; j+=bsz_x) {
- l = (j == 0) ? n - 1 : j - 1;
- r = (j + 1) % n;
- if (hist[tid_y*n+j] > hist[tid_y*n+l] &&
- hist[tid_y*n+j] > hist[tid_y*n+r] &&
- hist[tid_y*n+j] >= mag_thr) {
- int idx = atomicAdd(counter, 1);
-
- if (idx < max_feat) {
- float bin = j + 0.5f * (hist[tid_y*n+l] - hist[tid_y*n+r]) /
- (hist[tid_y*n+l] - 2.0f*hist[tid_y*n+j] + hist[tid_y*n+r]);
- bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
- float ori = 360.f - ((360.f/n) * bin);
-
- float new_real_x = real_x;
- float new_real_y = real_y;
- float new_size = size;
-
- if (double_input) {
- float scale = 0.5f;
- new_real_x *= scale;
- new_real_y *= scale;
- new_size *= scale;
- }
-
- x_out[idx] = new_real_x;
- y_out[idx] = new_real_y;
- layer_out[idx] = layer;
- response_out[idx] = response;
- size_out[idx] = new_size;
- ori_out[idx] = ori;
- }
- }
- }
- }
-}
-
-// Computes feature descriptors for features in an array. Based on Section 6
-// of Lowe's paper.
-template<typename T>
-__global__ void computeDescriptor(
- float* desc_out,
- const unsigned desc_len,
- const unsigned histsz,
- const float* x_in,
- const float* y_in,
- const unsigned* layer_in,
- const float* response_in,
- const float* size_in,
- const float* ori_in,
- const unsigned total_feat,
- const CParam<T> gauss_octave,
- const int d,
- const int n,
- const float scale,
- const float sigma,
- const int n_layers)
-{
- const int tid_x = threadIdx.x;
- const int tid_y = threadIdx.y;
- const int bsz_x = blockDim.x;
- const int bsz_y = blockDim.y;
-
- const int f = blockIdx.y * bsz_y + tid_y;
-
- SharedMemory<float> shared;
- float* shrdMem = shared.getPointer();
- float* desc = shrdMem;
- float* accum = shrdMem + desc_len * histsz;
-
- const int histlen = (d)*(d)*(n);
-
- for (int i = tid_x; i < histlen*histsz; i += bsz_x)
- desc[tid_y*histlen+i] = 0.f;
- __syncthreads();
-
- if (f < total_feat) {
- const unsigned layer = layer_in[f];
- float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
- ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
- //const float size = size_in[f];
- const int fx = round(x_in[f] * scale);
- const int fy = round(y_in[f] * scale);
-
- const int dim0 = gauss_octave.dims[0];
- const int dim1 = gauss_octave.dims[1];
- const int imel = dim0 * dim1;
-
- // Points img to correct Gaussian pyramid layer
- const T* img_ptr = gauss_octave.ptr + layer * imel;
-
- float cos_t = cosf(ori);
- float sin_t = sinf(ori);
- float bins_per_rad = n / (PI_VAL * 2.f);
- float exp_denom = d * d * 0.5f;
- float hist_width = DESCR_SCL_FCTR * sigma * powf(2.f, layer/n_layers);
- int radius = hist_width * sqrtf(2.f) * (d + 1.f) * 0.5f + 0.5f;
-
- int len = radius*2+1;
- const int hist_off = (tid_x % histsz) * desc_len;
-
- // Calculate orientation histogram
- for (int l = tid_x; l < len*len; l += bsz_x) {
- int i = l / len - radius;
- int j = l % len - radius;
-
- int y = fy + i;
- int x = fx + j;
-
- float x_rot = (j * cos_t - i * sin_t) / hist_width;
- float y_rot = (j * sin_t + i * cos_t) / hist_width;
- float xbin = x_rot + d/2 - 0.5f;
- float ybin = y_rot + d/2 - 0.5f;
-
- if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d &&
- y > 0 && y < dim0 - 1 && x > 0 && x < dim1 - 1) {
- float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
- float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
- float grad_mag = sqrtf(dx*dx + dy*dy);
- float grad_ori = atan2f(dy, dx) - ori;
- while (grad_ori < 0.0f)
- grad_ori += PI_VAL*2;
- while (grad_ori >= PI_VAL*2)
- grad_ori -= PI_VAL*2;
-
- float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
- float obin = grad_ori * bins_per_rad;
- float mag = grad_mag*w;
-
- int x0 = floor(xbin);
- int y0 = floor(ybin);
- int o0 = floor(obin);
- xbin -= x0;
- ybin -= y0;
- obin -= o0;
-
- for (int yl = 0; yl <= 1; yl++) {
- int yb = y0 + yl;
- if (yb >= 0 && yb < d) {
- float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
- for (int xl = 0; xl <= 1; xl++) {
- int xb = x0 + xl;
- if (xb >= 0 && xb < d) {
- float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
- for (int ol = 0; ol <= 1; ol++) {
- int ob = (o0 + ol) % n;
- float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
- atomicAdd(&desc[hist_off + tid_y*desc_len + (yb*d + xb)*n + ob], v_o);
- }
- }
- }
- }
- }
- }
- }
- }
- __syncthreads();
-
- // Combine histograms (reduces previous atomicAdd overhead)
- for (int l = tid_x; l < desc_len*4; l += bsz_x)
- desc[l] += desc[l+4*desc_len];
- __syncthreads();
- for (int l = tid_x; l < desc_len*2; l += bsz_x)
- desc[l ] += desc[l+2*desc_len];
- __syncthreads();
- for (int l = tid_x; l < desc_len; l += bsz_x)
- desc[l] += desc[l+desc_len];
- __syncthreads();
-
- normalizeDesc(desc, accum, histlen);
-
- for (int i = tid_x; i < d*d*n; i += bsz_x)
- desc[tid_y*desc_len+i] = min(desc[tid_y*desc_len+i], DESC_MAG_THR);
- __syncthreads();
-
- normalizeDesc(desc, accum, histlen);
-
- if (f < total_feat) {
- // Calculate final descriptor values
- for (int k = tid_x; k < d*d*n; k += bsz_x)
- desc_out[f*desc_len+k] = round(min(255.f, desc[tid_y*desc_len+k] * INT_DESCR_FCTR));
- }
-}
-
-#undef IPTR
-
-template<typename T, typename convAccT>
-Param<T> createInitialImage(
- CParam<T> img,
- const float init_sigma,
- const bool double_input)
-{
- Param<T> init_img, init_tmp;
- init_img.dims[0] = init_tmp.dims[0] = (double_input) ? img.dims[0] * 2 : img.dims[0];
- init_img.dims[1] = init_tmp.dims[1] = (double_input) ? img.dims[1] * 2 : img.dims[1];
- init_img.strides[0] = init_tmp.strides[0] = 1;
- init_img.strides[1] = init_tmp.strides[1] = init_img.dims[0];
-
- for (int k = 2; k < 4; k++) {
- init_img.dims[k] = 1;
- init_img.strides[k] = init_img.dims[k-1] * init_img.strides[k-1];
- init_tmp.dims[k] = 1;
- init_tmp.strides[k] = init_tmp.dims[k-1] * init_tmp.strides[k-1];
- }
-
- dim_t init_img_el = init_img.strides[3] * init_img.dims[3];
- init_img.ptr = memAlloc<T>(init_img_el);
- init_tmp.ptr = memAlloc<T>(init_img_el);
-
- float s = (double_input) ? sqrt(init_sigma * init_sigma - INIT_SIGMA * INIT_SIGMA * 4)
- : sqrt(init_sigma * init_sigma - INIT_SIGMA * INIT_SIGMA);
-
- Param<convAccT> filter = gauss_filter<convAccT>(s);
-
- if (double_input) {
- resize<T, AF_INTERP_BILINEAR>(init_img, img);
- convolve2<T, convAccT, 0, false>(init_tmp, init_img, filter);
- }
- else
- convolve2<T, convAccT, 0, false>(init_tmp, img, filter);
-
- convolve2<T, convAccT, 1, false>(init_img, CParam<T>(init_tmp), filter);
-
- memFree(init_tmp.ptr);
- memFree(filter.ptr);
-
- return init_img;
-}
-
-template<typename T, typename convAccT>
-std::vector< Param<T> > buildGaussPyr(
- Param<T> init_img,
- const unsigned n_octaves,
- const unsigned n_layers,
- const float init_sigma)
-{
- // Precompute Gaussian sigmas using the following formula:
- // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
- std::vector<float> sig_layers(n_layers + 3);
- sig_layers[0] = init_sigma;
- float k = std::pow(2.0f, 1.0f / n_layers);
- for (unsigned i = 1; i < n_layers + 3; i++) {
- float sig_prev = std::pow(k, i-1) * init_sigma;
- float sig_total = sig_prev * k;
- sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
- }
-
- // Gaussian Pyramid
- std::vector<Param<T> > gauss_pyr(n_octaves);
- std::vector<Param<T> > tmp_pyr(n_octaves * (n_layers+3));
- for (unsigned o = 0; o < n_octaves; o++) {
- gauss_pyr[o].dims[0] = (o == 0) ? init_img.dims[0] : gauss_pyr[o-1].dims[0] / 2;
- gauss_pyr[o].dims[1] = (o == 0) ? init_img.dims[1] : gauss_pyr[o-1].dims[1] / 2;
- gauss_pyr[o].dims[2] = n_layers+3;
- gauss_pyr[o].dims[3] = 1;
-
- gauss_pyr[o].strides[0] = 1;
- gauss_pyr[o].strides[1] = gauss_pyr[o].dims[0] * gauss_pyr[o].strides[0];
- gauss_pyr[o].strides[2] = gauss_pyr[o].dims[1] * gauss_pyr[o].strides[1];
- gauss_pyr[o].strides[3] = gauss_pyr[o].dims[2] * gauss_pyr[o].strides[2];
-
- const unsigned nel = gauss_pyr[o].dims[3] * gauss_pyr[o].strides[3];
- gauss_pyr[o].ptr = memAlloc<T>(nel);
-
- for (unsigned l = 0; l < n_layers+3; l++) {
- unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1;
- unsigned idx = o*(n_layers+3) + l;
-
- if (o == 0 && l == 0) {
- for (int k = 0; k < 4; k++) {
- tmp_pyr[idx].dims[k] = init_img.dims[k];
- tmp_pyr[idx].strides[k] = init_img.strides[k];
- }
- tmp_pyr[idx].ptr = init_img.ptr;
- }
- else if (l == 0) {
- tmp_pyr[idx].dims[0] = tmp_pyr[src_idx].dims[0] / 2;
- tmp_pyr[idx].dims[1] = tmp_pyr[src_idx].dims[1] / 2;
- tmp_pyr[idx].strides[0] = 1;
- tmp_pyr[idx].strides[1] = tmp_pyr[idx].dims[0];
-
- for (int k = 2; k < 4; k++) {
- tmp_pyr[idx].dims[k] = 1;
- tmp_pyr[idx].strides[k] = tmp_pyr[idx].dims[k-1] * tmp_pyr[idx].strides[k-1];
- }
-
- dim_t lvl_el = tmp_pyr[idx].strides[3] * tmp_pyr[idx].dims[3];
- tmp_pyr[idx].ptr = memAlloc<T>(lvl_el);
-
- resize<T, AF_INTERP_BILINEAR>(tmp_pyr[idx], tmp_pyr[src_idx]);
- }
- else {
- for (int k = 0; k < 4; k++) {
- tmp_pyr[idx].dims[k] = tmp_pyr[src_idx].dims[k];
- tmp_pyr[idx].strides[k] = tmp_pyr[src_idx].strides[k];
- }
- dim_t lvl_el = tmp_pyr[idx].strides[3] * tmp_pyr[idx].dims[3];
- tmp_pyr[idx].ptr = memAlloc<T>(lvl_el);
-
- Param<T> tmp;
- for (int k = 0; k < 4; k++) {
- tmp.dims[k] = tmp_pyr[idx].dims[k];
- tmp.strides[k] = tmp_pyr[idx].strides[k];
- }
- tmp.ptr = memAlloc<T>(lvl_el);
-
- Param<convAccT> filter = gauss_filter<convAccT>(sig_layers[l]);
-
- convolve2<T, convAccT, 0, false>(tmp, tmp_pyr[src_idx], filter);
- convolve2<T, convAccT, 1, false>(tmp_pyr[idx], CParam<T>(tmp), filter);
-
- memFree(tmp.ptr);
- memFree(filter.ptr);
- }
-
- const unsigned imel = tmp_pyr[idx].dims[3] * tmp_pyr[idx].strides[3];
- const unsigned offset = imel * l;
-
- //getQueue().enqueueCopyBuffer(*tmp_pyr[idx].data, *gauss_pyr[o].data, 0, offset*sizeof(T), imel * sizeof(T));
- CUDA_CHECK(cudaMemcpy(gauss_pyr[o].ptr + offset, tmp_pyr[idx].ptr, imel * sizeof(T), cudaMemcpyDeviceToDevice));
- }
- }
-
- for (unsigned o = 0; o < n_octaves; o++) {
- for (unsigned l = 0; l < n_layers+3; l++) {
- unsigned idx = o*(n_layers+3) + l;
- memFree(tmp_pyr[idx].ptr);
- }
- }
-
- return gauss_pyr;
-}
-
-template<typename T>
-std::vector< Param<T> > buildDoGPyr(
- std::vector< Param<T> >& gauss_pyr,
- const unsigned n_octaves,
- const unsigned n_layers)
-{
- // DoG Pyramid
- std::vector< Param<T> > dog_pyr(n_octaves);
- for (unsigned o = 0; o < n_octaves; o++) {
- for (int k = 0; k < 4; k++) {
- dog_pyr[o].dims[k] = (k == 2) ? gauss_pyr[o].dims[k]-1 : gauss_pyr[o].dims[k];
- dog_pyr[o].strides[k] = (k == 0) ? 1 : dog_pyr[o].dims[k-1] * dog_pyr[o].strides[k-1];
- }
-
- dog_pyr[o].ptr = memAlloc<T>(dog_pyr[o].dims[3] * dog_pyr[o].strides[3]);
-
- const unsigned nel = dog_pyr[o].dims[1] * dog_pyr[o].strides[1];
- const unsigned dog_layers = n_layers+2;
-
- dim3 threads(SIFT_THREADS);
- dim3 blocks(divup(nel, threads.x));
- CUDA_LAUNCH((sub<T>), blocks, threads,
- dog_pyr[o], gauss_pyr[o], nel, dog_layers);
- POST_LAUNCH_CHECK();
- }
-
- return dog_pyr;
-}
-
-template <typename T>
-void update_permutation(thrust::device_ptr<T>& keys, thrust::device_vector<int>& permutation)
-{
- // temporary storage for keys
- thrust::device_vector<T> temp(permutation.size());
-
- // permute the keys with the current reordering
- THRUST_SELECT((thrust::gather), permutation.begin(), permutation.end(), keys, temp.begin());
-
- // stable_sort the permuted keys and update the permutation
- THRUST_SELECT((thrust::stable_sort_by_key), temp.begin(), temp.end(), permutation.begin());
-}
-
-template <typename T>
-void apply_permutation(thrust::device_ptr<T>& keys, thrust::device_vector<int>& permutation)
-{
- // copy keys to temporary vector
- thrust::device_vector<T> temp(keys, keys+permutation.size());
-
- // permute the keys
- THRUST_SELECT((thrust::gather), permutation.begin(), permutation.end(), temp.begin(), keys);
-}
-
-template<typename T, typename convAccT>
-void sift(unsigned* out_feat,
- unsigned* out_dlen,
- float** d_x,
- float** d_y,
- float** d_score,
- float** d_ori,
- float** d_size,
- float** d_desc,
- CParam<T> img,
- const unsigned n_layers,
- const float contrast_thr,
- const float edge_thr,
- const float init_sigma,
- const bool double_input,
- const float img_scale,
- const float feature_ratio)
-{
- const unsigned min_dim = (double_input) ? min(img.dims[0]*2, img.dims[1]*2)
- : min(img.dims[0], img.dims[1]);
- const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
-
- Param<T> init_img = createInitialImage<T, convAccT>(img, init_sigma, double_input);
-
- std::vector< Param<T> > gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
-
- std::vector< Param<T> > dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers);
-
- std::vector<float*> d_x_pyr(n_octaves, NULL);
- std::vector<float*> d_y_pyr(n_octaves, NULL);
- std::vector<float*> d_response_pyr(n_octaves, NULL);
- std::vector<float*> d_size_pyr(n_octaves, NULL);
- std::vector<float*> d_ori_pyr(n_octaves, NULL);
- std::vector<float*> d_desc_pyr(n_octaves, NULL);
- std::vector<unsigned> feat_pyr(n_octaves, 0);
- unsigned total_feat = 0;
-
- const unsigned d = DESCR_WIDTH;
- const unsigned n = DESCR_HIST_BINS;
- const unsigned desc_len = d*d*n;
-
- unsigned* d_count = memAlloc<unsigned>(1);
- for (unsigned i = 0; i < n_octaves; i++) {
- if (dog_pyr[i].dims[0]-2*IMG_BORDER < 1 ||
- dog_pyr[i].dims[1]-2*IMG_BORDER < 1)
- continue;
-
- const unsigned imel = dog_pyr[i].dims[0] * dog_pyr[i].dims[1];
- const unsigned max_feat = ceil(imel * feature_ratio);
-
- CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(unsigned),
- cuda::getStream(cuda::getActiveDeviceId())));
-
- float* d_extrema_x = memAlloc<float>(max_feat);
- float* d_extrema_y = memAlloc<float>(max_feat);
- unsigned* d_extrema_layer = memAlloc<unsigned>(max_feat);
-
- int dim0 = dog_pyr[i].dims[0];
- int dim1 = dog_pyr[i].dims[1];
-
- dim3 threads(SIFT_THREADS_X, SIFT_THREADS_Y);
- dim3 blocks(divup(dim0-2*IMG_BORDER, threads.x), divup(dim1-2*IMG_BORDER, threads.y));
-
- float extrema_thr = 0.5f * contrast_thr / n_layers;
- const size_t extrema_shared_size = (threads.x+2) * (threads.y+2) * 3 * sizeof(float);
- CUDA_LAUNCH_SMEM((detectExtrema<T>), blocks, threads, extrema_shared_size,
- d_extrema_x, d_extrema_y, d_extrema_layer, d_count,
- CParam<T>(dog_pyr[i]), max_feat, extrema_thr);
- POST_LAUNCH_CHECK();
-
- unsigned extrema_feat = 0;
- CUDA_CHECK(cudaMemcpy(&extrema_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
- extrema_feat = min(extrema_feat, max_feat);
-
- if (extrema_feat == 0) {
- memFree(d_extrema_x);
- memFree(d_extrema_y);
- memFree(d_extrema_layer);
-
- continue;
- }
-
- CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(unsigned),
- cuda::getStream(cuda::getActiveDeviceId())));
-
- unsigned interp_feat = 0;
-
- float* d_interp_x = memAlloc<float>(extrema_feat);
- float* d_interp_y = memAlloc<float>(extrema_feat);
- unsigned* d_interp_layer = memAlloc<unsigned>(extrema_feat);
- float* d_interp_response = memAlloc<float>(extrema_feat);
- float* d_interp_size = memAlloc<float>(extrema_feat);
-
- threads = dim3(SIFT_THREADS, 1);
- blocks = dim3(divup(extrema_feat, threads.x), 1);
-
- CUDA_LAUNCH((interpolateExtrema<T>), blocks, threads,
- d_interp_x, d_interp_y, d_interp_layer,
- d_interp_response, d_interp_size, d_count,
- d_extrema_x, d_extrema_y, d_extrema_layer, extrema_feat,
- dog_pyr[i], max_feat, i, n_layers,
- contrast_thr, edge_thr, init_sigma, img_scale);
- POST_LAUNCH_CHECK();
-
- CUDA_CHECK(cudaMemcpy(&interp_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
- interp_feat = min(interp_feat, max_feat);
-
- CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(unsigned),
- cuda::getStream(cuda::getActiveDeviceId())));
-
- if (interp_feat == 0) {
- memFree(d_interp_x);
- memFree(d_interp_y);
- memFree(d_interp_layer);
- memFree(d_interp_response);
- memFree(d_interp_size);
-
- continue;
- }
-
- thrust::device_ptr<float> interp_x_ptr = thrust::device_pointer_cast(d_interp_x);
- thrust::device_ptr<float> interp_y_ptr = thrust::device_pointer_cast(d_interp_y);
- thrust::device_ptr<unsigned> interp_layer_ptr = thrust::device_pointer_cast(d_interp_layer);
- thrust::device_ptr<float> interp_response_ptr = thrust::device_pointer_cast(d_interp_response);
- thrust::device_ptr<float> interp_size_ptr = thrust::device_pointer_cast(d_interp_size);
-
- thrust::device_vector<int> permutation(interp_feat);
- thrust::sequence(permutation.begin(), permutation.end());
-
- update_permutation<float>(interp_size_ptr, permutation);
- update_permutation<float>(interp_response_ptr, permutation);
- update_permutation<unsigned>(interp_layer_ptr, permutation);
- update_permutation<float>(interp_y_ptr, permutation);
- update_permutation<float>(interp_x_ptr, permutation);
-
- apply_permutation<float>(interp_size_ptr, permutation);
- apply_permutation<float>(interp_response_ptr, permutation);
- apply_permutation<unsigned>(interp_layer_ptr, permutation);
- apply_permutation<float>(interp_y_ptr, permutation);
- apply_permutation<float>(interp_x_ptr, permutation);
-
- memFree(d_extrema_x);
- memFree(d_extrema_y);
- memFree(d_extrema_layer);
-
- float* d_nodup_x = memAlloc<float>(interp_feat);
- float* d_nodup_y = memAlloc<float>(interp_feat);
- unsigned* d_nodup_layer = memAlloc<unsigned>(interp_feat);
- float* d_nodup_response = memAlloc<float>(interp_feat);
- float* d_nodup_size = memAlloc<float>(interp_feat);
-
- threads = dim3(SIFT_THREADS, 1);
- blocks = dim3(divup(interp_feat, threads.x), 1);
-
- CUDA_LAUNCH((removeDuplicates), blocks, threads,
- d_nodup_x, d_nodup_y, d_nodup_layer,
- d_nodup_response, d_nodup_size, d_count,
- d_interp_x, d_interp_y, d_interp_layer,
- d_interp_response, d_interp_size, interp_feat);
- POST_LAUNCH_CHECK();
-
- memFree(d_interp_x);
- memFree(d_interp_y);
- memFree(d_interp_layer);
- memFree(d_interp_response);
- memFree(d_interp_size);
-
- unsigned nodup_feat = 0;
- CUDA_CHECK(cudaMemcpy(&nodup_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
- CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(unsigned),
- cuda::getStream(cuda::getActiveDeviceId())));
-
- const unsigned max_oriented_feat = nodup_feat * 3;
-
- float* d_oriented_x = memAlloc<float>(max_oriented_feat);
- float* d_oriented_y = memAlloc<float>(max_oriented_feat);
- unsigned* d_oriented_layer = memAlloc<unsigned>(max_oriented_feat);
- float* d_oriented_response = memAlloc<float>(max_oriented_feat);
- float* d_oriented_size = memAlloc<float>(max_oriented_feat);
- float* d_oriented_ori = memAlloc<float>(max_oriented_feat);
-
- threads = dim3(SIFT_THREADS_X, SIFT_THREADS_Y);
- blocks = dim3(1, divup(nodup_feat, threads.y));
-
- const size_t ori_shared_size = ORI_HIST_BINS * threads.y * 2 * sizeof(float);
- CUDA_LAUNCH_SMEM((calcOrientation<T>), blocks, threads, ori_shared_size,
- d_oriented_x, d_oriented_y, d_oriented_layer,
- d_oriented_response, d_oriented_size, d_oriented_ori, d_count,
- d_nodup_x, d_nodup_y, d_nodup_layer,
- d_nodup_response, d_nodup_size, nodup_feat,
- gauss_pyr[i], max_oriented_feat, i, double_input);
- POST_LAUNCH_CHECK();
-
- memFree(d_nodup_x);
- memFree(d_nodup_y);
- memFree(d_nodup_layer);
- memFree(d_nodup_response);
- memFree(d_nodup_size);
-
- unsigned oriented_feat = 0;
- CUDA_CHECK(cudaMemcpy(&oriented_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
- oriented_feat = min(oriented_feat, max_oriented_feat);
-
- if (oriented_feat == 0) {
- memFree(d_oriented_x);
- memFree(d_oriented_y);
- memFree(d_oriented_layer);
- memFree(d_oriented_response);
- memFree(d_oriented_size);
- memFree(d_oriented_ori);
-
- continue;
- }
-
- float* d_desc = memAlloc<float>(oriented_feat * desc_len);
-
- float scale = 1.f/(1 << i);
- if (double_input) scale *= 2.f;
-
- threads = dim3(SIFT_THREADS, 1);
- blocks = dim3(1, divup(oriented_feat, threads.y));
-
- const unsigned histsz = 8;
- const size_t shared_size = desc_len * (histsz+1) * sizeof(float);
-
- CUDA_LAUNCH_SMEM((computeDescriptor<T>), blocks, threads, shared_size,
- d_desc, desc_len, histsz,
- d_oriented_x, d_oriented_y, d_oriented_layer,
- d_oriented_response, d_oriented_size, d_oriented_ori,
- oriented_feat, gauss_pyr[i], d, n, scale, init_sigma, n_layers);
- POST_LAUNCH_CHECK();
-
- total_feat += oriented_feat;
- feat_pyr[i] = oriented_feat;
-
- if (oriented_feat > 0) {
- d_x_pyr[i] = d_oriented_x;
- d_y_pyr[i] = d_oriented_y;
- d_response_pyr[i] = d_oriented_response;
- d_ori_pyr[i] = d_oriented_ori;
- d_size_pyr[i] = d_oriented_size;
- d_desc_pyr[i] = d_desc;
- }
- }
-
- memFree(d_count);
-
- for (size_t i = 0; i < gauss_pyr.size(); i++)
- memFree(gauss_pyr[i].ptr);
- for (size_t i = 0; i < dog_pyr.size(); i++)
- memFree(dog_pyr[i].ptr);
-
- // Allocate output memory
- *d_x = memAlloc<float>(total_feat);
- *d_y = memAlloc<float>(total_feat);
- *d_score = memAlloc<float>(total_feat);
- *d_ori = memAlloc<float>(total_feat);
- *d_size = memAlloc<float>(total_feat);
- *d_desc = memAlloc<float>(total_feat * desc_len);
-
- unsigned offset = 0;
- for (unsigned i = 0; i < n_octaves; i++) {
- if (feat_pyr[i] == 0)
- continue;
-
- CUDA_CHECK(cudaMemcpy(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*d_score+offset, d_response_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
- CUDA_CHECK(cudaMemcpy(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-
- CUDA_CHECK(cudaMemcpy(*d_desc+(offset*desc_len), d_desc_pyr[i],
- feat_pyr[i] * desc_len * sizeof(float), cudaMemcpyDeviceToDevice));
-
- memFree(d_x_pyr[i]);
- memFree(d_y_pyr[i]);
- memFree(d_response_pyr[i]);
- memFree(d_ori_pyr[i]);
- memFree(d_size_pyr[i]);
- memFree(d_desc_pyr[i]);
-
- offset += feat_pyr[i];
- }
-
- // Sets number of output features
- *out_feat = total_feat;
- *out_dlen = desc_len;
-}
-
-} // namespace kernel
-
-} // namespace cuda
diff --git a/src/backend/opencl/kernel/sift_nonfree.cl b/src/backend/opencl/kernel/sift_nonfree.cl
deleted file mode 100644
index ef63cab..0000000
--- a/src/backend/opencl/kernel/sift_nonfree.cl
+++ /dev/null
@@ -1,806 +0,0 @@
-/*******************************************************
- * Copyright (c) 2015, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-// The source code contained in this file is based on the original code by
-// Rob Hess. Please note that SIFT is an algorithm patented and protected
-// by US law, before using this code or any binary forms generated from it,
-// verify that you have permission to do so. The original license by Rob Hess
-// can be read below:
-//
-// Copyright (c) 2006-2012, Rob Hess <rob at iqengines.com>
-// All rights reserved.
-//
-// The following patent has been issued for methods embodied in this
-// software: "Method and apparatus for identifying scale invariant features
-// in an image and use of same for locating an object in an image," David
-// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
-// filed March 8, 1999. Asignee: The University of British Columbia. For
-// further details, contact David Lowe (lowe at cs.ubc.ca) or the
-// University-Industry Liaison Office of the University of British
-// Columbia.
-//
-// Note that restrictions imposed by this patent (and possibly others)
-// exist independently of and may be in conflict with the freedoms granted
-// in this license, which refers to copyright of the program, not patents
-// for any methods that it implements. Both copyright and patent law must
-// be obeyed to legally use and redistribute this program and it is not the
-// purpose of this license to induce you to infringe any patents or other
-// property right claims or to contest validity of any such claims. If you
-// redistribute or use the program, then this license merely protects you
-// from committing copyright infringement. It does not protect you from
-// committing patent infringement. So, before you do anything with this
-// program, make sure that you have permission to do so not merely in terms
-// of copyright, but also in terms of patent law.
-//
-// Please note that this license is not to be understood as a guarantee
-// either. If you use the program according to this license, but in
-// conflict with patent law, it does not mean that the licensor will refund
-// you for any losses that you incur if you are sued for your patent
-// infringement.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-// * Redistributions of source code must retain the above copyright and
-// patent notices, this list of conditions and the following
-// disclaimer.
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in
-// the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Oregon State University nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// width of border in which to ignore keypoints
-#define IMG_BORDER 5
-
-// maximum steps of keypoint interpolation before failure
-#define MAX_INTERP_STEPS 5
-
-// default number of bins in histogram for orientation assignment
-#define ORI_HIST_BINS 36
-
-// determines gaussian sigma for orientation assignment
-#define ORI_SIG_FCTR 1.5f
-
-// determines the radius of the region used in orientation assignment
-#define ORI_RADIUS (3 * ORI_SIG_FCTR)
-
-// number of passes of orientation histogram smoothing
-#define SMOOTH_ORI_PASSES 2
-
-// orientation magnitude relative to max that results in new feature
-#define ORI_PEAK_RATIO 0.8f
-
-// determines the size of a single descriptor orientation histogram
-#define DESCR_SCL_FCTR 3.f
-
-// threshold on magnitude of elements of descriptor vector
-#define DESCR_MAG_THR 0.2f
-
-// factor used to convert floating-point descriptor to unsigned char
-#define INT_DESCR_FCTR 512.f
-
-#define PI_VAL 3.14159265358979323846f
-
-void gaussianElimination(float* A, float* b, float* x, const int n)
-{
- // forward elimination
- for (int i = 0; i < n-1; i++) {
- for (int j = i+1; j < n; j++) {
- float s = A[j*n+i] / A[i*n+i];
-
- //for (int k = i+1; k < n; k++)
- for (int k = i; k < n; k++)
- A[j*n+k] -= s * A[i*n+k];
-
- b[j] -= s * b[i];
- }
- }
-
- for (int i = 0; i < n; i++)
- x[i] = 0;
-
- // backward substitution
- float sum = 0;
- for (int i = 0; i <= n-2; i++) {
- sum = b[i];
- for (int j = i+1; j < n; j++)
- sum -= A[i*n+j] * x[j];
- x[i] = sum / A[i*n+i];
- }
-}
-
-inline void fatomic_add(volatile __local float *source, const float operand) {
- union {
- unsigned int intVal;
- float floatVal;
- } newVal;
- union {
- unsigned int intVal;
- float floatVal;
- } prevVal;
- do {
- prevVal.floatVal = *source;
- newVal.floatVal = prevVal.floatVal + operand;
- } while (atomic_cmpxchg((volatile __local unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
-}
-
-inline void normalizeDesc(
- __local float* desc,
- __local float* accum,
- const int histlen,
- int lid_x,
- int lid_y,
- int lsz_x)
-{
- for (int i = lid_x; i < histlen; i += lsz_x)
- accum[i] = desc[lid_y*histlen+i]*desc[lid_y*histlen+i];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- float sum = 0.0f;
- for (int i = 0; i < histlen; i++)
- sum += desc[lid_y*histlen+i]*desc[lid_y*histlen+i];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (lid_x < 64)
- accum[lid_x] += accum[lid_x+64];
- barrier(CLK_LOCAL_MEM_FENCE);
- if (lid_x < 32)
- accum[lid_x] += accum[lid_x+32];
- barrier(CLK_LOCAL_MEM_FENCE);
- if (lid_x < 16)
- accum[lid_x] += accum[lid_x+16];
- barrier(CLK_LOCAL_MEM_FENCE);
- if (lid_x < 8)
- accum[lid_x] += accum[lid_x+8];
- barrier(CLK_LOCAL_MEM_FENCE);
- if (lid_x < 4)
- accum[lid_x] += accum[lid_x+4];
- barrier(CLK_LOCAL_MEM_FENCE);
- if (lid_x < 2)
- accum[lid_x] += accum[lid_x+2];
- barrier(CLK_LOCAL_MEM_FENCE);
- if (lid_x < 1)
- accum[lid_x] += accum[lid_x+1];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- float len_sq = accum[0];
- float len_inv = 1.0f / sqrt(len_sq);
-
- for (int i = lid_x; i < histlen; i += lsz_x) {
- desc[lid_y*histlen+i] *= len_inv;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-__kernel void sub(
- __global T* out,
- __global const T* in,
- unsigned nel,
- unsigned n_layers)
-{
- unsigned i = get_global_id(0);
-
- for (unsigned l = 0; l < n_layers; l++)
- out[l*nel + i] = in[l*nel + i] - in[(l+1)*nel + i];
-}
-
-#define LCPTR(Y, X) (l_center[(Y) * l_i + (X)])
-#define LPPTR(Y, X) (l_prev[(Y) * l_i + (X)])
-#define LNPTR(Y, X) (l_next[(Y) * l_i + (X)])
-
-// Determines whether a pixel is a scale-space extremum by comparing it to its
-// 3x3x3 pixel neighborhood.
-__kernel void detectExtrema(
- __global float* x_out,
- __global float* y_out,
- __global unsigned* layer_out,
- __global unsigned* counter,
- __global const T* dog,
- KParam iDoG,
- const unsigned max_feat,
- const float threshold,
- __local float* l_mem)
-{
- const int dim0 = iDoG.dims[0];
- const int dim1 = iDoG.dims[1];
- const int imel = iDoG.dims[0]*iDoG.dims[1];
-
- const int lid_i = get_local_id(0);
- const int lid_j = get_local_id(1);
- const int lsz_i = get_local_size(0);
- const int lsz_j = get_local_size(1);
- const int i = get_group_id(0) * lsz_i + lid_i+IMG_BORDER;
- const int j = get_group_id(1) * lsz_j + lid_j+IMG_BORDER;
-
- // One pixel border for each side
- const int l_i = lsz_i+2;
- const int l_j = lsz_j+2;
-
- __local float* l_prev = l_mem;
- __local float* l_center = l_mem + l_i * l_j;
- __local float* l_next = l_mem + l_i * l_j * 2;
-
- const int x = lid_i+1;
- const int y = lid_j+1;
-
- for (int l = 1; l < iDoG.dims[2]-1; l++) {
- const int l_i_half = l_i/2;
- const int l_j_half = l_j/2;
- if (lid_i < l_i_half && lid_j < l_j_half && i < dim0-IMG_BORDER+1 && j < dim1-IMG_BORDER+1) {
- l_next [lid_j*l_i + lid_i] = (float)dog[(l+1)*imel+(j-1)*dim0+i-1];
- l_center[lid_j*l_i + lid_i] = (float)dog[(l )*imel+(j-1)*dim0+i-1];
- l_prev [lid_j*l_i + lid_i] = (float)dog[(l-1)*imel+(j-1)*dim0+i-1];
-
- l_next [lid_j*l_i + lid_i+l_i_half] = (float)dog[(l+1)*imel+(j-1)*dim0+i-1+l_i_half];
- l_center[lid_j*l_i + lid_i+l_i_half] = (float)dog[(l )*imel+(j-1)*dim0+i-1+l_i_half];
- l_prev [lid_j*l_i + lid_i+l_i_half] = (float)dog[(l-1)*imel+(j-1)*dim0+i-1+l_i_half];
-
- l_next [(lid_j+l_j_half)*l_i + lid_i] = (float)dog[(l+1)*imel+(j-1+l_j_half)*dim0+i-1];
- l_center[(lid_j+l_j_half)*l_i + lid_i] = (float)dog[(l )*imel+(j-1+l_j_half)*dim0+i-1];
- l_prev [(lid_j+l_j_half)*l_i + lid_i] = (float)dog[(l-1)*imel+(j-1+l_j_half)*dim0+i-1];
-
- l_next [(lid_j+l_j_half)*l_i + lid_i+l_i_half] = (float)dog[(l+1)*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
- l_center[(lid_j+l_j_half)*l_i + lid_i+l_i_half] = (float)dog[(l )*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
- l_prev [(lid_j+l_j_half)*l_i + lid_i+l_i_half] = (float)dog[(l-1)*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (i < dim0-IMG_BORDER && j < dim1-IMG_BORDER) {
- const int l_i_half = l_i/2;
- float p = l_center[y*l_i + x];
-
- if (fabs((float)p) > threshold &&
- ((p > 0 && p > LCPTR(y-1, x-1) && p > LCPTR(y-1, x) &&
- p > LCPTR(y-1, x+1) && p > LCPTR(y, x-1) && p > LCPTR(y, x+1) &&
- p > LCPTR(y+1, x-1) && p > LCPTR(y+1, x) && p > LCPTR(y+1, x+1) &&
- p > LPPTR(y-1, x-1) && p > LPPTR(y-1, x) && p > LPPTR(y-1, x+1) &&
- p > LPPTR(y, x-1) && p > LPPTR(y , x) && p > LPPTR(y, x+1) &&
- p > LPPTR(y+1, x-1) && p > LPPTR(y+1, x) && p > LPPTR(y+1, x+1) &&
- p > LNPTR(y-1, x-1) && p > LNPTR(y-1, x) && p > LNPTR(y-1, x+1) &&
- p > LNPTR(y, x-1) && p > LNPTR(y , x) && p > LNPTR(y, x+1) &&
- p > LNPTR(y+1, x-1) && p > LNPTR(y+1, x) && p > LNPTR(y+1, x+1)) ||
- (p < 0 && p < LCPTR(y-1, x-1) && p < LCPTR(y-1, x) &&
- p < LCPTR(y-1, x+1) && p < LCPTR(y, x-1) && p < LCPTR(y, x+1) &&
- p < LCPTR(y+1, x-1) && p < LCPTR(y+1, x) && p < LCPTR(y+1, x+1) &&
- p < LPPTR(y-1, x-1) && p < LPPTR(y-1, x) && p < LPPTR(y-1, x+1) &&
- p < LPPTR(y, x-1) && p < LPPTR(y , x) && p < LPPTR(y, x+1) &&
- p < LPPTR(y+1, x-1) && p < LPPTR(y+1, x) && p < LPPTR(y+1, x+1) &&
- p < LNPTR(y-1, x-1) && p < LNPTR(y-1, x) && p < LNPTR(y-1, x+1) &&
- p < LNPTR(y, x-1) && p < LNPTR(y , x) && p < LNPTR(y, x+1) &&
- p < LNPTR(y+1, x-1) && p < LNPTR(y+1, x) && p < LNPTR(y+1, x+1)))) {
-
- unsigned idx = atomic_inc(counter);
- if (idx < max_feat)
- {
- x_out[idx] = (float)j;
- y_out[idx] = (float)i;
- layer_out[idx] = l;
- }
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-}
-
-#undef LCPTR
-#undef LPPTR
-#undef LNPTR
-#define CPTR(Y, X) (center[(Y) * dim0 + (X)])
-#define PPTR(Y, X) (prev[(Y) * dim0 + (X)])
-#define NPTR(Y, X) (next[(Y) * dim0 + (X)])
-
-// Interpolates a scale-space extremum's location and scale to subpixel
-// accuracy to form an image feature. Rejects features with low contrast.
-// Based on Section 4 of Lowe's paper.
-__kernel void interpolateExtrema(
- __global float* x_out,
- __global float* y_out,
- __global unsigned* layer_out,
- __global float* response_out,
- __global float* size_out,
- __global unsigned* counter,
- __global const float* x_in,
- __global const float* y_in,
- __global const unsigned* layer_in,
- const unsigned extrema_feat,
- __global const T* dog_octave,
- KParam iDoG,
- const unsigned max_feat,
- const unsigned octave,
- const unsigned n_layers,
- const float contrast_thr,
- const float edge_thr,
- const float sigma,
- const float img_scale)
-{
- const unsigned f = get_global_id(0);
-
- if (f < extrema_feat)
- {
- const float first_deriv_scale = img_scale*0.5f;
- const float second_deriv_scale = img_scale;
- const float cross_deriv_scale = img_scale*0.25f;
-
- float xl = 0, xy = 0, xx = 0, contr = 0;
- int i = 0;
-
- unsigned x = x_in[f];
- unsigned y = y_in[f];
- unsigned layer = layer_in[f];
-
- const int dim0 = iDoG.dims[0];
- const int dim1 = iDoG.dims[1];
- const int imel = dim0 * dim1;
-
- __global const T* prev = dog_octave + (int)((layer-1)*imel);
- __global const T* center = dog_octave + (int)((layer )*imel);
- __global const T* next = dog_octave + (int)((layer+1)*imel);
-
- for(i = 0; i < MAX_INTERP_STEPS; i++) {
- float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
- (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
- (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale};
-
- float d2 = CPTR(x, y) * 2.f;
- float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
- float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
- float dss = (NPTR(x, y ) + PPTR(x, y ) - d2) * second_deriv_scale;
- float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
- CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
- float dxs = (NPTR(x+1, y) - NPTR(x-1, y) -
- PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale;
- float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) -
- PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale;
-
- float H[9] = {dxx, dxy, dxs,
- dxy, dyy, dys,
- dxs, dys, dss};
-
- float X[3];
- gaussianElimination(H, dD, X, 3);
-
- xl = -X[2];
- xy = -X[1];
- xx = -X[0];
-
- if(fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f)
- break;
-
- x += round(xx);
- y += round(xy);
- layer += round(xl);
-
- if(layer < 1 || layer > n_layers ||
- x < IMG_BORDER || x >= dim1 - IMG_BORDER ||
- y < IMG_BORDER || y >= dim0 - IMG_BORDER)
- return;
- }
-
- // ensure convergence of interpolation
- if (i >= MAX_INTERP_STEPS)
- return;
-
- float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
- (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
- (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale};
- float X[3] = {xx, xy, xl};
-
- float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
-
- contr = center[x*dim0+y]*img_scale + P * 0.5f;
- if(fabs(contr) < (contrast_thr / n_layers))
- return;
-
- // principal curvatures are computed using the trace and det of Hessian
- float d2 = CPTR(x, y) * 2.f;
- float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
- float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
- float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
- CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
-
- float tr = dxx + dyy;
- float det = dxx * dyy - dxy * dxy;
-
- // add FLT_EPSILON for double-precision compatibility
- if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
- return;
-
- unsigned ridx = atomic_inc(counter);
-
- if (ridx < max_feat)
- {
- x_out[ridx] = (x + xx) * (1 << octave);
- y_out[ridx] = (y + xy) * (1 << octave);
- layer_out[ridx] = layer;
- response_out[ridx] = fabs(contr);
- size_out[ridx] = sigma*pow(2.f, octave + (layer + xl) / n_layers);
- }
- }
-}
-
-#undef CPTR
-#undef PPTR
-#undef NPTR
-
-// Remove duplicate keypoints
-__kernel void removeDuplicates(
- __global float* x_out,
- __global float* y_out,
- __global unsigned* layer_out,
- __global float* response_out,
- __global float* size_out,
- __global unsigned* counter,
- __global const float* x_in,
- __global const float* y_in,
- __global const unsigned* layer_in,
- __global const float* response_in,
- __global const float* size_in,
- const unsigned total_feat)
-{
- const unsigned f = get_global_id(0);
-
- if (f < total_feat) {
- const float prec_fctr = 1e4f;
-
- bool cond = (f < total_feat-1)
- ? !(round(x_in[f]*prec_fctr) == round(x_in[f+1]*prec_fctr) &&
- round(y_in[f]*prec_fctr) == round(y_in[f+1]*prec_fctr) &&
- layer_in[f] == layer_in[f+1] &&
- round(response_in[f]*prec_fctr) == round(response_in[f+1]*prec_fctr) &&
- round(size_in[f]*prec_fctr) == round(size_in[f+1]*prec_fctr))
- : true;
-
- if (cond) {
- unsigned idx = atomic_inc(counter);
-
- x_out[idx] = x_in[f];
- y_out[idx] = y_in[f];
- layer_out[idx] = layer_in[f];
- response_out[idx] = response_in[f];
- size_out[idx] = size_in[f];
- }
- }
-
-}
-
-#define IPTR(Y, X) (img[(Y) * dim0 + X])
-
-// Computes a canonical orientation for each image feature in an array. Based
-// on Section 5 of Lowe's paper. This function adds features to the array when
-// there is more than one dominant orientation at a given feature location.
-__kernel void calcOrientation(
- __global float* x_out,
- __global float* y_out,
- __global unsigned* layer_out,
- __global float* response_out,
- __global float* size_out,
- __global float* ori_out,
- __global unsigned* counter,
- __global const float* x_in,
- __global const float* y_in,
- __global const unsigned* layer_in,
- __global const float* response_in,
- __global const float* size_in,
- const unsigned total_feat,
- __global const T* gauss_octave,
- KParam iGauss,
- const unsigned max_feat,
- const unsigned octave,
- const int double_input,
- __local float* l_mem)
-{
- const int lid_x = get_local_id(0);
- const int lid_y = get_local_id(1);
- const int lsz_x = get_local_size(0);
-
- const unsigned f = get_global_id(1);
-
- const int n = ORI_HIST_BINS;
-
- __local float* hist = l_mem;
- __local float* temphist = l_mem + n*8;
-
- // Initialize temporary histogram
- for (int i = lid_x; i < n; i += lsz_x) {
- hist[lid_y*n + i] = 0.f;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- float real_x, real_y, response, size;
- unsigned layer;
-
- if (f < total_feat) {
- // Load keypoint information
- real_x = x_in[f];
- real_y = y_in[f];
- layer = layer_in[f];
- response = response_in[f];
- size = size_in[f];
-
- const int pt_x = (int)round(real_x / (1 << octave));
- const int pt_y = (int)round(real_y / (1 << octave));
-
- // Calculate auxiliary parameters
- const float scl_octv = size*0.5f / (1 << octave);
- const int radius = (int)round(ORI_RADIUS * scl_octv);
- const float sigma = ORI_SIG_FCTR * scl_octv;
- const int len = (radius*2+1);
- const float exp_denom = 2.f * sigma * sigma;
-
- const int dim0 = iGauss.dims[0];
- const int dim1 = iGauss.dims[1];
-
- // Calculate layer offset
- const int layer_offset = layer * dim0 * dim1;
- __global const T* img = gauss_octave + layer_offset;
-
- // Calculate orientation histogram
- for (int l = lid_x; l < len*len; l += lsz_x) {
- int i = l / len - radius;
- int j = l % len - radius;
-
- int y = pt_y + i;
- int x = pt_x + j;
- if (y < 1 || y >= dim0 - 1 ||
- x < 1 || x >= dim1 - 1)
- continue;
-
- float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
- float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
- float mag = sqrt(dx*dx+dy*dy);
- float ori = atan2(dy,dx);
- float w = exp(-(i*i + j*j)/exp_denom);
-
- int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
- bin = bin < n ? bin : 0;
- bin = (bin < 0) ? 0 : (bin >= n) ? n-1 : bin;
-
- fatomic_add(&hist[lid_y*n+bin], w*mag);
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- for (int i = 0; i < SMOOTH_ORI_PASSES; i++) {
- for (int j = lid_x; j < n; j += lsz_x) {
- temphist[lid_y*n+j] = hist[lid_y*n+j];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- for (int j = lid_x; j < n; j += lsz_x) {
- float prev = (j == 0) ? temphist[lid_y*n+n-1] : temphist[lid_y*n+j-1];
- float next = (j+1 == n) ? temphist[lid_y*n] : temphist[lid_y*n+j+1];
- hist[lid_y*n+j] = 0.25f * prev + 0.5f * temphist[lid_y*n+j] + 0.25f * next;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- for (int i = lid_x; i < n; i += lsz_x)
- temphist[lid_y*n+i] = hist[lid_y*n+i];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (lid_x < 16)
- temphist[lid_y*n+lid_x] = fmax(hist[lid_y*n+lid_x], hist[lid_y*n+lid_x+16]);
- barrier(CLK_LOCAL_MEM_FENCE);
- if (lid_x < 8)
- temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], temphist[lid_y*n+lid_x+8]);
- barrier(CLK_LOCAL_MEM_FENCE);
- if (lid_x < 4) {
- temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], hist[lid_y*n+lid_x+32]);
- temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], temphist[lid_y*n+lid_x+4]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if (lid_x < 2)
- temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], temphist[lid_y*n+lid_x+2]);
- barrier(CLK_LOCAL_MEM_FENCE);
- if (lid_x < 1)
- temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], temphist[lid_y*n+lid_x+1]);
- barrier(CLK_LOCAL_MEM_FENCE);
- float omax = temphist[lid_y*n];
-
- if (f < total_feat) {
- float mag_thr = (float)(omax * ORI_PEAK_RATIO);
- int l, r;
- float bin;
- for (int j = lid_x; j < n; j+=lsz_x) {
- l = (j == 0) ? n - 1 : j - 1;
- r = (j + 1) % n;
- if (hist[lid_y*n+j] > hist[lid_y*n+l] &&
- hist[lid_y*n+j] > hist[lid_y*n+r] &&
- hist[lid_y*n+j] >= mag_thr) {
- unsigned idx = atomic_inc(counter);
-
- if (idx < max_feat) {
- float bin = j + 0.5f * (hist[lid_y*n+l] - hist[lid_y*n+r]) /
- (hist[lid_y*n+l] - 2.0f*hist[lid_y*n+j] + hist[lid_y*n+r]);
- bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
- float ori = 360.f - ((360.f/n) * bin);
-
- float new_real_x = real_x;
- float new_real_y = real_y;
- float new_size = size;
-
- if (double_input != 0) {
- float scale = 0.5f;
- new_real_x *= scale;
- new_real_y *= scale;
- new_size *= scale;
- }
-
- x_out[idx] = new_real_x;
- y_out[idx] = new_real_y;
- layer_out[idx] = layer;
- response_out[idx] = response;
- size_out[idx] = new_size;
- ori_out[idx] = ori;
- }
- }
- }
- }
-}
-
-// Computes feature descriptors for features in an array. Based on Section 6
-// of Lowe's paper.
-__kernel void computeDescriptor(
- __global float* desc_out,
- const unsigned desc_len,
- const unsigned histsz,
- __global const float* x_in,
- __global const float* y_in,
- __global const unsigned* layer_in,
- __global const float* response_in,
- __global const float* size_in,
- __global const float* ori_in,
- const unsigned total_feat,
- __global const T* gauss_octave,
- KParam iGauss,
- const int d,
- const int n,
- const float scale,
- const float sigma,
- const int n_layers,
- __local float* l_mem)
-{
- const int lid_x = get_local_id(0);
- const int lid_y = get_local_id(1);
- const int lsz_x = get_local_size(0);
-
- const int f = get_global_id(1);
-
- __local float* desc = l_mem;
- __local float* accum = l_mem + desc_len * histsz;
-
- const int histlen = d*d*n;
-
- for (int i = lid_x; i < histlen*histsz; i += lsz_x)
- desc[lid_y*histlen+i] = 0.f;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (f < total_feat) {
- const unsigned layer = layer_in[f];
- float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
- ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
- const float size = size_in[f];
- const int fx = round(x_in[f] * scale);
- const int fy = round(y_in[f] * scale);
-
- // Points img to correct Gaussian pyramid layer
- const int dim0 = iGauss.dims[0];
- const int dim1 = iGauss.dims[1];
- __global const T* img = gauss_octave + (layer * dim0 * dim1);
-
- float cos_t = cos(ori);
- float sin_t = sin(ori);
- float bins_per_rad = n / (PI_VAL * 2.f);
- float exp_denom = d * d * 0.5f;
- float hist_width = DESCR_SCL_FCTR * sigma * pow(2.f, layer/n_layers);
- int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
-
- int len = radius*2+1;
- const int hist_off = (lid_x % histsz) * desc_len;
-
- // Calculate orientation histogram
- for (int l = lid_x; l < len*len; l += lsz_x) {
- int i = l / len - radius;
- int j = l % len - radius;
-
- int y = fy + i;
- int x = fx + j;
-
- float x_rot = (j * cos_t - i * sin_t) / hist_width;
- float y_rot = (j * sin_t + i * cos_t) / hist_width;
- float xbin = x_rot + d/2 - 0.5f;
- float ybin = y_rot + d/2 - 0.5f;
-
- if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d &&
- y > 0 && y < dim0 - 1 && x > 0 && x < dim1 - 1) {
- float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
- float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
- float grad_mag = sqrt(dx*dx + dy*dy);
- float grad_ori = atan2(dy, dx) - ori;
- while (grad_ori < 0.0f)
- grad_ori += PI_VAL*2;
- while (grad_ori >= PI_VAL*2)
- grad_ori -= PI_VAL*2;
-
- float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
- float obin = grad_ori * bins_per_rad;
- float mag = grad_mag*w;
-
- int x0 = floor(xbin);
- int y0 = floor(ybin);
- int o0 = floor(obin);
- xbin -= x0;
- ybin -= y0;
- obin -= o0;
-
- for (int yl = 0; yl <= 1; yl++) {
- int yb = y0 + yl;
- if (yb >= 0 && yb < d) {
- float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
- for (int xl = 0; xl <= 1; xl++) {
- int xb = x0 + xl;
- if (xb >= 0 && xb < d) {
- float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
- for (int ol = 0; ol <= 1; ol++) {
- int ob = (o0 + ol) % n;
- float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
- fatomic_add(&desc[hist_off + lid_y*desc_len + (yb*d + xb)*n + ob], v_o);
- }
- }
- }
- }
- }
- }
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Combine histograms (reduces previous atomicAdd overhead)
- for (int l = lid_x; l < desc_len*4; l += lsz_x)
- desc[l] += desc[l+4*desc_len];
- barrier(CLK_LOCAL_MEM_FENCE);
- for (int l = lid_x; l < desc_len*2; l += lsz_x)
- desc[l ] += desc[l+2*desc_len];
- barrier(CLK_LOCAL_MEM_FENCE);
- for (int l = lid_x; l < desc_len; l += lsz_x)
- desc[l] += desc[l+desc_len];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
-
- for (int i = lid_x; i < d*d*n; i += lsz_x)
- desc[lid_y*desc_len+i] = min(desc[lid_y*desc_len+i], DESCR_MAG_THR);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
-
- if (f < total_feat) {
- // Calculate final descriptor values
- for (int k = lid_x; k < d*d*n; k += lsz_x)
- desc_out[f*desc_len+k] = round(min(255.f, desc[lid_y*desc_len+k] * INT_DESCR_FCTR));
- }
-}
-
-#undef IPTR
diff --git a/src/backend/opencl/kernel/sift_nonfree.hpp b/src/backend/opencl/kernel/sift_nonfree.hpp
deleted file mode 100644
index 9c36a3f..0000000
--- a/src/backend/opencl/kernel/sift_nonfree.hpp
+++ /dev/null
@@ -1,784 +0,0 @@
-/*******************************************************
- * Copyright (c) 2015, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-// The source code contained in this file is based on the original code by
-// Rob Hess. Please note that SIFT is an algorithm patented and protected
-// by US law, before using this code or any binary forms generated from it,
-// verify that you have permission to do so. The original license by Rob Hess
-// can be read below:
-//
-// Copyright (c) 2006-2012, Rob Hess <rob at iqengines.com>
-// All rights reserved.
-//
-// The following patent has been issued for methods embodied in this
-// software: "Method and apparatus for identifying scale invariant features
-// in an image and use of same for locating an object in an image," David
-// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
-// filed March 8, 1999. Asignee: The University of British Columbia. For
-// further details, contact David Lowe (lowe at cs.ubc.ca) or the
-// University-Industry Liaison Office of the University of British
-// Columbia.
-//
-// Note that restrictions imposed by this patent (and possibly others)
-// exist independently of and may be in conflict with the freedoms granted
-// in this license, which refers to copyright of the program, not patents
-// for any methods that it implements. Both copyright and patent law must
-// be obeyed to legally use and redistribute this program and it is not the
-// purpose of this license to induce you to infringe any patents or other
-// property right claims or to contest validity of any such claims. If you
-// redistribute or use the program, then this license merely protects you
-// from committing copyright infringement. It does not protect you from
-// committing patent infringement. So, before you do anything with this
-// program, make sure that you have permission to do so not merely in terms
-// of copyright, but also in terms of patent law.
-//
-// Please note that this license is not to be understood as a guarantee
-// either. If you use the program according to this license, but in
-// conflict with patent law, it does not mean that the licensor will refund
-// you for any losses that you incur if you are sued for your patent
-// infringement.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-// * Redistributions of source code must retain the above copyright and
-// patent notices, this list of conditions and the following
-// disclaimer.
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in
-// the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Oregon State University nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include <af/defines.h>
-#include <program.hpp>
-#include <dispatch.hpp>
-#include <err_opencl.hpp>
-#include <debug_opencl.hpp>
-
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-
-#include <boost/compute/core.hpp>
-#include <boost/compute/iterator/buffer_iterator.hpp>
-#include <boost/compute/algorithm/gather.hpp>
-#include <boost/compute/algorithm/iota.hpp>
-#include <boost/compute/algorithm/sort_by_key.hpp>
-
-#pragma GCC diagnostic pop
-
-#include <convolve_common.hpp>
-#include <kernel/convolve_separable.hpp>
-#include <kernel/fast.hpp>
-#include <kernel/resize.hpp>
-#include <kernel/sort_index.hpp>
-#include <kernel_headers/sift_nonfree.hpp>
-#include <memory.hpp>
-#include <vector>
-
-using cl::Buffer;
-using cl::Program;
-using cl::Kernel;
-using cl::EnqueueArgs;
-using cl::LocalSpaceArg;
-using cl::NDRange;
-using std::vector;
-
-namespace opencl
-{
-
-namespace kernel
-{
-
-static const int SIFT_THREADS = 256;
-static const int SIFT_THREADS_X = 32;
-static const int SIFT_THREADS_Y = 8;
-
-// assumed gaussian blur for input image
-static const float InitSigma = 0.5f;
-
-// width of border in which to ignore keypoints
-static const int ImgBorder = 5;
-
-// default width of descriptor histogram array
-static const int DescrWidth = 4;
-
-// default number of bins per histogram in descriptor array
-static const int DescrHistBins = 8;
-
-// default number of bins in histogram for orientation assignment
-static const int OriHistBins = 36;
-
-static const float PI_VAL = 3.14159265358979323846f;
-
-template<typename T>
-void gaussian1D(T* out, const int dim, double sigma=0.0)
-{
- if(!(sigma>0)) sigma = 0.25*dim;
-
- T sum = (T)0;
- for(int i=0;i<dim;i++)
- {
- int x = i-(dim-1)/2;
- T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
- out[i] = el;
- sum += el;
- }
-
- for(int k=0;k<dim;k++)
- out[k] /= sum;
-}
-
-template<typename T>
-Param gaussFilter(float sigma)
-{
- // Using 6-sigma rule
- unsigned gauss_len = std::min((unsigned)round(sigma * 6 + 1) | 1, 31u);
-
- T* h_gauss = new T[gauss_len];
- gaussian1D(h_gauss, gauss_len, sigma);
-
- Param gauss_filter;
- gauss_filter.info.offset = 0;
- gauss_filter.info.dims[0] = gauss_len;
- gauss_filter.info.strides[0] = 1;
-
- for (int k = 1; k < 4; k++) {
- gauss_filter.info.dims[k] = 1;
- gauss_filter.info.strides[k] = gauss_filter.info.dims[k-1] * gauss_filter.info.strides[k-1];
- }
-
- dim_t gauss_elem = gauss_filter.info.strides[3] * gauss_filter.info.dims[3];
- gauss_filter.data = bufferAlloc(gauss_elem * sizeof(T));
- getQueue().enqueueWriteBuffer(*gauss_filter.data, CL_TRUE, 0, gauss_elem * sizeof(T), h_gauss);
- getQueue().enqueueWriteBuffer(*gauss_filter.data, CL_TRUE, 0, gauss_elem * sizeof(T), h_gauss);
-
- delete[] h_gauss;
-
- return gauss_filter;
-}
-
-template<typename T, typename convAccT>
-void convSepFull(Param& dst, Param src, Param filter)
-{
- Param tmp;
- tmp.info.offset = 0;
- for (int k = 0; k < 4; k++) {
- tmp.info.dims[k] = src.info.dims[k];
- tmp.info.strides[k] = src.info.strides[k];
- }
-
- const dim_t src_el = src.info.dims[3] * src.info.strides[3];
- tmp.data = bufferAlloc(src_el * sizeof(T));
-
- convSep<T, convAccT, 0, false>(tmp, src, filter);
- convSep<T, convAccT, 1, false>(dst, tmp, filter);
-
- bufferFree(tmp.data);
-}
-
-template<typename T, typename convAccT>
-Param createInitialImage(
- Param img,
- const float init_sigma,
- const bool double_input)
-{
- Param init_img;
- init_img.info.offset = 0;
- init_img.info.dims[0] = (double_input) ? img.info.dims[0] * 2 : img.info.dims[0];
- init_img.info.dims[1] = (double_input) ? img.info.dims[1] * 2 : img.info.dims[1];
- init_img.info.strides[0] = 1;
- init_img.info.strides[1] = init_img.info.dims[0];
-
- for (int k = 2; k < 4; k++) {
- init_img.info.dims[k] = 1;
- init_img.info.strides[k] = init_img.info.dims[k-1] * init_img.info.strides[k-1];
- }
-
- dim_t init_img_el = init_img.info.strides[3] * init_img.info.dims[3];
- init_img.data = bufferAlloc(init_img_el * sizeof(T));
-
- float s = (double_input) ? sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4)
- : sqrt(init_sigma * init_sigma - InitSigma * InitSigma);
-
- const Param filter = gaussFilter<convAccT>(s);
-
- if (double_input)
- resize<T, AF_INTERP_BILINEAR>(init_img, img);
-
- convSepFull<T, convAccT>(init_img, (double_input) ? init_img : img, filter);
-
- bufferFree(filter.data);
-
- return init_img;
-}
-
-template<typename T, typename convAccT>
-std::vector<Param> buildGaussPyr(
- Param init_img,
- const unsigned n_octaves,
- const unsigned n_layers,
- const float init_sigma)
-{
- // Precompute Gaussian sigmas using the following formula:
- // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
- std::vector<float> sig_layers(n_layers + 3);
- sig_layers[0] = init_sigma;
- float k = std::pow(2.0f, 1.0f / n_layers);
- for (unsigned i = 1; i < n_layers + 3; i++) {
- float sig_prev = std::pow(k, i-1) * init_sigma;
- float sig_total = sig_prev * k;
- sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
- }
-
- // Gaussian Pyramid
- std::vector<Param> gauss_pyr(n_octaves);
- std::vector<Param> tmp_pyr(n_octaves * (n_layers+3));
- for (unsigned o = 0; o < n_octaves; o++) {
- gauss_pyr[o].info.offset = 0;
- gauss_pyr[o].info.dims[0] = (o == 0) ? init_img.info.dims[0] : gauss_pyr[o-1].info.dims[0] / 2;
- gauss_pyr[o].info.dims[1] = (o == 0) ? init_img.info.dims[1] : gauss_pyr[o-1].info.dims[1] / 2;
- gauss_pyr[o].info.dims[2] = n_layers+3;
- gauss_pyr[o].info.dims[3] = 1;
-
- gauss_pyr[o].info.strides[0] = 1;
- gauss_pyr[o].info.strides[1] = gauss_pyr[o].info.dims[0] * gauss_pyr[o].info.strides[0];
- gauss_pyr[o].info.strides[2] = gauss_pyr[o].info.dims[1] * gauss_pyr[o].info.strides[1];
- gauss_pyr[o].info.strides[3] = gauss_pyr[o].info.dims[2] * gauss_pyr[o].info.strides[2];
-
- const unsigned nel = gauss_pyr[o].info.dims[3] * gauss_pyr[o].info.strides[3];
- gauss_pyr[o].data = bufferAlloc(nel * sizeof(T));
-
- for (unsigned l = 0; l < n_layers+3; l++) {
- unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1;
- unsigned idx = o*(n_layers+3) + l;
-
- tmp_pyr[o].info.offset = 0;
- if (o == 0 && l == 0) {
- for (int k = 0; k < 4; k++) {
- tmp_pyr[idx].info.dims[k] = init_img.info.dims[k];
- tmp_pyr[idx].info.strides[k] = init_img.info.strides[k];
- }
- tmp_pyr[idx].data = init_img.data;
- }
- else if (l == 0) {
- tmp_pyr[idx].info.dims[0] = tmp_pyr[src_idx].info.dims[0] / 2;
- tmp_pyr[idx].info.dims[1] = tmp_pyr[src_idx].info.dims[1] / 2;
- tmp_pyr[idx].info.strides[0] = 1;
- tmp_pyr[idx].info.strides[1] = tmp_pyr[idx].info.dims[0];
-
- for (int k = 2; k < 4; k++) {
- tmp_pyr[idx].info.dims[k] = 1;
- tmp_pyr[idx].info.strides[k] = tmp_pyr[idx].info.dims[k-1] * tmp_pyr[idx].info.strides[k-1];
- }
-
- dim_t lvl_el = tmp_pyr[idx].info.strides[3] * tmp_pyr[idx].info.dims[3];
- tmp_pyr[idx].data = bufferAlloc(lvl_el * sizeof(T));
-
- resize<T, AF_INTERP_BILINEAR>(tmp_pyr[idx], tmp_pyr[src_idx]);
- }
- else {
- for (int k = 0; k < 4; k++) {
- tmp_pyr[idx].info.dims[k] = tmp_pyr[src_idx].info.dims[k];
- tmp_pyr[idx].info.strides[k] = tmp_pyr[src_idx].info.strides[k];
- }
- dim_t lvl_el = tmp_pyr[idx].info.strides[3] * tmp_pyr[idx].info.dims[3];
- tmp_pyr[idx].data = bufferAlloc(lvl_el * sizeof(T));
-
- Param filter = gaussFilter<convAccT>(sig_layers[l]);
-
- convSepFull<T, convAccT>(tmp_pyr[idx], tmp_pyr[src_idx], filter);
-
- bufferFree(filter.data);
- }
-
- const unsigned imel = tmp_pyr[idx].info.dims[3] * tmp_pyr[idx].info.strides[3];
- const unsigned offset = imel * l;
-
- getQueue().enqueueCopyBuffer(*tmp_pyr[idx].data, *gauss_pyr[o].data, 0, offset*sizeof(T), imel * sizeof(T));
- }
- }
-
- for (unsigned o = 0; o < n_octaves; o++) {
- for (unsigned l = 0; l < n_layers+3; l++) {
- unsigned idx = o*(n_layers+3) + l;
- bufferFree(tmp_pyr[idx].data);
- }
- }
-
- return gauss_pyr;
-}
-
-template<typename T>
-std::vector<Param> buildDoGPyr(
- std::vector<Param> gauss_pyr,
- const unsigned n_octaves,
- const unsigned n_layers,
- Kernel* suKernel)
-{
- // DoG Pyramid
- std::vector<Param> dog_pyr(n_octaves);
- for (unsigned o = 0; o < n_octaves; o++) {
- for (int k = 0; k < 4; k++) {
- dog_pyr[o].info.dims[k] = (k == 2) ? gauss_pyr[o].info.dims[k]-1 : gauss_pyr[o].info.dims[k];
- dog_pyr[o].info.strides[k] = (k == 0) ? 1 : dog_pyr[o].info.dims[k-1] * dog_pyr[o].info.strides[k-1];
- }
- dog_pyr[o].info.offset = 0;
-
- dog_pyr[o].data = bufferAlloc(dog_pyr[o].info.dims[3] * dog_pyr[o].info.strides[3] * sizeof(T));
-
- const unsigned nel = dog_pyr[o].info.dims[1] * dog_pyr[o].info.strides[1];
- const unsigned dog_layers = n_layers+2;
-
- const int blk_x = divup(nel, SIFT_THREADS);
- const NDRange local(SIFT_THREADS, 1);
- const NDRange global(blk_x * SIFT_THREADS, 1);
-
- auto suOp = make_kernel<Buffer, Buffer, unsigned, unsigned> (*suKernel);
-
- suOp(EnqueueArgs(getQueue(), global, local),
- *dog_pyr[o].data, *gauss_pyr[o].data, nel, dog_layers);
- CL_DEBUG_FINISH(getQueue());
- }
-
- return dog_pyr;
-}
-
-template <typename T>
-void update_permutation(compute::buffer_iterator<T>& keys, compute::vector<int>& permutation, compute::command_queue& queue)
-{
- // temporary storage for keys
- compute::vector<T> temp(permutation.size(), 0, queue);
-
- // permute the keys with the current reordering
- compute::gather(permutation.begin(), permutation.end(), keys, temp.begin(), queue);
-
- // stable_sort the permuted keys and update the permutation
- compute::sort_by_key(temp.begin(), temp.end(), permutation.begin(), queue);
-}
-
-template <typename T>
-void apply_permutation(compute::buffer_iterator<T>& keys, compute::vector<int>& permutation, compute::command_queue& queue)
-{
- // copy keys to temporary vector
- compute::vector<T> temp(keys, keys+permutation.size(), queue);
-
- // permute the keys
- compute::gather(permutation.begin(), permutation.end(), temp.begin(), keys, queue);
-}
-
-template<typename T, typename convAccT>
-void sift(unsigned* out_feat,
- unsigned* out_dlen,
- Param& x_out,
- Param& y_out,
- Param& score_out,
- Param& ori_out,
- Param& size_out,
- Param& desc_out,
- Param img,
- const unsigned n_layers,
- const float contrast_thr,
- const float edge_thr,
- const float init_sigma,
- const bool double_input,
- const float img_scale,
- const float feature_ratio)
-{
- try {
- static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
- static std::map<int, Program*> siftProgs;
- static std::map<int, Kernel*> suKernel;
- static std::map<int, Kernel*> deKernel;
- static std::map<int, Kernel*> ieKernel;
- static std::map<int, Kernel*> coKernel;
- static std::map<int, Kernel*> rdKernel;
- static std::map<int, Kernel*> cdKernel;
-
- int device = getActiveDeviceId();
-
- std::call_once( compileFlags[device], [device] () {
-
- std::ostringstream options;
- options << " -D T=" << dtype_traits<T>::getName();
-
- if (std::is_same<T, double>::value ||
- std::is_same<T, cdouble>::value) {
- options << " -D USE_DOUBLE";
- }
-
- cl::Program prog;
- buildProgram(prog, sift_nonfree_cl, sift_nonfree_cl_len, options.str());
- siftProgs[device] = new Program(prog);
-
- suKernel[device] = new Kernel(*siftProgs[device], "sub");
- deKernel[device] = new Kernel(*siftProgs[device], "detectExtrema");
- ieKernel[device] = new Kernel(*siftProgs[device], "interpolateExtrema");
- coKernel[device] = new Kernel(*siftProgs[device], "calcOrientation");
- rdKernel[device] = new Kernel(*siftProgs[device], "removeDuplicates");
- cdKernel[device] = new Kernel(*siftProgs[device], "computeDescriptor");
- });
-
- const unsigned min_dim = (double_input) ? min(img.info.dims[0]*2, img.info.dims[1]*2)
- : min(img.info.dims[0], img.info.dims[1]);
- const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
-
- Param init_img = createInitialImage<T, convAccT>(img, init_sigma, double_input);
-
- std::vector<Param> gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
-
- std::vector<Param> dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers, suKernel[device]);
-
- std::vector<cl::Buffer*> d_x_pyr(n_octaves, NULL);
- std::vector<cl::Buffer*> d_y_pyr(n_octaves, NULL);
- std::vector<cl::Buffer*> d_response_pyr(n_octaves, NULL);
- std::vector<cl::Buffer*> d_size_pyr(n_octaves, NULL);
- std::vector<cl::Buffer*> d_ori_pyr(n_octaves, NULL);
- std::vector<cl::Buffer*> d_desc_pyr(n_octaves, NULL);
- std::vector<unsigned> feat_pyr(n_octaves, 0);
- unsigned total_feat = 0;
-
- const unsigned d = DescrWidth;
- const unsigned n = DescrHistBins;
- const unsigned desc_len = d*d*n;
-
- cl::Buffer* d_count = bufferAlloc(sizeof(unsigned));
-
- for (unsigned o = 0; o < n_octaves; o++) {
- if (dog_pyr[o].info.dims[0]-2*ImgBorder < 1 ||
- dog_pyr[o].info.dims[1]-2*ImgBorder < 1)
- continue;
-
- const unsigned imel = dog_pyr[o].info.dims[0] * dog_pyr[o].info.dims[1];
- const unsigned max_feat = ceil(imel * feature_ratio);
-
- cl::Buffer* d_extrema_x = bufferAlloc(max_feat * sizeof(float));
- cl::Buffer* d_extrema_y = bufferAlloc(max_feat * sizeof(float));
- cl::Buffer* d_extrema_layer = bufferAlloc(max_feat * sizeof(unsigned));
-
- unsigned extrema_feat = 0;
- getQueue().enqueueWriteBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &extrema_feat);
-
- int dim0 = dog_pyr[o].info.dims[0];
- int dim1 = dog_pyr[o].info.dims[1];
-
- const int blk_x = divup(dim0-2*ImgBorder, SIFT_THREADS_X);
- const int blk_y = divup(dim1-2*ImgBorder, SIFT_THREADS_Y);
- const NDRange local(SIFT_THREADS_X, SIFT_THREADS_Y);
- const NDRange global(blk_x * SIFT_THREADS_X, blk_y * SIFT_THREADS_Y);
-
- float extrema_thr = 0.5f * contrast_thr / n_layers;
-
- auto deOp = make_kernel<Buffer, Buffer, Buffer, Buffer,
- Buffer, KParam, unsigned, float,
- LocalSpaceArg> (*deKernel[device]);
-
- deOp(EnqueueArgs(getQueue(), global, local),
- *d_extrema_x, *d_extrema_y, *d_extrema_layer, *d_count,
- *dog_pyr[o].data, dog_pyr[o].info, max_feat, extrema_thr,
- cl::Local((SIFT_THREADS_X+2) * (SIFT_THREADS_Y+2) * 3 * sizeof(float)));
- CL_DEBUG_FINISH(getQueue());
-
- getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &extrema_feat);
- extrema_feat = min(extrema_feat, max_feat);
-
- if (extrema_feat == 0) {
- bufferFree(d_extrema_x);
- bufferFree(d_extrema_y);
- bufferFree(d_extrema_layer);
-
- continue;
- }
-
- unsigned interp_feat = 0;
- getQueue().enqueueWriteBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &interp_feat);
-
- cl::Buffer* d_interp_x = bufferAlloc(extrema_feat * sizeof(float));
- cl::Buffer* d_interp_y = bufferAlloc(extrema_feat * sizeof(float));
- cl::Buffer* d_interp_layer = bufferAlloc(extrema_feat * sizeof(unsigned));
- cl::Buffer* d_interp_response = bufferAlloc(extrema_feat * sizeof(float));
- cl::Buffer* d_interp_size = bufferAlloc(extrema_feat * sizeof(float));
-
- const int blk_x_interp = divup(extrema_feat, SIFT_THREADS);
- const NDRange local_interp(SIFT_THREADS, 1);
- const NDRange global_interp(blk_x_interp * SIFT_THREADS, 1);
-
- auto ieOp = make_kernel<Buffer, Buffer, Buffer,
- Buffer, Buffer, Buffer,
- Buffer, Buffer, Buffer, unsigned,
- Buffer, KParam, unsigned, unsigned, unsigned,
- float, float, float, float> (*ieKernel[device]);
-
- ieOp(EnqueueArgs(getQueue(), global_interp, local_interp),
- *d_interp_x, *d_interp_y, *d_interp_layer,
- *d_interp_response, *d_interp_size, *d_count,
- *d_extrema_x, *d_extrema_y, *d_extrema_layer, extrema_feat,
- *dog_pyr[o].data, dog_pyr[o].info, extrema_feat, o, n_layers,
- contrast_thr, edge_thr, init_sigma, img_scale);
- CL_DEBUG_FINISH(getQueue());
-
- getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &interp_feat);
- interp_feat = min(interp_feat, extrema_feat);
-
- if (interp_feat == 0) {
- bufferFree(d_interp_x);
- bufferFree(d_interp_y);
- bufferFree(d_interp_layer);
- bufferFree(d_interp_response);
- bufferFree(d_interp_size);
-
- continue;
- }
-
- compute::command_queue queue(getQueue()());
- compute::context context(getContext()());
-
- compute::buffer buf_interp_x((*d_interp_x)(), true);
- compute::buffer buf_interp_y((*d_interp_y)(), true);
- compute::buffer buf_interp_layer((*d_interp_layer)(), true);
- compute::buffer buf_interp_response((*d_interp_response)(), true);
- compute::buffer buf_interp_size((*d_interp_size)(), true);
-
- compute::buffer_iterator<float> interp_x_begin = compute::make_buffer_iterator<float>(buf_interp_x, 0);
- compute::buffer_iterator<float> interp_y_begin = compute::make_buffer_iterator<float>(buf_interp_y, 0);
- compute::buffer_iterator<unsigned> interp_layer_begin = compute::make_buffer_iterator<unsigned>(buf_interp_layer, 0);
- compute::buffer_iterator<float> interp_response_begin = compute::make_buffer_iterator<float>(buf_interp_response, 0);
- compute::buffer_iterator<float> interp_size_begin = compute::make_buffer_iterator<float>(buf_interp_size, 0);
-
- compute::vector<int> permutation(interp_feat, context);
- compute::iota(permutation.begin(), permutation.end(), 0, queue);
-
- update_permutation<float>(interp_x_begin, permutation, queue);
- update_permutation<float>(interp_y_begin, permutation, queue);
- update_permutation<unsigned>(interp_layer_begin, permutation, queue);
- update_permutation<float>(interp_response_begin, permutation, queue);
- update_permutation<float>(interp_size_begin, permutation, queue);
-
- apply_permutation<float>(interp_x_begin, permutation, queue);
- apply_permutation<float>(interp_y_begin, permutation, queue);
- apply_permutation<unsigned>(interp_layer_begin, permutation, queue);
- apply_permutation<float>(interp_response_begin, permutation, queue);
- apply_permutation<float>(interp_size_begin, permutation, queue);
-
- unsigned nodup_feat = 0;
- getQueue().enqueueWriteBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &nodup_feat);
-
- cl::Buffer* d_nodup_x = bufferAlloc(interp_feat * sizeof(float));
- cl::Buffer* d_nodup_y = bufferAlloc(interp_feat * sizeof(float));
- cl::Buffer* d_nodup_layer = bufferAlloc(interp_feat * sizeof(unsigned));
- cl::Buffer* d_nodup_response = bufferAlloc(interp_feat * sizeof(float));
- cl::Buffer* d_nodup_size = bufferAlloc(interp_feat * sizeof(float));
-
- const int blk_x_nodup = divup(extrema_feat, SIFT_THREADS);
- const NDRange local_nodup(SIFT_THREADS, 1);
- const NDRange global_nodup(blk_x_nodup * SIFT_THREADS, 1);
-
- auto rdOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, Buffer,
- Buffer, Buffer, Buffer, Buffer, Buffer,
- unsigned> (*rdKernel[device]);
-
- rdOp(EnqueueArgs(getQueue(), global_nodup, local_nodup),
- *d_nodup_x, *d_nodup_y, *d_nodup_layer,
- *d_nodup_response, *d_nodup_size, *d_count,
- *d_interp_x, *d_interp_y, *d_interp_layer,
- *d_interp_response, *d_interp_size, interp_feat);
- CL_DEBUG_FINISH(getQueue());
-
- getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &nodup_feat);
- nodup_feat = min(nodup_feat, interp_feat);
-
- bufferFree(d_interp_x);
- bufferFree(d_interp_y);
- bufferFree(d_interp_layer);
- bufferFree(d_interp_response);
- bufferFree(d_interp_size);
-
- unsigned oriented_feat = 0;
- getQueue().enqueueWriteBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &oriented_feat);
- const unsigned max_oriented_feat = nodup_feat * 3;
-
- cl::Buffer* d_oriented_x = bufferAlloc(max_oriented_feat * sizeof(float));
- cl::Buffer* d_oriented_y = bufferAlloc(max_oriented_feat * sizeof(float));
- cl::Buffer* d_oriented_layer = bufferAlloc(max_oriented_feat * sizeof(unsigned));
- cl::Buffer* d_oriented_response = bufferAlloc(max_oriented_feat * sizeof(float));
- cl::Buffer* d_oriented_size = bufferAlloc(max_oriented_feat * sizeof(float));
- cl::Buffer* d_oriented_ori = bufferAlloc(max_oriented_feat * sizeof(float));
-
- const int blk_x_ori = divup(nodup_feat, SIFT_THREADS_Y);
- const NDRange local_ori(SIFT_THREADS_X, SIFT_THREADS_Y);
- const NDRange global_ori(SIFT_THREADS_X, blk_x_ori * SIFT_THREADS_Y);
-
- auto coOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer,
- Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
- Buffer, KParam, unsigned, unsigned, int,
- LocalSpaceArg> (*coKernel[device]);
-
- coOp(EnqueueArgs(getQueue(), global_ori, local_ori),
- *d_oriented_x, *d_oriented_y, *d_oriented_layer,
- *d_oriented_response, *d_oriented_size, *d_oriented_ori, *d_count,
- *d_nodup_x, *d_nodup_y, *d_nodup_layer,
- *d_nodup_response, *d_nodup_size, nodup_feat,
- *gauss_pyr[o].data, gauss_pyr[o].info, max_oriented_feat, o, (int)double_input,
- cl::Local(OriHistBins * SIFT_THREADS_Y * 2 * sizeof(float)));
- CL_DEBUG_FINISH(getQueue());
-
- getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &oriented_feat);
- oriented_feat = min(oriented_feat, max_oriented_feat);
-
- if (oriented_feat == 0) {
- bufferFree(d_oriented_x);
- bufferFree(d_oriented_y);
- bufferFree(d_oriented_layer);
- bufferFree(d_oriented_response);
- bufferFree(d_oriented_size);
-
- continue;
- }
-
- cl::Buffer* d_desc = bufferAlloc(oriented_feat * desc_len * sizeof(float));
-
- float scale = 1.f/(1 << o);
- if (double_input) scale *= 2.f;
-
- const int blk_x_desc = divup(oriented_feat, 1);
- const NDRange local_desc(SIFT_THREADS, 1);
- const NDRange global_desc(SIFT_THREADS, blk_x_desc);
-
- const unsigned histsz = 8;
-
- auto cdOp = make_kernel<Buffer, unsigned, unsigned,
- Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
- Buffer, KParam, int, int, float, float, int,
- LocalSpaceArg> (*cdKernel[device]);
-
- cdOp(EnqueueArgs(getQueue(), global_desc, local_desc),
- *d_desc, desc_len, histsz,
- *d_oriented_x, *d_oriented_y, *d_oriented_layer,
- *d_oriented_response, *d_oriented_size, *d_oriented_ori, oriented_feat,
- *gauss_pyr[o].data, gauss_pyr[o].info, d, n, scale, init_sigma, n_layers,
- cl::Local(desc_len * (histsz+1) * sizeof(float)));
- CL_DEBUG_FINISH(getQueue());
-
- total_feat += oriented_feat;
- feat_pyr[o] = oriented_feat;
-
- if (oriented_feat > 0) {
- d_x_pyr[o] = d_oriented_x;
- d_y_pyr[o] = d_oriented_y;
- d_response_pyr[o] = d_oriented_response;
- d_ori_pyr[o] = d_oriented_ori;
- d_size_pyr[o] = d_oriented_size;
- d_desc_pyr[o] = d_desc;
- }
- }
-
- bufferFree(d_count);
-
- // If no features are found, set found features to 0 and return
- if (total_feat == 0) {
- *out_feat = 0;
- return;
- }
-
- // Allocate output memory
- x_out.info.dims[0] = total_feat;
- x_out.info.strides[0] = 1;
- y_out.info.dims[0] = total_feat;
- y_out.info.strides[0] = 1;
- score_out.info.dims[0] = total_feat;
- score_out.info.strides[0] = 1;
- ori_out.info.dims[0] = total_feat;
- ori_out.info.strides[0] = 1;
- size_out.info.dims[0] = total_feat;
- size_out.info.strides[0] = 1;
-
- desc_out.info.dims[0] = desc_len;
- desc_out.info.strides[0] = 1;
- desc_out.info.dims[1] = total_feat;
- desc_out.info.strides[1] = desc_out.info.dims[0];
-
- for (int k = 1; k < 4; k++) {
- x_out.info.dims[k] = 1;
- x_out.info.strides[k] = x_out.info.dims[k - 1] * x_out.info.strides[k - 1];
- y_out.info.dims[k] = 1;
- y_out.info.strides[k] = y_out.info.dims[k - 1] * y_out.info.strides[k - 1];
- score_out.info.dims[k] = 1;
- score_out.info.strides[k] = score_out.info.dims[k - 1] * score_out.info.strides[k - 1];
- ori_out.info.dims[k] = 1;
- ori_out.info.strides[k] = ori_out.info.dims[k - 1] * ori_out.info.strides[k - 1];
- size_out.info.dims[k] = 1;
- size_out.info.strides[k] = size_out.info.dims[k - 1] * size_out.info.strides[k - 1];
- if (k > 1) {
- desc_out.info.dims[k] = 1;
- desc_out.info.strides[k] = desc_out.info.dims[k - 1] * desc_out.info.strides[k - 1];
- }
- }
-
- if (total_feat > 0) {
- size_t out_sz = total_feat * sizeof(float);
- x_out.data = bufferAlloc(out_sz);
- y_out.data = bufferAlloc(out_sz);
- score_out.data = bufferAlloc(out_sz);
- ori_out.data = bufferAlloc(out_sz);
- size_out.data = bufferAlloc(out_sz);
-
- size_t desc_sz = total_feat * desc_len * sizeof(unsigned);
- desc_out.data = bufferAlloc(desc_sz);
- }
-
- unsigned offset = 0;
- for (unsigned i = 0; i < n_octaves; i++) {
- if (feat_pyr[i] == 0)
- continue;
-
- getQueue().enqueueCopyBuffer(*d_x_pyr[i], *x_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
- getQueue().enqueueCopyBuffer(*d_y_pyr[i], *y_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
- getQueue().enqueueCopyBuffer(*d_response_pyr[i], *score_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
- getQueue().enqueueCopyBuffer(*d_ori_pyr[i], *ori_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
- getQueue().enqueueCopyBuffer(*d_size_pyr[i], *size_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
-
- getQueue().enqueueCopyBuffer(*d_desc_pyr[i], *desc_out.data, 0, offset*desc_len*sizeof(unsigned), feat_pyr[i] * desc_len * sizeof(unsigned));
-
- bufferFree(d_x_pyr[i]);
- bufferFree(d_y_pyr[i]);
- bufferFree(d_response_pyr[i]);
- bufferFree(d_ori_pyr[i]);
- bufferFree(d_size_pyr[i]);
- bufferFree(d_desc_pyr[i]);
-
- offset += feat_pyr[i];
- }
-
- // Sets number of output features and descriptor length
- *out_feat = total_feat;
- *out_dlen = desc_len;
- } catch (cl::Error err) {
- CL_TO_AF_ERROR(err);
- throw;
- }
-}
-
-} //namespace kernel
-
-} //namespace opencl
diff --git a/test/sift_nonfree.cpp b/test/sift_nonfree.cpp
deleted file mode 100644
index f5bd8a4..0000000
--- a/test/sift_nonfree.cpp
+++ /dev/null
@@ -1,336 +0,0 @@
-/*******************************************************
- * Copyright (c) 2015, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#include <gtest/gtest.h>
-#include <arrayfire.h>
-#include <af/dim4.hpp>
-#include <af/traits.hpp>
-#include <af/compatible.h>
-#include <string>
-#include <vector>
-#include <cmath>
-#include <testHelpers.hpp>
-#include <typeinfo>
-
-using std::string;
-using std::vector;
-using af::dim4;
-
-typedef struct
-{
- float f[5];
- unsigned d[128];
-} feat_desc_t;
-
-typedef struct
-{
- float f[5];
-} feat_t;
-
-typedef struct
-{
- float d[128];
-} desc_t;
-
-bool feat_cmp(feat_desc_t i, feat_desc_t j)
-{
- for (int k = 0; k < 5; k++)
- if (round(i.f[k]*1e1f) != round(j.f[k]*1e1f))
- return (round(i.f[k]*1e1f) < round(j.f[k]*1e1f));
-
- return true;
-}
-
-void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat)
-{
- feat.resize(nfeat);
- for (size_t i = 0; i < feat.size(); i++) {
- feat[i].f[0] = x[i];
- feat[i].f[1] = y[i];
- feat[i].f[2] = score[i];
- feat[i].f[3] = ori[i];
- feat[i].f[4] = size[i];
- for (unsigned j = 0; j < 128; j++)
- feat[i].d[j] = desc[i * 128 + j];
- }
-}
-
-void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, vector<vector<float> >& desc, unsigned nfeat)
-{
- feat.resize(nfeat);
- for (size_t i = 0; i < feat.size(); i++) {
- feat[i].f[0] = x[i];
- feat[i].f[1] = y[i];
- feat[i].f[2] = score[i];
- feat[i].f[3] = ori[i];
- feat[i].f[4] = size[i];
- for (unsigned j = 0; j < 128; j++)
- feat[i].d[j] = desc[i][j];
- }
-}
-
-void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat)
-{
- feat.resize(nfeat);
- for (unsigned i = 0; i < feat.size(); i++) {
- feat[i].f[0] = x[i];
- feat[i].f[1] = y[i];
- feat[i].f[2] = score[i];
- feat[i].f[3] = ori[i];
- feat[i].f[4] = size[i];
- }
-}
-
-void split_feat_desc(vector<feat_desc_t>& fd, vector<feat_t>& f, vector<desc_t>& d)
-{
- f.resize(fd.size());
- d.resize(fd.size());
- for (size_t i = 0; i < fd.size(); i++) {
- f[i].f[0] = fd[i].f[0];
- f[i].f[1] = fd[i].f[1];
- f[i].f[2] = fd[i].f[2];
- f[i].f[3] = fd[i].f[3];
- f[i].f[4] = fd[i].f[4];
- for (unsigned j = 0; j < 128; j++)
- d[i].d[j] = fd[i].d[j];
- }
-}
-
-unsigned popcount(unsigned x)
-{
- x = x - ((x >> 1) & 0x55555555);
- x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
- x = (x + (x >> 4)) & 0x0F0F0F0F;
- x = x + (x >> 8);
- x = x + (x >> 16);
- return x & 0x0000003F;
-}
-
-bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f)
-{
- bool ret = true;
- float sum = 0.0f;
-
- for (dim_t i = 0; i < ndesc; i++) {
- sum = 0.0f;
- for (dim_t l = 0; l < desc_len; l++) {
- dim_t idx = i * desc_len + l;
- float x = (cpu[idx] - gpu[idx]);
- sum += x*x;
- if (abs(x) > (float)unit_thr) {
- ret = false;
- std::cout<<std::endl<<"@compareEuclidean: unit mismatch."<<std::endl;
- std::cout<<"(cpu,gpu,cpu-gpu)["<<i<<","<<l<<"] : {"<<cpu[idx]<<","<<gpu[idx]<<","<<cpu[idx]-gpu[idx]<<"}"<<std::endl;
- std::cout<<std::endl;
- break;
- }
- }
- if (sqrt(sum) > euc_thr) {
- ret = false;
- std::cout<<std::endl<<"@compareEuclidean: distance mismatch."<<std::endl;
- std::cout<<"Euclidean distance: "<<sqrt(sum)<<std::endl;
- }
- if (ret == false)
- return ret;
- }
-
- return ret;
-}
-
-template<typename T>
-class SIFT : public ::testing::Test
-{
- public:
- virtual void SetUp() {}
-};
-
-typedef ::testing::Types<float, double> TestTypes;
-
-TYPED_TEST_CASE(SIFT, TestTypes);
-
-template<typename T>
-void siftTest(string pTestFile)
-{
-#ifdef AF_BUILD_SIFT
- if (noDoubleTests<T>()) return;
-
- vector<dim4> inDims;
- vector<string> inFiles;
- vector<vector<float> > goldFeat;
- vector<vector<float> > goldDesc;
-
- readImageFeaturesDescriptors<float>(pTestFile, inDims, inFiles, goldFeat, goldDesc);
-
- size_t testCount = inDims.size();
-
- for (size_t testId=0; testId<testCount; ++testId) {
- af_array inArray_f32 = 0;
- af_array inArray = 0;
- af_array desc = 0;
- af_features feat;
-
- inFiles[testId].insert(0,string(TEST_DIR"/sift/"));
-
- ASSERT_EQ(AF_SUCCESS, af_load_image(&inArray_f32, inFiles[testId].c_str(), false));
- ASSERT_EQ(AF_SUCCESS, conv_image<T>(&inArray, inArray_f32));
-
- ASSERT_EQ(AF_SUCCESS, af_sift(&feat, &desc, inArray, 3, 0.04f, 10.0f, 1.6f, true, 1.f/256.f, 0.05f));
-
- dim_t n = 0;
- af_array x, y, score, orientation, size;
-
- ASSERT_EQ(AF_SUCCESS, af_get_features_num(&n, feat));
- ASSERT_EQ(AF_SUCCESS, af_get_features_xpos(&x, feat));
- ASSERT_EQ(AF_SUCCESS, af_get_features_ypos(&y, feat));
- ASSERT_EQ(AF_SUCCESS, af_get_features_score(&score, feat));
- ASSERT_EQ(AF_SUCCESS, af_get_features_orientation(&orientation, feat));
- ASSERT_EQ(AF_SUCCESS, af_get_features_size(&size, feat));
-
- float * outX = new float[n];
- float * outY = new float[n];
- float * outScore = new float[n];
- float * outOrientation = new float[n];
- float * outSize = new float[n];
- dim_t descSize;
- dim_t descDims[4];
- ASSERT_EQ(AF_SUCCESS, af_get_elements(&descSize, desc));
- ASSERT_EQ(AF_SUCCESS, af_get_dims(&descDims[0], &descDims[1], &descDims[2], &descDims[3], desc));
- float * outDesc = new float[descSize];
- ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outX, x));
- ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outY, y));
- ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outScore, score));
- ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outOrientation, orientation));
- ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outSize, size));
- ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outDesc, desc));
-
- vector<feat_desc_t> out_feat_desc;
- array_to_feat_desc(out_feat_desc, outX, outY, outScore, outOrientation, outSize, outDesc, n);
-
- vector<feat_desc_t> gold_feat_desc;
- array_to_feat_desc(gold_feat_desc, &goldFeat[0].front(), &goldFeat[1].front(), &goldFeat[2].front(), &goldFeat[3].front(), &goldFeat[4].front(), goldDesc, goldFeat[0].size());
-
- std::stable_sort(out_feat_desc.begin(), out_feat_desc.end(), feat_cmp);
- std::stable_sort(gold_feat_desc.begin(), gold_feat_desc.end(), feat_cmp);
-
- vector<feat_t> out_feat;
- vector<desc_t> v_out_desc;
- vector<feat_t> gold_feat;
- vector<desc_t> v_gold_desc;
-
- split_feat_desc(out_feat_desc, out_feat, v_out_desc);
- split_feat_desc(gold_feat_desc, gold_feat, v_gold_desc);
-
- for (int elIter = 0; elIter < (int)n; elIter++) {
- ASSERT_LE(fabs(out_feat[elIter].f[0] - gold_feat[elIter].f[0]), 1e-3) << "at: " << elIter << std::endl;
- ASSERT_LE(fabs(out_feat[elIter].f[1] - gold_feat[elIter].f[1]), 1e-3) << "at: " << elIter << std::endl;
- ASSERT_LE(fabs(out_feat[elIter].f[2] - gold_feat[elIter].f[2]), 1e-3) << "at: " << elIter << std::endl;
- ASSERT_LE(fabs(out_feat[elIter].f[3] - gold_feat[elIter].f[3]), 0.5f) << "at: " << elIter << std::endl;
- ASSERT_LE(fabs(out_feat[elIter].f[4] - gold_feat[elIter].f[4]), 1e-3) << "at: " << elIter << std::endl;
- }
-
- EXPECT_TRUE(compareEuclidean(descDims[0], descDims[1], (float*)&v_out_desc[0], (float*)&v_gold_desc[0], 1.f, 2.f));
-
- ASSERT_EQ(AF_SUCCESS, af_release_array(inArray));
- ASSERT_EQ(AF_SUCCESS, af_release_array(inArray_f32));
-
- ASSERT_EQ(AF_SUCCESS, af_release_array(x));
- ASSERT_EQ(AF_SUCCESS, af_release_array(y));
- ASSERT_EQ(AF_SUCCESS, af_release_array(score));
- ASSERT_EQ(AF_SUCCESS, af_release_array(orientation));
- ASSERT_EQ(AF_SUCCESS, af_release_array(size));
- ASSERT_EQ(AF_SUCCESS, af_release_array(desc));
-
- delete[] outX;
- delete[] outY;
- delete[] outScore;
- delete[] outOrientation;
- delete[] outSize;
- delete[] outDesc;
- }
-#endif
-}
-
-#define SIFT_INIT(desc, image) \
- TYPED_TEST(SIFT, desc) \
- { \
- siftTest<TypeParam>(string(TEST_DIR"/sift/"#image".test")); \
- }
-
- SIFT_INIT(man, man);
-
-///////////////////////////////////// CPP ////////////////////////////////
-//
-TEST(SIFT, CPP)
-{
-#ifdef AF_BUILD_SIFT
- if (noDoubleTests<float>()) return;
-
- vector<dim4> inDims;
- vector<string> inFiles;
- vector<vector<float> > goldFeat;
- vector<vector<float> > goldDesc;
-
- readImageFeaturesDescriptors<float>(string(TEST_DIR"/sift/man.test"), inDims, inFiles, goldFeat, goldDesc);
- inFiles[0].insert(0,string(TEST_DIR"/sift/"));
-
- af::array in = af::loadImage(inFiles[0].c_str(), false);
-
- af::features feat;
- af::array desc;
- af::sift(feat, desc, in, 3, 0.04f, 10.0f, 1.6f, true, 1.f/256.f, 0.05f);
-
- float * outX = new float[feat.getNumFeatures()];
- float * outY = new float[feat.getNumFeatures()];
- float * outScore = new float[feat.getNumFeatures()];
- float * outOrientation = new float[feat.getNumFeatures()];
- float * outSize = new float[feat.getNumFeatures()];
- float * outDesc = new float[desc.elements()];
- af::dim4 descDims = desc.dims();
- feat.getX().host(outX);
- feat.getY().host(outY);
- feat.getScore().host(outScore);
- feat.getOrientation().host(outOrientation);
- feat.getSize().host(outSize);
- desc.host(outDesc);
-
- vector<feat_desc_t> out_feat_desc;
- array_to_feat_desc(out_feat_desc, outX, outY, outScore, outOrientation, outSize, outDesc, feat.getNumFeatures());
-
- vector<feat_desc_t> gold_feat_desc;
- array_to_feat_desc(gold_feat_desc, &goldFeat[0].front(), &goldFeat[1].front(), &goldFeat[2].front(), &goldFeat[3].front(), &goldFeat[4].front(), goldDesc, goldFeat[0].size());
-
- std::stable_sort(out_feat_desc.begin(), out_feat_desc.end(), feat_cmp);
- std::stable_sort(gold_feat_desc.begin(), gold_feat_desc.end(), feat_cmp);
-
- vector<feat_t> out_feat;
- vector<desc_t> v_out_desc;
- vector<feat_t> gold_feat;
- vector<desc_t> v_gold_desc;
-
- split_feat_desc(out_feat_desc, out_feat, v_out_desc);
- split_feat_desc(gold_feat_desc, gold_feat, v_gold_desc);
-
- for (int elIter = 0; elIter < (int)feat.getNumFeatures(); elIter++) {
- ASSERT_LE(fabs(out_feat[elIter].f[0] - gold_feat[elIter].f[0]), 1e-3) << "at: " << elIter << std::endl;
- ASSERT_LE(fabs(out_feat[elIter].f[1] - gold_feat[elIter].f[1]), 1e-3) << "at: " << elIter << std::endl;
- ASSERT_LE(fabs(out_feat[elIter].f[2] - gold_feat[elIter].f[2]), 1e-3) << "at: " << elIter << std::endl;
- ASSERT_LE(fabs(out_feat[elIter].f[3] - gold_feat[elIter].f[3]), 0.5f) << "at: " << elIter << std::endl;
- ASSERT_LE(fabs(out_feat[elIter].f[4] - gold_feat[elIter].f[4]), 1e-3) << "at: " << elIter << std::endl;
- }
-
- EXPECT_TRUE(compareEuclidean(descDims[0], descDims[1], (float*)&v_out_desc[0], (float*)&v_gold_desc[0], 1.f, 2.f));
-
- delete[] outX;
- delete[] outY;
- delete[] outScore;
- delete[] outOrientation;
- delete[] outSize;
- delete[] outDesc;
-#endif
-}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list