[arrayfire] 395/408: dfsg clean

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:38 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit 54749215e4def3bce369bc09ab6cdddb6a3828d5
Author: Ghislain Antony Vaillant <ghisvail at gmail.com>
Date:   Tue Sep 15 01:39:02 2015 +0100

    dfsg clean
    
    find -name "*nonfree*" -type f -delete
---
 src/backend/cpu/sift_nonfree.hpp           | 1033 ---------------------
 src/backend/cuda/kernel/sift_nonfree.hpp   | 1370 ----------------------------
 src/backend/opencl/kernel/sift_nonfree.cl  |  806 ----------------
 src/backend/opencl/kernel/sift_nonfree.hpp |  784 ----------------
 test/sift_nonfree.cpp                      |  336 -------
 5 files changed, 4329 deletions(-)

diff --git a/src/backend/cpu/sift_nonfree.hpp b/src/backend/cpu/sift_nonfree.hpp
deleted file mode 100644
index b331e4c..0000000
--- a/src/backend/cpu/sift_nonfree.hpp
+++ /dev/null
@@ -1,1033 +0,0 @@
-/*******************************************************
- * Copyright (c) 2015, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-// The source code contained in this file is based on the original code by
-// Rob Hess. Please note that SIFT is an algorithm patented and protected
-// by US law, before using this code or any binary forms generated from it,
-// verify that you have permission to do so. The original license by Rob Hess
-// can be read below:
-//
-// Copyright (c) 2006-2012, Rob Hess <rob at iqengines.com>
-// All rights reserved.
-//
-// The following patent has been issued for methods embodied in this
-// software: "Method and apparatus for identifying scale invariant features
-// in an image and use of same for locating an object in an image," David
-// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
-// filed March 8, 1999. Asignee: The University of British Columbia. For
-// further details, contact David Lowe (lowe at cs.ubc.ca) or the
-// University-Industry Liaison Office of the University of British
-// Columbia.
-//
-// Note that restrictions imposed by this patent (and possibly others)
-// exist independently of and may be in conflict with the freedoms granted
-// in this license, which refers to copyright of the program, not patents
-// for any methods that it implements.  Both copyright and patent law must
-// be obeyed to legally use and redistribute this program and it is not the
-// purpose of this license to induce you to infringe any patents or other
-// property right claims or to contest validity of any such claims.  If you
-// redistribute or use the program, then this license merely protects you
-// from committing copyright infringement.  It does not protect you from
-// committing patent infringement.  So, before you do anything with this
-// program, make sure that you have permission to do so not merely in terms
-// of copyright, but also in terms of patent law.
-//
-// Please note that this license is not to be understood as a guarantee
-// either.  If you use the program according to this license, but in
-// conflict with patent law, it does not mean that the licensor will refund
-// you for any losses that you incur if you are sued for your patent
-// infringement.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//     * Redistributions of source code must retain the above copyright and
-//       patent notices, this list of conditions and the following
-//       disclaimer.
-//     * Redistributions in binary form must reproduce the above copyright
-//       notice, this list of conditions and the following disclaimer in
-//       the documentation and/or other materials provided with the
-//       distribution.
-//     * Neither the name of Oregon State University nor the names of its
-//       contributors may be used to endorse or promote products derived
-//       from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-using af::dim4;
-
-namespace cpu
-{
-
-    static const float PI_VAL = 3.14159265358979323846f;
-
-// default width of descriptor histogram array
-    static const int DescrWidth = 4;
-
-// default number of bins per histogram in descriptor array
-    static const int DescrHistBins = 8;
-
-// assumed gaussian blur for input image
-    static const float InitSigma = 0.5f;
-
-// width of border in which to ignore keypoints
-    static const int ImgBorder = 5;
-
-// maximum steps of keypoint interpolation before failure
-    static const int MaxInterpSteps = 5;
-
-// default number of bins in histogram for orientation assignment
-    static const int OriHistBins = 36;
-
-// determines gaussian sigma for orientation assignment
-    static const float OriSigFctr = 1.5f;
-
-// determines the radius of the region used in orientation assignment
-    static const float OriRadius = 3.0f * OriSigFctr;
-
-// number of passes of orientation histogram smoothing
-    static const int SmoothOriPasses = 2;
-
-// orientation magnitude relative to max that results in new feature
-    static const float OriPeakRatio = 0.8f;
-
-// determines the size of a single descriptor orientation histogram
-    static const float DescrSclFctr = 3.f;
-
-// threshold on magnitude of elements of descriptor vector
-    static const float DescrMagThr = 0.2f;
-
-// factor used to convert floating-point descriptor to unsigned char
-    static const float IntDescrFctr = 512.f;
-
-    typedef struct
-    {
-        float    f[4];
-        unsigned l;
-    } feat_t;
-
-    bool feat_cmp(feat_t i, feat_t j)
-    {
-        for (int k = 0; k < 4; k++)
-            if (i.f[k] != j.f[k])
-                return (i.f[k] < j.f[k]);
-        if (i.l != j.l)
-            return (i.l < j.l);
-
-        return true;
-    }
-
-    void array_to_feat(std::vector<feat_t>& feat, float *x, float *y, unsigned *layer, float *resp, float *size, unsigned nfeat)
-    {
-        feat.resize(nfeat);
-        for (unsigned i = 0; i < feat.size(); i++) {
-            feat[i].f[0] = x[i];
-            feat[i].f[1] = y[i];
-            feat[i].f[2] = resp[i];
-            feat[i].f[3] = size[i];
-            feat[i].l    = layer[i];
-        }
-    }
-
-    template<typename T>
-    void gaussian1D(T* out, const int dim, double sigma=0.0)
-    {
-        if(!(sigma>0)) sigma = 0.25*dim;
-
-        T sum = (T)0;
-        for(int i=0;i<dim;i++)
-        {
-            int x = i-(dim-1)/2;
-            T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
-            out[i] = el;
-            sum   += el;
-        }
-
-        for(int k=0;k<dim;k++)
-            out[k] /= sum;
-    }
-
-    template<typename T>
-    Array<T> gauss_filter(float sigma)
-    {
-        // Using 6-sigma rule
-        unsigned gauss_len = (unsigned)round(sigma * 6 + 1) | 1;
-
-        Array<T> filter = createEmptyArray<T>(gauss_len);
-        gaussian1D((T*)getDevicePtr(filter), gauss_len, sigma);
-
-        return filter;
-    }
-
-    template<int N>
-    void gaussianElimination(float* A, float* b, float* x)
-    {
-        // forward elimination
-        for (int i = 0; i < N-1; i++) {
-            for (int j = i+1; j < N; j++) {
-                float s = A[j*N+i] / A[i*N+i];
-
-                for (int k = i; k < N; k++)
-                    A[j*N+k] -= s * A[i*N+k];
-
-                b[j] -= s * b[i];
-            }
-        }
-
-        for (int i = 0; i < N; i++)
-            x[i] = 0;
-
-        // backward substitution
-        float sum = 0;
-        for (int i = 0; i <= N-2; i++) {
-            sum = b[i];
-            for (int j = i+1; j < N; j++)
-                sum -= A[i*N+j] * x[j];
-            x[i] = sum / A[i*N+i];
-        }
-    }
-
-    template<typename T>
-    void sub(
-        Array<T>& out,
-        const Array<T>& in1,
-        const Array<T>& in2)
-    {
-        size_t nel = in1.elements();
-        T* out_ptr = out.get();
-        const T* in1_ptr = in1.get();
-        const T* in2_ptr = in2.get();
-
-        for (size_t i = 0; i < nel; i++) {
-            out_ptr[i] = in1_ptr[i] - in2_ptr[i];
-        }
-    }
-
-#define CPTR(Y, X) (center_ptr[(Y) * idims[0] + (X)])
-#define PPTR(Y, X) (prev_ptr[(Y) * idims[0] + (X)])
-#define NPTR(Y, X) (next_ptr[(Y) * idims[0] + (X)])
-
-// Determines whether a pixel is a scale-space extremum by comparing it to its
-// 3x3x3 pixel neighborhood.
-    template<typename T>
-    void detectExtrema(
-        float* x_out,
-        float* y_out,
-        unsigned* layer_out,
-        unsigned* counter,
-        const Array<T>& prev,
-        const Array<T>& center,
-        const Array<T>& next,
-        const unsigned layer,
-        const unsigned max_feat,
-        const float threshold)
-    {
-        const af::dim4 idims = center.dims();
-        const T* prev_ptr    = prev.get();
-        const T* center_ptr  = center.get();
-        const T* next_ptr    = next.get();
-
-        for (int y = ImgBorder; y < idims[1]-ImgBorder; y++) {
-            for (int x = ImgBorder; x < idims[0]-ImgBorder; x++) {
-                float p = center_ptr[y*idims[0] + x];
-
-                // Find extrema
-                if (abs((float)p) > threshold &&
-                    ((p > 0 && p > CPTR(y-1, x-1) && p > CPTR(y-1, x) &&
-                      p > CPTR(y-1, x+1) && p > CPTR(y, x-1) && p > CPTR(y,   x+1)  &&
-                      p > CPTR(y+1, x-1) && p > CPTR(y+1, x) && p > CPTR(y+1, x+1)  &&
-                      p > PPTR(y-1, x-1) && p > PPTR(y-1, x) && p > PPTR(y-1, x+1)  &&
-                      p > PPTR(y,   x-1) && p > PPTR(y  , x) && p > PPTR(y,   x+1)  &&
-                      p > PPTR(y+1, x-1) && p > PPTR(y+1, x) && p > PPTR(y+1, x+1)  &&
-                      p > NPTR(y-1, x-1) && p > NPTR(y-1, x) && p > NPTR(y-1, x+1)  &&
-                      p > NPTR(y,   x-1) && p > NPTR(y  , x) && p > NPTR(y,   x+1)  &&
-                      p > NPTR(y+1, x-1) && p > NPTR(y+1, x) && p > NPTR(y+1, x+1)) ||
-                     (p < 0 && p < CPTR(y-1, x-1) && p < CPTR(y-1, x) &&
-                      p < CPTR(y-1, x+1) && p < CPTR(y, x-1) && p < CPTR(y,   x+1)  &&
-                      p < CPTR(y+1, x-1) && p < CPTR(y+1, x) && p < CPTR(y+1, x+1)  &&
-                      p < PPTR(y-1, x-1) && p < PPTR(y-1, x) && p < PPTR(y-1, x+1)  &&
-                      p < PPTR(y,   x-1) && p < PPTR(y  , x) && p < PPTR(y,   x+1)  &&
-                      p < PPTR(y+1, x-1) && p < PPTR(y+1, x) && p < PPTR(y+1, x+1)  &&
-                      p < NPTR(y-1, x-1) && p < NPTR(y-1, x) && p < NPTR(y-1, x+1)  &&
-                      p < NPTR(y,   x-1) && p < NPTR(y  , x) && p < NPTR(y,   x+1)  &&
-                      p < NPTR(y+1, x-1) && p < NPTR(y+1, x) && p < NPTR(y+1, x+1)))) {
-
-                    if (*counter < max_feat)
-                    {
-                        x_out[*counter] = (float)y;
-                        y_out[*counter] = (float)x;
-                        layer_out[*counter] = layer;
-                        (*counter)++;
-                    }
-                }
-            }
-        }
-    }
-
-// Interpolates a scale-space extremum's location and scale to subpixel
-// accuracy to form an image feature. Rejects features with low contrast.
-// Based on Section 4 of Lowe's paper.
-    template<typename T>
-    void interpolateExtrema(
-        float* x_out,
-        float* y_out,
-        unsigned* layer_out,
-        float* response_out,
-        float* size_out,
-        unsigned* counter,
-        const float* x_in,
-        const float* y_in,
-        const unsigned* layer_in,
-        const unsigned extrema_feat,
-        std::vector< Array<T> >& dog_pyr,
-        const unsigned max_feat,
-        const unsigned octave,
-        const unsigned n_layers,
-        const float contrast_thr,
-        const float edge_thr,
-        const float sigma,
-        const float img_scale)
-    {
-        for (int f = 0; f < (int)extrema_feat; f++) {
-            const float first_deriv_scale = img_scale*0.5f;
-            const float second_deriv_scale = img_scale;
-            const float cross_deriv_scale = img_scale*0.25f;
-
-            float xl = 0, xy = 0, xx = 0, contr = 0;
-            int i = 0;
-
-            unsigned x = x_in[f];
-            unsigned y = y_in[f];
-            unsigned layer = layer_in[f];
-
-            const T* prev_ptr   = dog_pyr[octave*(n_layers+2) + layer-1].get();
-            const T* center_ptr = dog_pyr[octave*(n_layers+2) + layer].get();
-            const T* next_ptr   = dog_pyr[octave*(n_layers+2) + layer+1].get();
-
-            af::dim4 idims = dog_pyr[octave*(n_layers+2)].dims();
-
-            bool converges = true;
-
-            for (i = 0; i < MaxInterpSteps; i++) {
-                float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
-                               (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
-                               (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
-
-                float d2  = CPTR(x, y) * 2.f;
-                float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
-                float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
-                float dss = (NPTR(x, y  ) + PPTR(x, y  ) - d2) * second_deriv_scale;
-                float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
-                             CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
-                float dxs = (NPTR(x+1, y) - NPTR(x-1, y) -
-                             PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale;
-                float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) -
-                             PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale;
-
-                float H[9] = {dxx, dxy, dxs,
-                              dxy, dyy, dys,
-                              dxs, dys, dss};
-
-                float X[3];
-                gaussianElimination<3>(H, dD, X);
-
-                xl = -X[2];
-                xy = -X[1];
-                xx = -X[0];
-
-                if (fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f)
-                    break;
-
-                x += round(xx);
-                y += round(xy);
-                layer += round(xl);
-
-                if (layer < 1 || layer > n_layers ||
-                    x < ImgBorder || x >= idims[1] - ImgBorder ||
-                    y < ImgBorder || y >= idims[0] - ImgBorder) {
-                    converges = false;
-                    break;
-                }
-            }
-
-            // ensure convergence of interpolation
-            if (i >= MaxInterpSteps || !converges)
-                continue;
-
-            float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
-                           (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
-                           (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
-            float X[3] = {xx, xy, xl};
-
-            float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
-
-            contr = center_ptr[x*idims[0]+y]*img_scale + P * 0.5f;
-            if(abs(contr) < (contrast_thr / n_layers))
-                continue;
-
-            // principal curvatures are computed using the trace and det of Hessian
-            float d2  = CPTR(x, y) * 2.f;
-            float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
-            float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
-            float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
-                         CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
-
-            float tr = dxx + dyy;
-            float det = dxx * dyy - dxy * dxy;
-
-            // add FLT_EPSILON for double-precision compatibility
-            if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
-                continue;
-
-            if (*counter < max_feat)
-            {
-                x_out[*counter] = (x + xx) * (1 << octave);
-                y_out[*counter] = (y + xy) * (1 << octave);
-                layer_out[*counter] = layer;
-                response_out[*counter] = abs(contr);
-                size_out[*counter] = sigma*pow(2.f, octave + (layer + xl) / n_layers);
-                (*counter)++;
-            }
-        }
-    }
-
-#undef CPTR
-#undef PPTR
-#undef NPTR
-
-// Remove duplicate keypoints
-    void removeDuplicates(
-        float* x_out,
-        float* y_out,
-        unsigned* layer_out,
-        float* response_out,
-        float* size_out,
-        unsigned* counter,
-        const std::vector<feat_t>& sorted_feat)
-    {
-        size_t nfeat = sorted_feat.size();
-
-        for (size_t f = 0; f < nfeat; f++) {
-            float prec_fctr = 1e4f;
-
-            if (f < nfeat-1) {
-                if (round(sorted_feat[f].f[0]*prec_fctr) == round(sorted_feat[f+1].f[0]*prec_fctr) &&
-                    round(sorted_feat[f].f[1]*prec_fctr) == round(sorted_feat[f+1].f[1]*prec_fctr) &&
-                    round(sorted_feat[f].f[2]*prec_fctr) == round(sorted_feat[f+1].f[2]*prec_fctr) &&
-                    round(sorted_feat[f].f[3]*prec_fctr) == round(sorted_feat[f+1].f[3]*prec_fctr) &&
-                    sorted_feat[f].l == sorted_feat[f+1].l)
-                    continue;
-            }
-
-            x_out[*counter] = sorted_feat[f].f[0];
-            y_out[*counter] = sorted_feat[f].f[1];
-            response_out[*counter] = sorted_feat[f].f[2];
-            size_out[*counter] = sorted_feat[f].f[3];
-            layer_out[*counter] = sorted_feat[f].l;
-            (*counter)++;
-        }
-    }
-
-#define IPTR(Y, X) (img_ptr[(Y) * idims[0] + (X)])
-
-// Computes a canonical orientation for each image feature in an array.  Based
-// on Section 5 of Lowe's paper.  This function adds features to the array when
-// there is more than one dominant orientation at a given feature location.
-    template<typename T>
-    void calcOrientation(
-        float* x_out,
-        float* y_out,
-        unsigned* layer_out,
-        float* response_out,
-        float* size_out,
-        float* ori_out,
-        unsigned* counter,
-        const float* x_in,
-        const float* y_in,
-        const unsigned* layer_in,
-        const float* response_in,
-        const float* size_in,
-        const unsigned total_feat,
-        const std::vector< Array<T> >& gauss_pyr,
-        const unsigned max_feat,
-        const unsigned octave,
-        const unsigned n_layers,
-        const bool double_input)
-    {
-        const int n = OriHistBins;
-
-        float hist[OriHistBins];
-        float temphist[OriHistBins];
-
-        for (unsigned f = 0; f < total_feat; f++) {
-            // Load keypoint information
-            const float real_x = x_in[f];
-            const float real_y = y_in[f];
-            const unsigned layer = layer_in[f];
-            const float response = response_in[f];
-            const float size = size_in[f];
-
-            const int pt_x = (int)round(real_x / (1 << octave));
-            const int pt_y = (int)round(real_y / (1 << octave));
-
-            // Calculate auxiliary parameters
-            const float scl_octv = size*0.5f / (1 << octave);
-            const int radius = (int)round(OriRadius * scl_octv);
-            const float sigma = OriSigFctr * scl_octv;
-            const int len = (radius*2+1);
-            const float exp_denom = 2.f * sigma * sigma;
-
-            // Points img to correct Gaussian pyramid layer
-            const Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
-            const T* img_ptr = img.get();
-
-            for (int i = 0; i < OriHistBins; i++)
-                hist[i] = 0.f;
-
-            af::dim4 idims = img.dims();
-
-            // Calculate orientation histogram
-            for (int l = 0; l < len*len; l++) {
-                int i = l / len - radius;
-                int j = l % len - radius;
-
-                int y = pt_y + i;
-                int x = pt_x + j;
-                if (y < 1 || y >= idims[0] - 1 ||
-                    x < 1 || x >= idims[1] - 1)
-                    continue;
-
-                float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
-                float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
-                float mag = sqrt(dx*dx+dy*dy);
-                float ori = atan2(dy,dx);
-                float w = exp(-(i*i + j*j)/exp_denom);
-
-                int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
-                bin = bin < n ? bin : 0;
-
-                hist[bin] += w*mag;
-            }
-
-            for (int i = 0; i < SmoothOriPasses; i++) {
-                for (int j = 0; j < n; j++) {
-                    temphist[j] = hist[j];
-                }
-                for (int j = 0; j < n; j++) {
-                    float prev = (j == 0) ? temphist[n-1] : temphist[j-1];
-                    float next = (j+1 == n) ? temphist[0] : temphist[j+1];
-                    hist[j] = 0.25f * prev + 0.5f * temphist[j] + 0.25f * next;
-                }
-            }
-
-            float omax = hist[0];
-            for (int i = 1; i < n; i++)
-                omax = max(omax, hist[i]);
-
-            float mag_thr = (float)(omax * OriPeakRatio);
-            int l, r;
-            for (int j = 0; j < n; j++) {
-                l = (j == 0) ? n - 1 : j - 1;
-                r = (j + 1) % n;
-                if (hist[j] > hist[l] &&
-                    hist[j] > hist[r] &&
-                    hist[j] >= mag_thr) {
-                    if (*counter < max_feat) {
-                        float bin = j + 0.5f * (hist[l] - hist[r]) /
-                            (hist[l] - 2.0f*hist[j] + hist[r]);
-                        bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
-                        float ori = 360.f - ((360.f/n) * bin);
-
-                        float new_real_x = real_x;
-                        float new_real_y = real_y;
-                        float new_size = size;
-
-                        if (double_input) {
-                            float scale = 0.5f;
-                            new_real_x *= scale;
-                            new_real_y *= scale;
-                            new_size *= scale;
-                        }
-
-                        x_out[*counter] = new_real_x;
-                        y_out[*counter] = new_real_y;
-                        layer_out[*counter] = layer;
-                        response_out[*counter] = response;
-                        size_out[*counter] = new_size;
-                        ori_out[*counter] = ori;
-                        (*counter)++;
-                    }
-                }
-            }
-        }
-    }
-
-    void normalizeDesc(
-        float* desc,
-        const int histlen)
-    {
-        float len_sq = 0.0f;
-
-        for (int i = 0; i < histlen; i++)
-            len_sq += desc[i] * desc[i];
-
-        float len_inv = 1.0f / sqrt(len_sq);
-
-        for (int i = 0; i < histlen; i++) {
-            desc[i] *= len_inv;
-        }
-    }
-
-// Computes feature descriptors for features in an array.  Based on Section 6
-// of Lowe's paper.
-    template<typename T>
-    void computeDescriptor(
-        float* desc_out,
-        const unsigned desc_len,
-        const float* x_in,
-        const float* y_in,
-        const unsigned* layer_in,
-        const float* response_in,
-        const float* size_in,
-        const float* ori_in,
-        const unsigned total_feat,
-        const std::vector< Array<T> >& gauss_pyr,
-        const int d,
-        const int n,
-        const float scale,
-        const float sigma,
-        const unsigned octave,
-        const unsigned n_layers)
-    {
-        float desc[128];
-
-        for (unsigned f = 0; f < total_feat; f++) {
-            const unsigned layer = layer_in[f];
-            float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
-            ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
-            //const float size = size_in[f];
-            const int fx = round(x_in[f] * scale);
-            const int fy = round(y_in[f] * scale);
-
-            // Points img to correct Gaussian pyramid layer
-            Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
-            const T* img_ptr = img.get();
-            af::dim4 idims = img.dims();
-
-            float cos_t = cos(ori);
-            float sin_t = sin(ori);
-            float bins_per_rad = n / (PI_VAL * 2.f);
-            float exp_denom = d * d * 0.5f;
-            float hist_width = DescrSclFctr * sigma * powf(2.f, layer/n_layers);
-            int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
-
-            int len = radius*2+1;
-            const int histlen = d*d*n;
-
-            for (int i = 0; i < histlen; i++)
-                desc[i] = 0.f;
-
-            // Calculate orientation histogram
-            for (int l = 0; l < len*len; l++) {
-                int i = l / len - radius;
-                int j = l % len - radius;
-
-                int y = fy + i;
-                int x = fx + j;
-
-                float x_rot = (j * cos_t - i * sin_t) / hist_width;
-                float y_rot = (j * sin_t + i * cos_t) / hist_width;
-                float xbin = x_rot + d/2 - 0.5f;
-                float ybin = y_rot + d/2 - 0.5f;
-
-                if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d &&
-                    y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
-                    float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
-                    float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
-                    float grad_mag = sqrt(dx*dx + dy*dy);
-                    float grad_ori = atan2(dy, dx) - ori;
-                    while (grad_ori < 0.0f)
-                        grad_ori += PI_VAL*2;
-                    while (grad_ori >= PI_VAL*2)
-                        grad_ori -= PI_VAL*2;
-
-                    float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
-                    float obin = grad_ori * bins_per_rad;
-                    float mag = grad_mag*w;
-
-                    int x0 = floor(xbin);
-                    int y0 = floor(ybin);
-                    int o0 = floor(obin);
-                    xbin -= x0;
-                    ybin -= y0;
-                    obin -= o0;
-
-                    for (int yl = 0; yl <= 1; yl++) {
-                        int yb = y0 + yl;
-                        if (yb >= 0 && yb < d) {
-                            float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
-                            for (int xl = 0; xl <= 1; xl++) {
-                                int xb = x0 + xl;
-                                if (xb >= 0 && xb < d) {
-                                    float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
-                                    for (int ol = 0; ol <= 1; ol++) {
-                                        int ob = (o0 + ol) % n;
-                                        float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
-                                        desc[(yb*d + xb)*n + ob] += v_o;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-
-            normalizeDesc(desc, histlen);
-
-            for (int i = 0; i < d*d*n; i++)
-                desc[i] = min(desc[i], DescrMagThr);
-
-            normalizeDesc(desc, histlen);
-
-            // Calculate final descriptor values
-            for (int k = 0; k < d*d*n; k++) {
-                desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
-            }
-        }
-    }
-
-#undef IPTR
-
-    template<typename T, typename convAccT>
-    Array<T> createInitialImage(
-        const Array<T>& img,
-        const float init_sigma,
-        const bool double_input)
-    {
-        af::dim4 idims = img.dims();
-
-        Array<T> init_img = createEmptyArray<T>(af::dim4());
-
-        float s = (double_input) ? sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4)
-            : sqrt(init_sigma * init_sigma - InitSigma * InitSigma);
-
-        Array<T> filter = gauss_filter<T>(s);
-
-        if (double_input) {
-            Array<T> double_img = resize<T>(img, idims[0] * 2, idims[1] * 2, AF_INTERP_BILINEAR);
-            init_img = convolve2<T, convAccT, false>(double_img, filter, filter);
-        }
-        else {
-            init_img = convolve2<T, convAccT, false>(init_img, filter, filter);
-        }
-
-        return init_img;
-    }
-
-    template<typename T, typename convAccT>
-    std::vector< Array<T> > buildGaussPyr(
-        const Array<T>& init_img,
-        const unsigned n_octaves,
-        const unsigned n_layers,
-        const float init_sigma)
-    {
-        // Precompute Gaussian sigmas using the following formula:
-        // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
-        std::vector<float> sig_layers(n_layers + 3);
-        sig_layers[0] = init_sigma;
-        float k = std::pow(2.0f, 1.0f / n_layers);
-        for (unsigned i = 1; i < n_layers + 3; i++) {
-            float sig_prev = std::pow(k, i-1) * init_sigma;
-            float sig_total = sig_prev * k;
-            sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
-        }
-
-        // Gaussian Pyramid
-        std::vector< Array<T> > gauss_pyr(n_octaves * (n_layers+3), createEmptyArray<T>(af::dim4()));
-        for (unsigned o = 0; o < n_octaves; o++) {
-            for (unsigned l = 0; l < n_layers+3; l++) {
-                unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1;
-                unsigned idx = o*(n_layers+3) + l;
-
-                if (o == 0 && l == 0) {
-                    gauss_pyr[idx] = init_img;
-                }
-                else if (l == 0) {
-                    af::dim4 sdims = gauss_pyr[src_idx].dims();
-                    gauss_pyr[idx] = resize<T>(gauss_pyr[src_idx], sdims[0] / 2, sdims[1] / 2, AF_INTERP_BILINEAR);
-                }
-                else {
-                    Array<T> filter = gauss_filter<T>(sig_layers[l]);
-
-                    gauss_pyr[idx] = convolve2<T, convAccT, false>(gauss_pyr[src_idx], filter, filter);
-                }
-            }
-        }
-
-        return gauss_pyr;
-    }
-
-    template<typename T>
-    std::vector< Array<T> > buildDoGPyr(
-        std::vector< Array<T> >& gauss_pyr,
-        const unsigned n_octaves,
-        const unsigned n_layers)
-    {
-        // DoG Pyramid
-        std::vector< Array<T> > dog_pyr(n_octaves * (n_layers+2), createEmptyArray<T>(af::dim4()));
-        for (unsigned o = 0; o < n_octaves; o++) {
-            for (unsigned l = 0; l < n_layers+2; l++) {
-                unsigned idx    = o*(n_layers+2) + l;
-                unsigned bottom = o*(n_layers+3) + l;
-                unsigned top    = o*(n_layers+3) + l+1;
-
-                dog_pyr[idx] = createEmptyArray<T>(gauss_pyr[bottom].dims());
-
-                sub<T>(dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]);
-            }
-        }
-
-        return dog_pyr;
-    }
-
-
-    template<typename T, typename convAccT>
-    unsigned sift_impl(Array<float>& x, Array<float>& y, Array<float>& score,
-                       Array<float>& ori, Array<float>& size, Array<float>& desc,
-                       const Array<T>& in, const unsigned n_layers,
-                       const float contrast_thr, const float edge_thr,
-                       const float init_sigma, const bool double_input,
-                       const float img_scale, const float feature_ratio)
-    {
-        af::dim4 idims = in.dims();
-
-        const unsigned min_dim = (double_input) ? min(idims[0]*2, idims[1]*2)
-            : min(idims[0], idims[1]);
-        const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
-
-        Array<T> init_img = createInitialImage<T, convAccT>(in, init_sigma, double_input);
-
-        std::vector< Array<T> > gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
-
-        std::vector< Array<T> > dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers);
-
-        std::vector<float*> x_pyr(n_octaves, NULL);
-        std::vector<float*> y_pyr(n_octaves, NULL);
-        std::vector<float*> response_pyr(n_octaves, NULL);
-        std::vector<float*> size_pyr(n_octaves, NULL);
-        std::vector<float*> ori_pyr(n_octaves, NULL);
-        std::vector<float*> desc_pyr(n_octaves, NULL);
-        std::vector<unsigned> feat_pyr(n_octaves, 0);
-        unsigned total_feat = 0;
-
-        const unsigned d = DescrWidth;
-        const unsigned n = DescrHistBins;
-        const unsigned desc_len = d*d*n;
-
-        for (unsigned i = 0; i < n_octaves; i++) {
-            af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims();
-            if (ddims[0]-2*ImgBorder < 1 ||
-                ddims[1]-2*ImgBorder < 1)
-                continue;
-
-            const unsigned imel = ddims[0] * ddims[1];
-            const unsigned max_feat = ceil(imel * feature_ratio);
-
-            float* extrema_x = memAlloc<float>(max_feat);
-            float* extrema_y = memAlloc<float>(max_feat);
-            unsigned* extrema_layer = memAlloc<unsigned>(max_feat);
-            unsigned extrema_feat = 0;
-
-            for (unsigned j = 1; j <= n_layers; j++) {
-                unsigned prev   = i*(n_layers+2) + j-1;
-                unsigned center = i*(n_layers+2) + j;
-                unsigned next   = i*(n_layers+2) + j+1;
-
-                unsigned layer = j;
-
-                float extrema_thr = 0.5f * contrast_thr / n_layers;
-                detectExtrema<T>(extrema_x, extrema_y, extrema_layer, &extrema_feat,
-                                 dog_pyr[prev], dog_pyr[center], dog_pyr[next],
-                                 layer, max_feat, extrema_thr);
-            }
-
-            extrema_feat = min(extrema_feat, max_feat);
-
-            if (extrema_feat == 0) {
-                memFree(extrema_x);
-                memFree(extrema_y);
-                memFree(extrema_layer);
-
-                continue;
-            }
-
-            unsigned interp_feat = 0;
-
-            float* interp_x = memAlloc<float>(extrema_feat);
-            float* interp_y = memAlloc<float>(extrema_feat);
-            unsigned* interp_layer = memAlloc<unsigned>(extrema_feat);
-            float* interp_response = memAlloc<float>(extrema_feat);
-            float* interp_size = memAlloc<float>(extrema_feat);
-
-            interpolateExtrema<T>(interp_x, interp_y, interp_layer,
-                                  interp_response, interp_size, &interp_feat,
-                                  extrema_x, extrema_y, extrema_layer, extrema_feat,
-                                  dog_pyr, max_feat, i, n_layers,
-                                  contrast_thr, edge_thr, init_sigma, img_scale);
-
-            interp_feat = min(interp_feat, max_feat);
-
-            if (interp_feat == 0) {
-                memFree(interp_x);
-                memFree(interp_y);
-                memFree(interp_layer);
-                memFree(interp_response);
-                memFree(interp_size);
-
-                continue;
-            }
-
-            std::vector<feat_t> sorted_feat;
-            array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat);
-            std::sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp);
-
-            memFree(interp_x);
-            memFree(interp_y);
-            memFree(interp_layer);
-            memFree(interp_response);
-            memFree(interp_size);
-
-            unsigned nodup_feat = 0;
-
-            float* nodup_x = memAlloc<float>(interp_feat);
-            float* nodup_y = memAlloc<float>(interp_feat);
-            unsigned* nodup_layer = memAlloc<unsigned>(interp_feat);
-            float* nodup_response = memAlloc<float>(interp_feat);
-            float* nodup_size = memAlloc<float>(interp_feat);
-
-            removeDuplicates(nodup_x, nodup_y, nodup_layer,
-                             nodup_response, nodup_size, &nodup_feat,
-                             sorted_feat);
-
-            const unsigned max_oriented_feat = nodup_feat * 3;
-
-            float* oriented_x = memAlloc<float>(max_oriented_feat);
-            float* oriented_y = memAlloc<float>(max_oriented_feat);
-            unsigned* oriented_layer = memAlloc<unsigned>(max_oriented_feat);
-            float* oriented_response = memAlloc<float>(max_oriented_feat);
-            float* oriented_size = memAlloc<float>(max_oriented_feat);
-            float* oriented_ori = memAlloc<float>(max_oriented_feat);
-
-            unsigned oriented_feat = 0;
-
-            calcOrientation<T>(oriented_x, oriented_y, oriented_layer,
-                               oriented_response, oriented_size, oriented_ori, &oriented_feat,
-                               nodup_x, nodup_y, nodup_layer,
-                               nodup_response, nodup_size, nodup_feat,
-                               gauss_pyr, max_oriented_feat, i, n_layers, double_input);
-
-            memFree(nodup_x);
-            memFree(nodup_y);
-            memFree(nodup_layer);
-            memFree(nodup_response);
-            memFree(nodup_size);
-
-            if (oriented_feat == 0) {
-                memFree(oriented_x);
-                memFree(oriented_y);
-                memFree(oriented_layer);
-                memFree(oriented_response);
-                memFree(oriented_size);
-                memFree(oriented_ori);
-
-                continue;
-            }
-
-            float* desc = memAlloc<float>(oriented_feat * desc_len);
-
-            float scale = 1.f/(1 << i);
-            if (double_input) scale *= 2.f;
-
-            computeDescriptor<T>(desc, desc_len,
-                                 oriented_x, oriented_y, oriented_layer,
-                                 oriented_response, oriented_size, oriented_ori,
-                                 oriented_feat, gauss_pyr, d, n, scale, init_sigma, i, n_layers);
-
-            total_feat += oriented_feat;
-            feat_pyr[i] = oriented_feat;
-
-            if (oriented_feat > 0) {
-                x_pyr[i] = oriented_x;
-                y_pyr[i] = oriented_y;
-                response_pyr[i] = oriented_response;
-                ori_pyr[i] = oriented_ori;
-                size_pyr[i] = oriented_size;
-                desc_pyr[i] = desc;
-            }
-        }
-
-        if (total_feat > 0) {
-            const af::dim4 total_feat_dims(total_feat);
-            const af::dim4 desc_dims(desc_len, total_feat);
-
-            // Allocate output memory
-            x     = createEmptyArray<float>(total_feat_dims);
-            y     = createEmptyArray<float>(total_feat_dims);
-            score = createEmptyArray<float>(total_feat_dims);
-            ori   = createEmptyArray<float>(total_feat_dims);
-            size  = createEmptyArray<float>(total_feat_dims);
-            desc  = createEmptyArray<float>(desc_dims);
-
-            float* x_ptr = x.get();
-            float* y_ptr = y.get();
-            float* score_ptr = score.get();
-            float* ori_ptr = ori.get();
-            float* size_ptr = size.get();
-            float* desc_ptr = desc.get();
-
-            unsigned offset = 0;
-            for (unsigned i = 0; i < n_octaves; i++) {
-                if (feat_pyr[i] == 0)
-                    continue;
-
-                memcpy(x_ptr+offset,     x_pyr[i],        feat_pyr[i] * sizeof(float));
-                memcpy(y_ptr+offset,     y_pyr[i],        feat_pyr[i] * sizeof(float));
-                memcpy(score_ptr+offset, response_pyr[i], feat_pyr[i] * sizeof(float));
-                memcpy(ori_ptr+offset,   ori_pyr[i],      feat_pyr[i] * sizeof(float));
-                memcpy(size_ptr+offset,  size_pyr[i],     feat_pyr[i] * sizeof(float));
-
-                memcpy(desc_ptr+(offset*desc_len), desc_pyr[i], feat_pyr[i] * desc_len * sizeof(float));
-
-                memFree(x_pyr[i]);
-                memFree(y_pyr[i]);
-                memFree(response_pyr[i]);
-                memFree(ori_pyr[i]);
-                memFree(size_pyr[i]);
-                memFree(desc_pyr[i]);
-
-                offset += feat_pyr[i];
-            }
-        }
-
-        return total_feat;
-    }
-}
diff --git a/src/backend/cuda/kernel/sift_nonfree.hpp b/src/backend/cuda/kernel/sift_nonfree.hpp
deleted file mode 100644
index c8de759..0000000
--- a/src/backend/cuda/kernel/sift_nonfree.hpp
+++ /dev/null
@@ -1,1370 +0,0 @@
-/*******************************************************
- * Copyright (c) 2015, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-// The source code contained in this file is based on the original code by
-// Rob Hess. Please note that SIFT is an algorithm patented and protected
-// by US law, before using this code or any binary forms generated from it,
-// verify that you have permission to do so. The original license by Rob Hess
-// can be read below:
-//
-// Copyright (c) 2006-2012, Rob Hess <rob at iqengines.com>
-// All rights reserved.
-//
-// The following patent has been issued for methods embodied in this
-// software: "Method and apparatus for identifying scale invariant features
-// in an image and use of same for locating an object in an image," David
-// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
-// filed March 8, 1999. Asignee: The University of British Columbia. For
-// further details, contact David Lowe (lowe at cs.ubc.ca) or the
-// University-Industry Liaison Office of the University of British
-// Columbia.
-//
-// Note that restrictions imposed by this patent (and possibly others)
-// exist independently of and may be in conflict with the freedoms granted
-// in this license, which refers to copyright of the program, not patents
-// for any methods that it implements.  Both copyright and patent law must
-// be obeyed to legally use and redistribute this program and it is not the
-// purpose of this license to induce you to infringe any patents or other
-// property right claims or to contest validity of any such claims.  If you
-// redistribute or use the program, then this license merely protects you
-// from committing copyright infringement.  It does not protect you from
-// committing patent infringement.  So, before you do anything with this
-// program, make sure that you have permission to do so not merely in terms
-// of copyright, but also in terms of patent law.
-//
-// Please note that this license is not to be understood as a guarantee
-// either.  If you use the program according to this license, but in
-// conflict with patent law, it does not mean that the licensor will refund
-// you for any losses that you incur if you are sued for your patent
-// infringement.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//     * Redistributions of source code must retain the above copyright and
-//       patent notices, this list of conditions and the following
-//       disclaimer.
-//     * Redistributions in binary form must reproduce the above copyright
-//       notice, this list of conditions and the following disclaimer in
-//       the documentation and/or other materials provided with the
-//       distribution.
-//     * Neither the name of Oregon State University nor the names of its
-//       contributors may be used to endorse or promote products derived
-//       from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include <af/defines.h>
-#include <dispatch.hpp>
-#include <err_cuda.hpp>
-#include <debug_cuda.hpp>
-#include <memory.hpp>
-#include "shared.hpp"
-
-#include <convolve_common.hpp>
-#include "convolve.hpp"
-#include "resize.hpp"
-
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
-#include <thrust/generate.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/gather.h>
-#include <thrust/random.h>
-
-#include <cfloat>
-
-namespace cuda
-{
-
-namespace kernel
-{
-
-static const dim_t SIFT_THREADS   = 256;
-static const dim_t SIFT_THREADS_X = 32;
-static const dim_t SIFT_THREADS_Y = 8;
-
-#define PI_VAL 3.14159265358979323846f
-
-// default width of descriptor histogram array
-#define DESCR_WIDTH 4
-
-// default number of bins per histogram in descriptor array
-#define DESCR_HIST_BINS 8
-
-// assumed gaussian blur for input image
-#define INIT_SIGMA 0.5f
-
-// width of border in which to ignore keypoints
-#define IMG_BORDER 5
-
-// maximum steps of keypoint interpolation before failure
-#define MAX_INTERP_STEPS 5
-
-// default number of bins in histogram for orientation assignment
-#define ORI_HIST_BINS 36
-
-// determines gaussian sigma for orientation assignment
-#define ORI_SIG_FCTR 1.5f
-
-// determines the radius of the region used in orientation assignment
-#define ORI_RADIUS (3.0f * ORI_SIG_FCTR)
-
-// number of passes of orientation histogram smoothing
-#define SMOOTH_ORI_PASSES 2
-
-// orientation magnitude relative to max that results in new feature
-#define ORI_PEAK_RATIO 0.8f
-
-// determines the size of a single descriptor orientation histogram
-#define DESCR_SCL_FCTR 3.f
-
-// threshold on magnitude of elements of descriptor vector
-#define DESC_MAG_THR 0.2f
-
-// factor used to convert floating-point descriptor to unsigned char
-#define INT_DESCR_FCTR 512.f
-
-template<typename T>
-void gaussian1D(T* out, const int dim, double sigma=0.0)
-{
-    if(!(sigma>0)) sigma = 0.25*dim;
-
-    T sum = (T)0;
-    for(int i=0;i<dim;i++)
-    {
-        int x = i-(dim-1)/2;
-        T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
-        out[i] = el;
-        sum   += el;
-    }
-
-    for(int k=0;k<dim;k++)
-        out[k] /= sum;
-}
-
-template<typename T>
-Param<T> gauss_filter(float sigma)
-{
-    // Using 6-sigma rule
-    unsigned gauss_len = (unsigned)round(sigma * 6 + 1) | 1;
-
-    T* h_gauss = new T[gauss_len];
-    gaussian1D(h_gauss, gauss_len, sigma);
-    Param<T> gauss_filter;
-    gauss_filter.dims[0] = gauss_len;
-    gauss_filter.strides[0] = 1;
-
-    for (int k = 1; k < 4; k++) {
-        gauss_filter.dims[k] = 1;
-        gauss_filter.strides[k] = gauss_filter.dims[k-1] * gauss_filter.strides[k-1];
-    }
-
-    dim_t gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];
-    gauss_filter.ptr = memAlloc<T>(gauss_elem);
-    CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(T), cudaMemcpyHostToDevice));
-
-    delete[] h_gauss;
-
-    return gauss_filter;
-}
-
-template<int N>
-__inline__ __device__ void gaussianElimination(float* A, float* b, float* x)
-{
-    // forward elimination
-    #pragma unroll
-    for (int i = 0; i < N-1; i++) {
-        #pragma unroll
-        for (int j = i+1; j < N; j++) {
-            float s = A[j*N+i] / A[i*N+i];
-
-            #pragma unroll
-            for (int k = i; k < N; k++)
-                A[j*N+k] -= s * A[i*N+k];
-
-            b[j] -= s * b[i];
-        }
-    }
-
-    #pragma unroll
-    for (int i = 0; i < N; i++)
-        x[i] = 0;
-
-    // backward substitution
-    float sum = 0;
-    #pragma unroll
-    for (int i = 0; i <= N-2; i++) {
-        sum = b[i];
-        #pragma unroll
-        for (int j = i+1; j < N; j++)
-            sum -= A[i*N+j] * x[j];
-        x[i] = sum / A[i*N+i];
-    }
-}
-
-__inline__ __device__ void normalizeDesc(
-    float* desc,
-    float* accum,
-    const int histlen)
-{
-    int tid_x = threadIdx.x;
-    int tid_y = threadIdx.y;
-    int bsz_x = blockDim.x;
-
-    for (int i = tid_x; i < histlen; i += bsz_x)
-        accum[tid_x] = desc[tid_y*histlen+i]*desc[tid_y*histlen+i];
-    __syncthreads();
-
-    if (tid_x < 64)
-        accum[tid_x] += accum[tid_x+64];
-    __syncthreads();
-    if (tid_x < 32)
-        accum[tid_x] += accum[tid_x+32];
-    __syncthreads();
-    if (tid_x < 16)
-        accum[tid_x] += accum[tid_x+16];
-    __syncthreads();
-    if (tid_x < 8)
-        accum[tid_x] += accum[tid_x+8];
-    __syncthreads();
-    if (tid_x < 4)
-        accum[tid_x] += accum[tid_x+4];
-    __syncthreads();
-    if (tid_x < 2)
-        accum[tid_x] += accum[tid_x+2];
-    __syncthreads();
-    if (tid_x < 1)
-        accum[tid_x] += accum[tid_x+1];
-    __syncthreads();
-
-    float len_sq = accum[0];
-    float len_inv = 1.0f / sqrtf(len_sq);
-
-    for (int i = tid_x; i < histlen; i += bsz_x) {
-        desc[tid_y*histlen+i] *= len_inv;
-    }
-    __syncthreads();
-}
-
-template<typename T>
-__global__ void sub(
-    Param<T> out,
-    CParam<T> in,
-    const unsigned nel,
-    const unsigned n_layers)
-{
-    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    for (unsigned l = 0; l < n_layers; l++)
-        out.ptr[l*nel + i] = in.ptr[(l+1)*nel + i] - in.ptr[l*nel + i];
-}
-
-#define SCPTR(Y, X) (s_center[(Y) * s_i + (X)])
-#define SPPTR(Y, X) (s_prev[(Y) * s_i + (X)])
-#define SNPTR(Y, X) (s_next[(Y) * s_i + (X)])
-#define DPTR(Z, Y, X) (dog.ptr[(Z) * imel + (Y) * dim0 + (X)])
-
-// Determines whether a pixel is a scale-space extremum by comparing it to its
-// 3x3x3 pixel neighborhood.
-template<typename T>
-__global__ void detectExtrema(
-    float* x_out,
-    float* y_out,
-    unsigned* layer_out,
-    unsigned* counter,
-    CParam<T> dog,
-    const unsigned max_feat,
-    const float threshold)
-{
-    const int dim0 = dog.dims[0];
-    const int dim1 = dog.dims[1];
-    const int imel = dim0 * dim1;
-
-    const int tid_i = threadIdx.x;
-    const int tid_j = threadIdx.y;
-    const int bsz_i = blockDim.x;
-    const int bsz_j = blockDim.y;
-    const int i = blockIdx.x * bsz_i + tid_i+IMG_BORDER;
-    const int j = blockIdx.y * bsz_j + tid_j+IMG_BORDER;
-
-    const int x = tid_i+1;
-    const int y = tid_j+1;
-
-    // One pixel border for each side
-    const int s_i = bsz_i+2;
-    const int s_j = bsz_j+2;
-
-    SharedMemory<float> shared;
-    float* shrdMem = shared.getPointer();
-    float* s_next   = shrdMem;
-    float* s_center = shrdMem + s_i * s_j;
-    float* s_prev   = shrdMem + s_i * s_j * 2;
-
-    for (int l = 1; l < dog.dims[2]-1; l++) {
-        const int s_i_half = s_i/2;
-        const int s_j_half = s_j/2;
-        if (tid_i < s_i_half && tid_j < s_j_half && i < dim0-IMG_BORDER+1 && j < dim1-IMG_BORDER+1) {
-            SNPTR(tid_j, tid_i) = DPTR(l+1, j-1, i-1);
-            SCPTR(tid_j, tid_i) = DPTR(l,   j-1, i-1);
-            SPPTR(tid_j, tid_i) = DPTR(l-1, j-1, i-1);
-
-            SNPTR(tid_j, tid_i+s_i_half) = DPTR((l+1), j-1, i-1+s_i_half);
-            SCPTR(tid_j, tid_i+s_i_half) = DPTR((l  ), j-1, i-1+s_i_half);
-            SPPTR(tid_j, tid_i+s_i_half) = DPTR((l-1), j-1, i-1+s_i_half);
-
-            SNPTR(tid_j+s_j_half, tid_i) = DPTR(l+1, j-1+s_j_half, i-1);
-            SCPTR(tid_j+s_j_half, tid_i) = DPTR(l,   j-1+s_j_half, i-1);
-            SPPTR(tid_j+s_j_half, tid_i) = DPTR(l-1, j-1+s_j_half, i-1);
-
-            SNPTR(tid_j+s_j_half, tid_i+s_i_half) = DPTR(l+1, j-1+s_j_half, i-1+s_i_half);
-            SCPTR(tid_j+s_j_half, tid_i+s_i_half) = DPTR(l,   j-1+s_j_half, i-1+s_i_half);
-            SPPTR(tid_j+s_j_half, tid_i+s_i_half) = DPTR(l-1, j-1+s_j_half, i-1+s_i_half);
-        }
-        __syncthreads();
-
-        float p = SCPTR(y, x);
-
-        if (abs(p) > threshold && i < dim0-IMG_BORDER && j < dim1-IMG_BORDER &&
-            ((p > 0 && p > SCPTR(y-1, x-1) && p > SCPTR(y-1, x) &&
-              p > SCPTR(y-1, x+1) && p > SCPTR(y, x-1) && p > SCPTR(y,   x+1)  &&
-              p > SCPTR(y+1, x-1) && p > SCPTR(y+1, x) && p > SCPTR(y+1, x+1)  &&
-              p > SPPTR(y-1, x-1) && p > SPPTR(y-1, x) && p > SPPTR(y-1, x+1)  &&
-              p > SPPTR(y,   x-1) && p > SPPTR(y  , x) && p > SPPTR(y,   x+1)  &&
-              p > SPPTR(y+1, x-1) && p > SPPTR(y+1, x) && p > SPPTR(y+1, x+1)  &&
-              p > SNPTR(y-1, x-1) && p > SNPTR(y-1, x) && p > SNPTR(y-1, x+1)  &&
-              p > SNPTR(y,   x-1) && p > SNPTR(y  , x) && p > SNPTR(y,   x+1)  &&
-              p > SNPTR(y+1, x-1) && p > SNPTR(y+1, x) && p > SNPTR(y+1, x+1)) ||
-             (p < 0 && p < SCPTR(y-1, x-1) && p < SCPTR(y-1, x) &&
-              p < SCPTR(y-1, x+1) && p < SCPTR(y, x-1) && p < SCPTR(y,   x+1)  &&
-              p < SCPTR(y+1, x-1) && p < SCPTR(y+1, x) && p < SCPTR(y+1, x+1)  &&
-              p < SPPTR(y-1, x-1) && p < SPPTR(y-1, x) && p < SPPTR(y-1, x+1)  &&
-              p < SPPTR(y,   x-1) && p < SPPTR(y  , x) && p < SPPTR(y,   x+1)  &&
-              p < SPPTR(y+1, x-1) && p < SPPTR(y+1, x) && p < SPPTR(y+1, x+1)  &&
-              p < SNPTR(y-1, x-1) && p < SNPTR(y-1, x) && p < SNPTR(y-1, x+1)  &&
-              p < SNPTR(y,   x-1) && p < SNPTR(y  , x) && p < SNPTR(y,   x+1)  &&
-              p < SNPTR(y+1, x-1) && p < SNPTR(y+1, x) && p < SNPTR(y+1, x+1)))) {
-
-            unsigned idx = atomicAdd(counter, 1u);
-            if (idx < max_feat)
-            {
-                x_out[idx] = (float)j;
-                y_out[idx] = (float)i;
-                layer_out[idx] = l;
-            }
-        }
-        __syncthreads();
-    }
-}
-
-#undef SCPTR
-#undef SPPTR
-#undef SNPTR
-#define CPTR(Y, X) (center_ptr[(Y) * dim0 + (X)])
-#define PPTR(Y, X) (prev_ptr[(Y) * dim0 + (X)])
-#define NPTR(Y, X) (next_ptr[(Y) * dim0 + (X)])
-
-// Interpolates a scale-space extremum's location and scale to subpixel
-// accuracy to form an image feature. Rejects features with low contrast.
-// Based on Section 4 of Lowe's paper.
-template<typename T>
-__global__ void interpolateExtrema(
-    float* x_out,
-    float* y_out,
-    unsigned* layer_out,
-    float* response_out,
-    float* size_out,
-    unsigned* counter,
-    const float* x_in,
-    const float* y_in,
-    const unsigned* layer_in,
-    const unsigned extrema_feat,
-    const CParam<T> dog_octave,
-    const unsigned max_feat,
-    const unsigned octave,
-    const unsigned n_layers,
-    const float contrast_thr,
-    const float edge_thr,
-    const float sigma,
-    const float img_scale)
-{
-    const unsigned f = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (f < extrema_feat) {
-        const float first_deriv_scale = img_scale*0.5f;
-        const float second_deriv_scale = img_scale;
-        const float cross_deriv_scale = img_scale*0.25f;
-
-        float xl = 0, xy = 0, xx = 0, contr = 0;
-        int i = 0;
-
-        unsigned x = x_in[f];
-        unsigned y = y_in[f];
-        unsigned layer = layer_in[f];
-
-        const int dim0 = dog_octave.dims[0];
-        const int dim1 = dog_octave.dims[1];
-        const int imel = dim0 * dim1;
-
-        const T* prev_ptr   = dog_octave.ptr + (layer-1) * imel;
-        const T* center_ptr = dog_octave.ptr + (layer)   * imel;
-        const T* next_ptr   = dog_octave.ptr + (layer+1) * imel;
-
-        for(i = 0; i < MAX_INTERP_STEPS; i++) {
-            float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
-                           (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
-                           (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
-
-            float d2  = CPTR(x, y) * 2.f;
-            float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
-            float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
-            float dss = (NPTR(x, y  ) + PPTR(x, y  ) - d2) * second_deriv_scale;
-            float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
-                         CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
-            float dxs = (NPTR(x+1, y) - NPTR(x-1, y) -
-                         PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale;
-            float dys = (NPTR(x, y+1) - NPTR(x, y-1) -
-                         PPTR(x, y+1) + PPTR(x, y-1)) * cross_deriv_scale;
-
-            float H[9] = {dxx, dxy, dxs,
-                          dxy, dyy, dys,
-                          dxs, dys, dss};
-
-            float X[3];
-            gaussianElimination<3>(H, dD, X);
-
-            xl = -X[2];
-            xy = -X[1];
-            xx = -X[0];
-
-            if (abs(xl) < 0.5f && abs(xy) < 0.5f && abs(xx) < 0.5f)
-                break;
-
-            x += round(xx);
-            y += round(xy);
-            layer += round(xl);
-
-            if (layer < 1 || layer > n_layers ||
-                x < IMG_BORDER || x >= dim1 - IMG_BORDER ||
-                y < IMG_BORDER || y >= dim0 - IMG_BORDER)
-                return;
-        }
-
-        // ensure convergence of interpolation
-        if (i >= MAX_INTERP_STEPS)
-            return;
-
-        float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
-                       (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
-                       (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
-        float X[3] = {xx, xy, xl};
-
-        float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
-
-        contr = CPTR(x, y)*img_scale + P * 0.5f;
-        if (abs(contr) < (contrast_thr / n_layers))
-            return;
-
-        // principal curvatures are computed using the trace and det of Hessian
-        float d2  = CPTR(x, y) * 2.f;
-        float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
-        float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
-        float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
-                     CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
-
-        float tr = dxx + dyy;
-        float det = dxx * dyy - dxy * dxy;
-
-        // add FLT_EPSILON for double-precision compatibility
-        if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
-            return;
-
-        unsigned ridx = atomicAdd(counter, 1u);
-
-        if (ridx < max_feat)
-        {
-            x_out[ridx] = (x + xx) * (1 << octave);
-            y_out[ridx] = (y + xy) * (1 << octave);
-            layer_out[ridx] = layer;
-            response_out[ridx] = abs(contr);
-            size_out[ridx] = sigma*pow(2.f, octave + (layer + xl) / n_layers);
-        }
-    }
-}
-
-#undef CPTR
-#undef PPTR
-#undef NPTR
-
-// Remove duplicate keypoints
-__global__ void removeDuplicates(
-    float* x_out,
-    float* y_out,
-    unsigned* layer_out,
-    float* response_out,
-    float* size_out,
-    unsigned* counter,
-    const float* x_in,
-    const float* y_in,
-    const unsigned* layer_in,
-    const float* response_in,
-    const float* size_in,
-    const unsigned total_feat)
-{
-    const unsigned f = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (f >= total_feat)
-        return;
-
-    float prec_fctr = 1e4f;
-
-    if (f < total_feat-1) {
-        if (round(x_in[f]*prec_fctr) == round(x_in[f+1]*prec_fctr) &&
-            round(y_in[f]*prec_fctr) == round(y_in[f+1]*prec_fctr) &&
-            layer_in[f] == layer_in[f+1] &&
-            round(response_in[f]*prec_fctr) == round(response_in[f+1]*prec_fctr) &&
-            round(size_in[f]*prec_fctr) == round(size_in[f+1]*prec_fctr))
-            return;
-    }
-
-    unsigned idx = atomicAdd(counter, 1);
-
-    x_out[idx] = x_in[f];
-    y_out[idx] = y_in[f];
-    layer_out[idx] = layer_in[f];
-    response_out[idx] = response_in[f];
-    size_out[idx] = size_in[f];
-}
-
-#define IPTR(Y, X) (img_ptr[(Y) * dim0 + (X)])
-
-// Computes a canonical orientation for each image feature in an array.  Based
-// on Section 5 of Lowe's paper.  This function adds features to the array when
-// there is more than one dominant orientation at a given feature location.
-template<typename T>
-__global__ void calcOrientation(
-    float* x_out,
-    float* y_out,
-    unsigned* layer_out,
-    float* response_out,
-    float* size_out,
-    float* ori_out,
-    unsigned* counter,
-    const float* x_in,
-    const float* y_in,
-    const unsigned* layer_in,
-    const float* response_in,
-    const float* size_in,
-    const unsigned total_feat,
-    const CParam<T> gauss_octave,
-    const unsigned max_feat,
-    const unsigned octave,
-    const bool double_input)
-{
-    const int tid_x = threadIdx.x;
-    const int tid_y = threadIdx.y;
-    const int bsz_x = blockDim.x;
-    const int bsz_y = blockDim.y;
-
-    const unsigned f = blockIdx.y * bsz_y + tid_y;
-
-    const int n = ORI_HIST_BINS;
-
-    SharedMemory<float> shared;
-    float* shrdMem = shared.getPointer();
-    float* hist = shrdMem;
-    float* temphist = shrdMem + n*8;
-
-    // Initialize temporary histogram
-    for (int i = tid_x; i < ORI_HIST_BINS; i += bsz_x)
-        hist[tid_y*n + i] = 0.f;
-    __syncthreads();
-
-    float real_x, real_y, response, size;
-    unsigned layer;
-
-    if (f < total_feat) {
-        // Load keypoint information
-        real_x = x_in[f];
-        real_y = y_in[f];
-        layer = layer_in[f];
-        response = response_in[f];
-        size = size_in[f];
-
-        const int pt_x = (int)round(real_x / (1 << octave));
-        const int pt_y = (int)round(real_y / (1 << octave));
-
-        // Calculate auxiliary parameters
-        const float scl_octv = size*0.5f / (1 << octave);
-        const int radius = (int)round(ORI_RADIUS * scl_octv);
-        const float sigma = ORI_SIG_FCTR * scl_octv;
-        const int len = (radius*2+1);
-        const float exp_denom = 2.f * sigma * sigma;
-
-        const int dim0 = gauss_octave.dims[0];
-        const int dim1 = gauss_octave.dims[1];
-        const int imel = dim0 * dim1;
-
-        // Points img to correct Gaussian pyramid layer
-        const T* img_ptr = gauss_octave.ptr + layer * imel;
-
-        // Calculate orientation histogram
-        for (int l = tid_x; l < len*len; l += bsz_x) {
-            int i = l / len - radius;
-            int j = l % len - radius;
-
-            int y = pt_y + i;
-            int x = pt_x + j;
-            if (y < 1 || y >= dim0 - 1 ||
-                x < 1 || x >= dim1 - 1)
-                continue;
-
-            float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
-            float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
-            float mag = sqrt(dx*dx+dy*dy);
-            float ori = atan2(dy,dx);
-            float w = exp(-(i*i + j*j)/exp_denom);
-
-            int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
-            bin = bin < n ? bin : 0;
-
-            atomicAdd(&hist[tid_y*n+bin], w*mag);
-        }
-    }
-    __syncthreads();
-
-    for (int i = 0; i < SMOOTH_ORI_PASSES; i++) {
-        for (int j = tid_x; j < n; j += bsz_x) {
-            temphist[tid_y*n+j] = hist[tid_y*n+j];
-        }
-        __syncthreads();
-        for (int j = tid_x; j < n; j += bsz_x) {
-            float prev = (j == 0) ? temphist[tid_y*n+n-1] : temphist[tid_y*n+j-1];
-            float next = (j+1 == n) ? temphist[tid_y*n] : temphist[tid_y*n+j+1];
-            hist[tid_y*n+j] = 0.25f * prev + 0.5f * temphist[tid_y*n+j] + 0.25f * next;
-        }
-        __syncthreads();
-    }
-
-    for (int i = tid_x; i < n; i += bsz_x)
-        temphist[tid_y*n+i] = hist[tid_y*n+i];
-    __syncthreads();
-
-    if (tid_x < 16)
-        temphist[tid_y*n+tid_x] = fmax(hist[tid_y*n+tid_x], hist[tid_y*n+tid_x+16]);
-    __syncthreads();
-    if (tid_x < 8)
-        temphist[tid_y*n+tid_x] = fmax(temphist[tid_y*n+tid_x], temphist[tid_y*n+tid_x+8]);
-    __syncthreads();
-    if (tid_x < 4) {
-        temphist[tid_y*n+tid_x] = fmax(temphist[tid_y*n+tid_x], hist[tid_y*n+tid_x+32]);
-        temphist[tid_y*n+tid_x] = fmax(temphist[tid_y*n+tid_x], temphist[tid_y*n+tid_x+4]);
-    }
-    __syncthreads();
-    if (tid_x < 2)
-        temphist[tid_y*n+tid_x] = fmax(temphist[tid_y*n+tid_x], temphist[tid_y*n+tid_x+2]);
-    __syncthreads();
-    if (tid_x < 1)
-        temphist[tid_y*n+tid_x] = fmax(temphist[tid_y*n+tid_x], temphist[tid_y*n+tid_x+1]);
-    __syncthreads();
-    float omax = temphist[tid_y*n];
-
-    if (f < total_feat) {
-        float mag_thr = (float)(omax * ORI_PEAK_RATIO);
-        int l, r;
-        for (int j = tid_x; j < n; j+=bsz_x) {
-            l = (j == 0) ? n - 1 : j - 1;
-            r = (j + 1) % n;
-            if (hist[tid_y*n+j] > hist[tid_y*n+l] &&
-                hist[tid_y*n+j] > hist[tid_y*n+r] &&
-                hist[tid_y*n+j] >= mag_thr) {
-                int idx = atomicAdd(counter, 1);
-
-                if (idx < max_feat) {
-                    float bin = j + 0.5f * (hist[tid_y*n+l] - hist[tid_y*n+r]) /
-                                (hist[tid_y*n+l] - 2.0f*hist[tid_y*n+j] + hist[tid_y*n+r]);
-                    bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
-                    float ori = 360.f - ((360.f/n) * bin);
-
-                    float new_real_x = real_x;
-                    float new_real_y = real_y;
-                    float new_size = size;
-
-                    if (double_input) {
-                        float scale = 0.5f;
-                        new_real_x *= scale;
-                        new_real_y *= scale;
-                        new_size *= scale;
-                    }
-
-                    x_out[idx] = new_real_x;
-                    y_out[idx] = new_real_y;
-                    layer_out[idx] = layer;
-                    response_out[idx] = response;
-                    size_out[idx] = new_size;
-                    ori_out[idx] = ori;
-                }
-            }
-        }
-    }
-}
-
-// Computes feature descriptors for features in an array.  Based on Section 6
-// of Lowe's paper.
-template<typename T>
-__global__ void computeDescriptor(
-    float* desc_out,
-    const unsigned desc_len,
-    const unsigned histsz,
-    const float* x_in,
-    const float* y_in,
-    const unsigned* layer_in,
-    const float* response_in,
-    const float* size_in,
-    const float* ori_in,
-    const unsigned total_feat,
-    const CParam<T> gauss_octave,
-    const int d,
-    const int n,
-    const float scale,
-    const float sigma,
-    const int n_layers)
-{
-    const int tid_x = threadIdx.x;
-    const int tid_y = threadIdx.y;
-    const int bsz_x = blockDim.x;
-    const int bsz_y = blockDim.y;
-
-    const int f = blockIdx.y * bsz_y + tid_y;
-
-    SharedMemory<float> shared;
-    float* shrdMem = shared.getPointer();
-    float* desc = shrdMem;
-    float* accum = shrdMem + desc_len * histsz;
-
-    const int histlen = (d)*(d)*(n);
-
-    for (int i = tid_x; i < histlen*histsz; i += bsz_x)
-        desc[tid_y*histlen+i] = 0.f;
-    __syncthreads();
-
-    if (f < total_feat) {
-        const unsigned layer = layer_in[f];
-        float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
-        ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
-        //const float size = size_in[f];
-        const int fx = round(x_in[f] * scale);
-        const int fy = round(y_in[f] * scale);
-
-        const int dim0 = gauss_octave.dims[0];
-        const int dim1 = gauss_octave.dims[1];
-        const int imel = dim0 * dim1;
-
-        // Points img to correct Gaussian pyramid layer
-        const T* img_ptr = gauss_octave.ptr + layer * imel;
-
-        float cos_t = cosf(ori);
-        float sin_t = sinf(ori);
-        float bins_per_rad = n / (PI_VAL * 2.f);
-        float exp_denom = d * d * 0.5f;
-        float hist_width = DESCR_SCL_FCTR * sigma * powf(2.f, layer/n_layers);
-        int radius = hist_width * sqrtf(2.f) * (d + 1.f) * 0.5f + 0.5f;
-
-        int len = radius*2+1;
-        const int hist_off = (tid_x % histsz) * desc_len;
-
-        // Calculate orientation histogram
-        for (int l = tid_x; l < len*len; l += bsz_x) {
-            int i = l / len - radius;
-            int j = l % len - radius;
-
-            int y = fy + i;
-            int x = fx + j;
-
-            float x_rot = (j * cos_t - i * sin_t) / hist_width;
-            float y_rot = (j * sin_t + i * cos_t) / hist_width;
-            float xbin = x_rot + d/2 - 0.5f;
-            float ybin = y_rot + d/2 - 0.5f;
-
-            if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d &&
-                y > 0 && y < dim0 - 1 && x > 0 && x < dim1 - 1) {
-                float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
-                float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
-                float grad_mag = sqrtf(dx*dx + dy*dy);
-                float grad_ori = atan2f(dy, dx) - ori;
-                while (grad_ori < 0.0f)
-                    grad_ori += PI_VAL*2;
-                while (grad_ori >= PI_VAL*2)
-                    grad_ori -= PI_VAL*2;
-
-                float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
-                float obin = grad_ori * bins_per_rad;
-                float mag = grad_mag*w;
-
-                int x0 = floor(xbin);
-                int y0 = floor(ybin);
-                int o0 = floor(obin);
-                xbin -= x0;
-                ybin -= y0;
-                obin -= o0;
-
-                for (int yl = 0; yl <= 1; yl++) {
-                    int yb = y0 + yl;
-                    if (yb >= 0 && yb < d) {
-                        float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
-                        for (int xl = 0; xl <= 1; xl++) {
-                            int xb = x0 + xl;
-                            if (xb >= 0 && xb < d) {
-                                float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
-                                for (int ol = 0; ol <= 1; ol++) {
-                                    int ob = (o0 + ol) % n;
-                                    float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
-                                    atomicAdd(&desc[hist_off + tid_y*desc_len + (yb*d + xb)*n + ob], v_o);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-    __syncthreads();
-
-    // Combine histograms (reduces previous atomicAdd overhead)
-    for (int l = tid_x; l < desc_len*4; l += bsz_x)
-        desc[l] += desc[l+4*desc_len];
-    __syncthreads();
-    for (int l = tid_x; l < desc_len*2; l += bsz_x)
-        desc[l    ] += desc[l+2*desc_len];
-    __syncthreads();
-    for (int l = tid_x; l < desc_len; l += bsz_x)
-        desc[l] += desc[l+desc_len];
-    __syncthreads();
-
-    normalizeDesc(desc, accum, histlen);
-
-    for (int i = tid_x; i < d*d*n; i += bsz_x)
-        desc[tid_y*desc_len+i] = min(desc[tid_y*desc_len+i], DESC_MAG_THR);
-    __syncthreads();
-
-    normalizeDesc(desc, accum, histlen);
-
-    if (f < total_feat) {
-        // Calculate final descriptor values
-        for (int k = tid_x; k < d*d*n; k += bsz_x)
-            desc_out[f*desc_len+k] = round(min(255.f, desc[tid_y*desc_len+k] * INT_DESCR_FCTR));
-    }
-}
-
-#undef IPTR
-
-template<typename T, typename convAccT>
-Param<T> createInitialImage(
-    CParam<T> img,
-    const float init_sigma,
-    const bool double_input)
-{
-    Param<T> init_img, init_tmp;
-    init_img.dims[0] = init_tmp.dims[0] = (double_input) ? img.dims[0] * 2 : img.dims[0];
-    init_img.dims[1] = init_tmp.dims[1] = (double_input) ? img.dims[1] * 2 : img.dims[1];
-    init_img.strides[0] = init_tmp.strides[0] = 1;
-    init_img.strides[1] = init_tmp.strides[1] = init_img.dims[0];
-
-    for (int k = 2; k < 4; k++) {
-        init_img.dims[k] = 1;
-        init_img.strides[k] = init_img.dims[k-1] * init_img.strides[k-1];
-        init_tmp.dims[k] = 1;
-        init_tmp.strides[k] = init_tmp.dims[k-1] * init_tmp.strides[k-1];
-    }
-
-    dim_t init_img_el = init_img.strides[3] * init_img.dims[3];
-    init_img.ptr = memAlloc<T>(init_img_el);
-    init_tmp.ptr = memAlloc<T>(init_img_el);
-
-    float s = (double_input) ? sqrt(init_sigma * init_sigma - INIT_SIGMA * INIT_SIGMA * 4)
-                             : sqrt(init_sigma * init_sigma - INIT_SIGMA * INIT_SIGMA);
-
-    Param<convAccT> filter = gauss_filter<convAccT>(s);
-
-    if (double_input) {
-        resize<T, AF_INTERP_BILINEAR>(init_img, img);
-        convolve2<T, convAccT, 0, false>(init_tmp, init_img, filter);
-    }
-    else
-        convolve2<T, convAccT, 0, false>(init_tmp, img, filter);
-
-    convolve2<T, convAccT, 1, false>(init_img, CParam<T>(init_tmp), filter);
-
-    memFree(init_tmp.ptr);
-    memFree(filter.ptr);
-
-    return init_img;
-}
-
-template<typename T, typename convAccT>
-std::vector< Param<T> > buildGaussPyr(
-    Param<T> init_img,
-    const unsigned n_octaves,
-    const unsigned n_layers,
-    const float init_sigma)
-{
-    // Precompute Gaussian sigmas using the following formula:
-    // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
-    std::vector<float> sig_layers(n_layers + 3);
-    sig_layers[0] = init_sigma;
-    float k = std::pow(2.0f, 1.0f / n_layers);
-    for (unsigned i = 1; i < n_layers + 3; i++) {
-        float sig_prev = std::pow(k, i-1) * init_sigma;
-        float sig_total = sig_prev * k;
-        sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
-    }
-
-    // Gaussian Pyramid
-    std::vector<Param<T> > gauss_pyr(n_octaves);
-    std::vector<Param<T> > tmp_pyr(n_octaves * (n_layers+3));
-    for (unsigned o = 0; o < n_octaves; o++) {
-        gauss_pyr[o].dims[0] = (o == 0) ? init_img.dims[0] : gauss_pyr[o-1].dims[0] / 2;
-        gauss_pyr[o].dims[1] = (o == 0) ? init_img.dims[1] : gauss_pyr[o-1].dims[1] / 2;
-        gauss_pyr[o].dims[2] = n_layers+3;
-        gauss_pyr[o].dims[3] = 1;
-
-        gauss_pyr[o].strides[0] = 1;
-        gauss_pyr[o].strides[1] = gauss_pyr[o].dims[0] * gauss_pyr[o].strides[0];
-        gauss_pyr[o].strides[2] = gauss_pyr[o].dims[1] * gauss_pyr[o].strides[1];
-        gauss_pyr[o].strides[3] = gauss_pyr[o].dims[2] * gauss_pyr[o].strides[2];
-
-        const unsigned nel = gauss_pyr[o].dims[3] * gauss_pyr[o].strides[3];
-        gauss_pyr[o].ptr = memAlloc<T>(nel);
-
-        for (unsigned l = 0; l < n_layers+3; l++) {
-            unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1;
-            unsigned idx = o*(n_layers+3) + l;
-
-            if (o == 0 && l == 0) {
-                for (int k = 0; k < 4; k++) {
-                    tmp_pyr[idx].dims[k] = init_img.dims[k];
-                    tmp_pyr[idx].strides[k] = init_img.strides[k];
-                }
-                tmp_pyr[idx].ptr = init_img.ptr;
-            }
-            else if (l == 0) {
-                tmp_pyr[idx].dims[0] = tmp_pyr[src_idx].dims[0] / 2;
-                tmp_pyr[idx].dims[1] = tmp_pyr[src_idx].dims[1] / 2;
-                tmp_pyr[idx].strides[0] = 1;
-                tmp_pyr[idx].strides[1] = tmp_pyr[idx].dims[0];
-
-                for (int k = 2; k < 4; k++) {
-                    tmp_pyr[idx].dims[k] = 1;
-                    tmp_pyr[idx].strides[k] = tmp_pyr[idx].dims[k-1] * tmp_pyr[idx].strides[k-1];
-                }
-
-                dim_t lvl_el = tmp_pyr[idx].strides[3] * tmp_pyr[idx].dims[3];
-                tmp_pyr[idx].ptr = memAlloc<T>(lvl_el);
-
-                resize<T, AF_INTERP_BILINEAR>(tmp_pyr[idx], tmp_pyr[src_idx]);
-            }
-            else {
-                for (int k = 0; k < 4; k++) {
-                    tmp_pyr[idx].dims[k] = tmp_pyr[src_idx].dims[k];
-                    tmp_pyr[idx].strides[k] = tmp_pyr[src_idx].strides[k];
-                }
-                dim_t lvl_el = tmp_pyr[idx].strides[3] * tmp_pyr[idx].dims[3];
-                tmp_pyr[idx].ptr = memAlloc<T>(lvl_el);
-
-                Param<T> tmp;
-                for (int k = 0; k < 4; k++) {
-                    tmp.dims[k] = tmp_pyr[idx].dims[k];
-                    tmp.strides[k] = tmp_pyr[idx].strides[k];
-                }
-                tmp.ptr = memAlloc<T>(lvl_el);
-
-                Param<convAccT> filter = gauss_filter<convAccT>(sig_layers[l]);
-
-                convolve2<T, convAccT, 0, false>(tmp, tmp_pyr[src_idx], filter);
-                convolve2<T, convAccT, 1, false>(tmp_pyr[idx], CParam<T>(tmp), filter);
-
-                memFree(tmp.ptr);
-                memFree(filter.ptr);
-            }
-
-            const unsigned imel = tmp_pyr[idx].dims[3] * tmp_pyr[idx].strides[3];
-            const unsigned offset = imel * l;
-
-            //getQueue().enqueueCopyBuffer(*tmp_pyr[idx].data, *gauss_pyr[o].data, 0, offset*sizeof(T), imel * sizeof(T));
-            CUDA_CHECK(cudaMemcpy(gauss_pyr[o].ptr + offset, tmp_pyr[idx].ptr, imel * sizeof(T), cudaMemcpyDeviceToDevice));
-        }
-    }
-
-    for (unsigned o = 0; o < n_octaves; o++) {
-        for (unsigned l = 0; l < n_layers+3; l++) {
-            unsigned idx = o*(n_layers+3) + l;
-            memFree(tmp_pyr[idx].ptr);
-        }
-    }
-
-    return gauss_pyr;
-}
-
-template<typename T>
-std::vector< Param<T> > buildDoGPyr(
-    std::vector< Param<T> >& gauss_pyr,
-    const unsigned n_octaves,
-    const unsigned n_layers)
-{
-    // DoG Pyramid
-    std::vector< Param<T> > dog_pyr(n_octaves);
-    for (unsigned o = 0; o < n_octaves; o++) {
-        for (int k = 0; k < 4; k++) {
-            dog_pyr[o].dims[k] = (k == 2) ? gauss_pyr[o].dims[k]-1 : gauss_pyr[o].dims[k];
-            dog_pyr[o].strides[k] = (k == 0) ? 1 : dog_pyr[o].dims[k-1] * dog_pyr[o].strides[k-1];
-        }
-
-        dog_pyr[o].ptr = memAlloc<T>(dog_pyr[o].dims[3] * dog_pyr[o].strides[3]);
-
-        const unsigned nel = dog_pyr[o].dims[1] * dog_pyr[o].strides[1];
-        const unsigned dog_layers = n_layers+2;
-
-        dim3 threads(SIFT_THREADS);
-        dim3 blocks(divup(nel, threads.x));
-        CUDA_LAUNCH((sub<T>), blocks, threads,
-                    dog_pyr[o], gauss_pyr[o], nel, dog_layers);
-        POST_LAUNCH_CHECK();
-    }
-
-    return dog_pyr;
-}
-
-template <typename T>
-void update_permutation(thrust::device_ptr<T>& keys, thrust::device_vector<int>& permutation)
-{
-    // temporary storage for keys
-    thrust::device_vector<T> temp(permutation.size());
-
-    // permute the keys with the current reordering
-    THRUST_SELECT((thrust::gather), permutation.begin(), permutation.end(), keys, temp.begin());
-
-    // stable_sort the permuted keys and update the permutation
-    THRUST_SELECT((thrust::stable_sort_by_key), temp.begin(), temp.end(), permutation.begin());
-}
-
-template <typename T>
-void apply_permutation(thrust::device_ptr<T>& keys, thrust::device_vector<int>& permutation)
-{
-    // copy keys to temporary vector
-    thrust::device_vector<T> temp(keys, keys+permutation.size());
-
-    // permute the keys
-    THRUST_SELECT((thrust::gather), permutation.begin(), permutation.end(), temp.begin(), keys);
-}
-
-template<typename T, typename convAccT>
-void sift(unsigned* out_feat,
-          unsigned* out_dlen,
-          float** d_x,
-          float** d_y,
-          float** d_score,
-          float** d_ori,
-          float** d_size,
-          float** d_desc,
-          CParam<T> img,
-          const unsigned n_layers,
-          const float contrast_thr,
-          const float edge_thr,
-          const float init_sigma,
-          const bool double_input,
-          const float img_scale,
-          const float feature_ratio)
-{
-    const unsigned min_dim = (double_input) ? min(img.dims[0]*2, img.dims[1]*2)
-                                            : min(img.dims[0], img.dims[1]);
-    const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
-
-    Param<T> init_img = createInitialImage<T, convAccT>(img, init_sigma, double_input);
-
-    std::vector< Param<T> > gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
-
-    std::vector< Param<T> > dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers);
-
-    std::vector<float*> d_x_pyr(n_octaves, NULL);
-    std::vector<float*> d_y_pyr(n_octaves, NULL);
-    std::vector<float*> d_response_pyr(n_octaves, NULL);
-    std::vector<float*> d_size_pyr(n_octaves, NULL);
-    std::vector<float*> d_ori_pyr(n_octaves, NULL);
-    std::vector<float*> d_desc_pyr(n_octaves, NULL);
-    std::vector<unsigned> feat_pyr(n_octaves, 0);
-    unsigned total_feat = 0;
-
-    const unsigned d = DESCR_WIDTH;
-    const unsigned n = DESCR_HIST_BINS;
-    const unsigned desc_len = d*d*n;
-
-    unsigned* d_count = memAlloc<unsigned>(1);
-    for (unsigned i = 0; i < n_octaves; i++) {
-        if (dog_pyr[i].dims[0]-2*IMG_BORDER < 1 ||
-            dog_pyr[i].dims[1]-2*IMG_BORDER < 1)
-            continue;
-
-        const unsigned imel = dog_pyr[i].dims[0] * dog_pyr[i].dims[1];
-        const unsigned max_feat = ceil(imel * feature_ratio);
-
-        CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(unsigned),
-                                   cuda::getStream(cuda::getActiveDeviceId())));
-
-        float* d_extrema_x = memAlloc<float>(max_feat);
-        float* d_extrema_y = memAlloc<float>(max_feat);
-        unsigned* d_extrema_layer = memAlloc<unsigned>(max_feat);
-
-        int dim0 = dog_pyr[i].dims[0];
-        int dim1 = dog_pyr[i].dims[1];
-
-        dim3 threads(SIFT_THREADS_X, SIFT_THREADS_Y);
-        dim3 blocks(divup(dim0-2*IMG_BORDER, threads.x), divup(dim1-2*IMG_BORDER, threads.y));
-
-        float extrema_thr = 0.5f * contrast_thr / n_layers;
-        const size_t extrema_shared_size = (threads.x+2) * (threads.y+2) * 3 * sizeof(float);
-        CUDA_LAUNCH_SMEM((detectExtrema<T>), blocks, threads, extrema_shared_size,
-                         d_extrema_x, d_extrema_y, d_extrema_layer, d_count,
-                         CParam<T>(dog_pyr[i]), max_feat, extrema_thr);
-        POST_LAUNCH_CHECK();
-
-        unsigned extrema_feat = 0;
-        CUDA_CHECK(cudaMemcpy(&extrema_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
-        extrema_feat = min(extrema_feat, max_feat);
-
-        if (extrema_feat == 0) {
-            memFree(d_extrema_x);
-            memFree(d_extrema_y);
-            memFree(d_extrema_layer);
-
-            continue;
-        }
-
-        CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(unsigned),
-                                   cuda::getStream(cuda::getActiveDeviceId())));
-
-        unsigned interp_feat = 0;
-
-        float* d_interp_x = memAlloc<float>(extrema_feat);
-        float* d_interp_y = memAlloc<float>(extrema_feat);
-        unsigned* d_interp_layer = memAlloc<unsigned>(extrema_feat);
-        float* d_interp_response = memAlloc<float>(extrema_feat);
-        float* d_interp_size = memAlloc<float>(extrema_feat);
-
-        threads = dim3(SIFT_THREADS, 1);
-        blocks = dim3(divup(extrema_feat, threads.x), 1);
-
-        CUDA_LAUNCH((interpolateExtrema<T>), blocks, threads,
-                    d_interp_x, d_interp_y, d_interp_layer,
-                    d_interp_response, d_interp_size, d_count,
-                    d_extrema_x, d_extrema_y, d_extrema_layer, extrema_feat,
-                    dog_pyr[i], max_feat, i, n_layers,
-                    contrast_thr, edge_thr, init_sigma, img_scale);
-        POST_LAUNCH_CHECK();
-
-        CUDA_CHECK(cudaMemcpy(&interp_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
-        interp_feat = min(interp_feat, max_feat);
-
-        CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(unsigned),
-                                   cuda::getStream(cuda::getActiveDeviceId())));
-
-        if (interp_feat == 0) {
-            memFree(d_interp_x);
-            memFree(d_interp_y);
-            memFree(d_interp_layer);
-            memFree(d_interp_response);
-            memFree(d_interp_size);
-
-            continue;
-        }
-
-        thrust::device_ptr<float> interp_x_ptr = thrust::device_pointer_cast(d_interp_x);
-        thrust::device_ptr<float> interp_y_ptr = thrust::device_pointer_cast(d_interp_y);
-        thrust::device_ptr<unsigned> interp_layer_ptr = thrust::device_pointer_cast(d_interp_layer);
-        thrust::device_ptr<float> interp_response_ptr = thrust::device_pointer_cast(d_interp_response);
-        thrust::device_ptr<float> interp_size_ptr = thrust::device_pointer_cast(d_interp_size);
-
-        thrust::device_vector<int> permutation(interp_feat);
-        thrust::sequence(permutation.begin(), permutation.end());
-
-        update_permutation<float>(interp_size_ptr, permutation);
-        update_permutation<float>(interp_response_ptr, permutation);
-        update_permutation<unsigned>(interp_layer_ptr, permutation);
-        update_permutation<float>(interp_y_ptr, permutation);
-        update_permutation<float>(interp_x_ptr, permutation);
-
-        apply_permutation<float>(interp_size_ptr, permutation);
-        apply_permutation<float>(interp_response_ptr, permutation);
-        apply_permutation<unsigned>(interp_layer_ptr, permutation);
-        apply_permutation<float>(interp_y_ptr, permutation);
-        apply_permutation<float>(interp_x_ptr, permutation);
-
-        memFree(d_extrema_x);
-        memFree(d_extrema_y);
-        memFree(d_extrema_layer);
-
-        float* d_nodup_x = memAlloc<float>(interp_feat);
-        float* d_nodup_y = memAlloc<float>(interp_feat);
-        unsigned* d_nodup_layer = memAlloc<unsigned>(interp_feat);
-        float* d_nodup_response = memAlloc<float>(interp_feat);
-        float* d_nodup_size = memAlloc<float>(interp_feat);
-
-        threads = dim3(SIFT_THREADS, 1);
-        blocks = dim3(divup(interp_feat, threads.x), 1);
-
-        CUDA_LAUNCH((removeDuplicates), blocks, threads,
-                    d_nodup_x, d_nodup_y, d_nodup_layer,
-                    d_nodup_response, d_nodup_size, d_count,
-                    d_interp_x, d_interp_y, d_interp_layer,
-                    d_interp_response, d_interp_size, interp_feat);
-        POST_LAUNCH_CHECK();
-
-        memFree(d_interp_x);
-        memFree(d_interp_y);
-        memFree(d_interp_layer);
-        memFree(d_interp_response);
-        memFree(d_interp_size);
-
-        unsigned nodup_feat = 0;
-        CUDA_CHECK(cudaMemcpy(&nodup_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
-        CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(unsigned),
-                                   cuda::getStream(cuda::getActiveDeviceId())));
-
-        const unsigned max_oriented_feat = nodup_feat * 3;
-
-        float* d_oriented_x = memAlloc<float>(max_oriented_feat);
-        float* d_oriented_y = memAlloc<float>(max_oriented_feat);
-        unsigned* d_oriented_layer = memAlloc<unsigned>(max_oriented_feat);
-        float* d_oriented_response = memAlloc<float>(max_oriented_feat);
-        float* d_oriented_size = memAlloc<float>(max_oriented_feat);
-        float* d_oriented_ori = memAlloc<float>(max_oriented_feat);
-
-        threads = dim3(SIFT_THREADS_X, SIFT_THREADS_Y);
-        blocks = dim3(1, divup(nodup_feat, threads.y));
-
-        const size_t ori_shared_size = ORI_HIST_BINS * threads.y * 2 * sizeof(float);
-        CUDA_LAUNCH_SMEM((calcOrientation<T>), blocks, threads, ori_shared_size,
-                         d_oriented_x, d_oriented_y, d_oriented_layer,
-                         d_oriented_response, d_oriented_size, d_oriented_ori, d_count,
-                         d_nodup_x, d_nodup_y, d_nodup_layer,
-                         d_nodup_response, d_nodup_size, nodup_feat,
-                         gauss_pyr[i], max_oriented_feat, i, double_input);
-        POST_LAUNCH_CHECK();
-
-        memFree(d_nodup_x);
-        memFree(d_nodup_y);
-        memFree(d_nodup_layer);
-        memFree(d_nodup_response);
-        memFree(d_nodup_size);
-
-        unsigned oriented_feat = 0;
-        CUDA_CHECK(cudaMemcpy(&oriented_feat, d_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
-        oriented_feat = min(oriented_feat, max_oriented_feat);
-
-        if (oriented_feat == 0) {
-            memFree(d_oriented_x);
-            memFree(d_oriented_y);
-            memFree(d_oriented_layer);
-            memFree(d_oriented_response);
-            memFree(d_oriented_size);
-            memFree(d_oriented_ori);
-
-            continue;
-        }
-
-        float* d_desc = memAlloc<float>(oriented_feat * desc_len);
-
-        float scale = 1.f/(1 << i);
-        if (double_input) scale *= 2.f;
-
-        threads = dim3(SIFT_THREADS, 1);
-        blocks  = dim3(1, divup(oriented_feat, threads.y));
-
-        const unsigned histsz = 8;
-        const size_t shared_size = desc_len * (histsz+1) * sizeof(float);
-
-        CUDA_LAUNCH_SMEM((computeDescriptor<T>), blocks, threads, shared_size,
-                         d_desc, desc_len, histsz,
-                         d_oriented_x, d_oriented_y, d_oriented_layer,
-                         d_oriented_response, d_oriented_size, d_oriented_ori,
-                         oriented_feat, gauss_pyr[i], d, n, scale, init_sigma, n_layers);
-        POST_LAUNCH_CHECK();
-
-        total_feat += oriented_feat;
-        feat_pyr[i] = oriented_feat;
-
-        if (oriented_feat > 0) {
-            d_x_pyr[i] = d_oriented_x;
-            d_y_pyr[i] = d_oriented_y;
-            d_response_pyr[i] = d_oriented_response;
-            d_ori_pyr[i] = d_oriented_ori;
-            d_size_pyr[i] = d_oriented_size;
-            d_desc_pyr[i] = d_desc;
-        }
-    }
-
-    memFree(d_count);
-
-    for (size_t i = 0; i < gauss_pyr.size(); i++)
-        memFree(gauss_pyr[i].ptr);
-    for (size_t i = 0; i < dog_pyr.size(); i++)
-        memFree(dog_pyr[i].ptr);
-
-    // Allocate output memory
-    *d_x     = memAlloc<float>(total_feat);
-    *d_y     = memAlloc<float>(total_feat);
-    *d_score = memAlloc<float>(total_feat);
-    *d_ori   = memAlloc<float>(total_feat);
-    *d_size  = memAlloc<float>(total_feat);
-    *d_desc  = memAlloc<float>(total_feat * desc_len);
-
-    unsigned offset = 0;
-    for (unsigned i = 0; i < n_octaves; i++) {
-        if (feat_pyr[i] == 0)
-            continue;
-
-        CUDA_CHECK(cudaMemcpy(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*d_score+offset, d_response_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaMemcpy(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
-
-        CUDA_CHECK(cudaMemcpy(*d_desc+(offset*desc_len), d_desc_pyr[i],
-                             feat_pyr[i] * desc_len * sizeof(float), cudaMemcpyDeviceToDevice));
-
-        memFree(d_x_pyr[i]);
-        memFree(d_y_pyr[i]);
-        memFree(d_response_pyr[i]);
-        memFree(d_ori_pyr[i]);
-        memFree(d_size_pyr[i]);
-        memFree(d_desc_pyr[i]);
-
-        offset += feat_pyr[i];
-    }
-
-    // Sets number of output features
-    *out_feat = total_feat;
-    *out_dlen = desc_len;
-}
-
-} // namespace kernel
-
-} // namespace cuda
diff --git a/src/backend/opencl/kernel/sift_nonfree.cl b/src/backend/opencl/kernel/sift_nonfree.cl
deleted file mode 100644
index ef63cab..0000000
--- a/src/backend/opencl/kernel/sift_nonfree.cl
+++ /dev/null
@@ -1,806 +0,0 @@
-/*******************************************************
- * Copyright (c) 2015, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-// The source code contained in this file is based on the original code by
-// Rob Hess. Please note that SIFT is an algorithm patented and protected
-// by US law, before using this code or any binary forms generated from it,
-// verify that you have permission to do so. The original license by Rob Hess
-// can be read below:
-//
-// Copyright (c) 2006-2012, Rob Hess <rob at iqengines.com>
-// All rights reserved.
-//
-// The following patent has been issued for methods embodied in this
-// software: "Method and apparatus for identifying scale invariant features
-// in an image and use of same for locating an object in an image," David
-// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
-// filed March 8, 1999. Asignee: The University of British Columbia. For
-// further details, contact David Lowe (lowe at cs.ubc.ca) or the
-// University-Industry Liaison Office of the University of British
-// Columbia.
-//
-// Note that restrictions imposed by this patent (and possibly others)
-// exist independently of and may be in conflict with the freedoms granted
-// in this license, which refers to copyright of the program, not patents
-// for any methods that it implements.  Both copyright and patent law must
-// be obeyed to legally use and redistribute this program and it is not the
-// purpose of this license to induce you to infringe any patents or other
-// property right claims or to contest validity of any such claims.  If you
-// redistribute or use the program, then this license merely protects you
-// from committing copyright infringement.  It does not protect you from
-// committing patent infringement.  So, before you do anything with this
-// program, make sure that you have permission to do so not merely in terms
-// of copyright, but also in terms of patent law.
-//
-// Please note that this license is not to be understood as a guarantee
-// either.  If you use the program according to this license, but in
-// conflict with patent law, it does not mean that the licensor will refund
-// you for any losses that you incur if you are sued for your patent
-// infringement.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//     * Redistributions of source code must retain the above copyright and
-//       patent notices, this list of conditions and the following
-//       disclaimer.
-//     * Redistributions in binary form must reproduce the above copyright
-//       notice, this list of conditions and the following disclaimer in
-//       the documentation and/or other materials provided with the
-//       distribution.
-//     * Neither the name of Oregon State University nor the names of its
-//       contributors may be used to endorse or promote products derived
-//       from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// width of border in which to ignore keypoints
-#define IMG_BORDER 5
-
-// maximum steps of keypoint interpolation before failure
-#define MAX_INTERP_STEPS 5
-
-// default number of bins in histogram for orientation assignment
-#define ORI_HIST_BINS 36
-
-// determines gaussian sigma for orientation assignment
-#define ORI_SIG_FCTR 1.5f
-
-// determines the radius of the region used in orientation assignment
-#define ORI_RADIUS (3 * ORI_SIG_FCTR)
-
-// number of passes of orientation histogram smoothing
-#define SMOOTH_ORI_PASSES 2
-
-// orientation magnitude relative to max that results in new feature
-#define ORI_PEAK_RATIO 0.8f
-
-// determines the size of a single descriptor orientation histogram
-#define DESCR_SCL_FCTR 3.f
-
-// threshold on magnitude of elements of descriptor vector
-#define DESCR_MAG_THR 0.2f
-
-// factor used to convert floating-point descriptor to unsigned char
-#define INT_DESCR_FCTR 512.f
-
-#define PI_VAL 3.14159265358979323846f
-
-void gaussianElimination(float* A, float* b, float* x, const int n)
-{
-    // forward elimination
-    for (int i = 0; i < n-1; i++) {
-        for (int j = i+1; j < n; j++) {
-            float s = A[j*n+i] / A[i*n+i];
-
-            //for (int k = i+1; k < n; k++)
-            for (int k = i; k < n; k++)
-                A[j*n+k] -= s * A[i*n+k];
-
-            b[j] -= s * b[i];
-        }
-    }
-
-    for (int i = 0; i < n; i++)
-        x[i] = 0;
-
-    // backward substitution
-    float sum = 0;
-    for (int i = n-1; i >= 0; i--) {
-        sum = b[i];
-        for (int j = i+1; j < n; j++)
-            sum -= A[i*n+j] * x[j];
-        x[i] = sum / A[i*n+i];
-    }
-}
-
-inline void fatomic_add(volatile __local float *source, const float operand) {
-    union {
-        unsigned int intVal;
-        float floatVal;
-    } newVal;
-    union {
-        unsigned int intVal;
-        float floatVal;
-    } prevVal;
-    do {
-        prevVal.floatVal = *source;
-        newVal.floatVal = prevVal.floatVal + operand;
-    } while (atomic_cmpxchg((volatile __local unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
-}
-
-inline void normalizeDesc(
-    __local float* desc,
-    __local float* accum,
-    const int histlen,
-    int lid_x,
-    int lid_y,
-    int lsz_x)
-{
-    for (int i = lid_x; i < histlen; i += lsz_x)
-        accum[i] = desc[lid_y*histlen+i]*desc[lid_y*histlen+i];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float sum = 0.0f;
-    for (int i = 0; i < histlen; i++)
-        sum += desc[lid_y*histlen+i]*desc[lid_y*histlen+i];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lid_x < 64)
-        accum[lid_x] += accum[lid_x+64];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (lid_x < 32)
-        accum[lid_x] += accum[lid_x+32];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (lid_x < 16)
-        accum[lid_x] += accum[lid_x+16];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (lid_x < 8)
-        accum[lid_x] += accum[lid_x+8];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (lid_x < 4)
-        accum[lid_x] += accum[lid_x+4];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (lid_x < 2)
-        accum[lid_x] += accum[lid_x+2];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (lid_x < 1)
-        accum[lid_x] += accum[lid_x+1];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float len_sq = accum[0];
-    float len_inv = 1.0f / sqrt(len_sq);
-
-    for (int i = lid_x; i < histlen; i += lsz_x) {
-        desc[lid_y*histlen+i] *= len_inv;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-__kernel void sub(
-    __global T* out,
-    __global const T* in,
-    unsigned nel,
-    unsigned n_layers)
-{
-    unsigned i = get_global_id(0);
-
-    for (unsigned l = 0; l < n_layers; l++)
-        out[l*nel + i] = in[l*nel + i] - in[(l+1)*nel + i];
-}
-
-#define LCPTR(Y, X) (l_center[(Y) * l_i + (X)])
-#define LPPTR(Y, X) (l_prev[(Y) * l_i + (X)])
-#define LNPTR(Y, X) (l_next[(Y) * l_i + (X)])
-
-// Determines whether a pixel is a scale-space extremum by comparing it to its
-// 3x3x3 pixel neighborhood.
-__kernel void detectExtrema(
-    __global float* x_out,
-    __global float* y_out,
-    __global unsigned* layer_out,
-    __global unsigned* counter,
-    __global const T* dog,
-    KParam iDoG,
-    const unsigned max_feat,
-    const float threshold,
-    __local float* l_mem)
-{
-    const int dim0 = iDoG.dims[0];
-    const int dim1 = iDoG.dims[1];
-    const int imel = iDoG.dims[0]*iDoG.dims[1];
-
-    const int lid_i = get_local_id(0);
-    const int lid_j = get_local_id(1);
-    const int lsz_i = get_local_size(0);
-    const int lsz_j = get_local_size(1);
-    const int i = get_group_id(0) * lsz_i + lid_i+IMG_BORDER;
-    const int j = get_group_id(1) * lsz_j + lid_j+IMG_BORDER;
-
-    // One pixel border for each side
-    const int l_i = lsz_i+2;
-    const int l_j = lsz_j+2;
-
-    __local float* l_prev   = l_mem;
-    __local float* l_center = l_mem + l_i * l_j;
-    __local float* l_next   = l_mem + l_i * l_j * 2;
-
-    const int x = lid_i+1;
-    const int y = lid_j+1;
-
-    for (int l = 1; l < iDoG.dims[2]-1; l++) {
-        const int l_i_half = l_i/2;
-        const int l_j_half = l_j/2;
-        if (lid_i < l_i_half && lid_j < l_j_half && i < dim0-IMG_BORDER+1 && j < dim1-IMG_BORDER+1) {
-                l_next  [lid_j*l_i + lid_i] = (float)dog[(l+1)*imel+(j-1)*dim0+i-1];
-                l_center[lid_j*l_i + lid_i] = (float)dog[(l  )*imel+(j-1)*dim0+i-1];
-                l_prev  [lid_j*l_i + lid_i] = (float)dog[(l-1)*imel+(j-1)*dim0+i-1];
-
-                l_next  [lid_j*l_i + lid_i+l_i_half] = (float)dog[(l+1)*imel+(j-1)*dim0+i-1+l_i_half];
-                l_center[lid_j*l_i + lid_i+l_i_half] = (float)dog[(l  )*imel+(j-1)*dim0+i-1+l_i_half];
-                l_prev  [lid_j*l_i + lid_i+l_i_half] = (float)dog[(l-1)*imel+(j-1)*dim0+i-1+l_i_half];
-
-                l_next  [(lid_j+l_j_half)*l_i + lid_i] = (float)dog[(l+1)*imel+(j-1+l_j_half)*dim0+i-1];
-                l_center[(lid_j+l_j_half)*l_i + lid_i] = (float)dog[(l  )*imel+(j-1+l_j_half)*dim0+i-1];
-                l_prev  [(lid_j+l_j_half)*l_i + lid_i] = (float)dog[(l-1)*imel+(j-1+l_j_half)*dim0+i-1];
-
-                l_next  [(lid_j+l_j_half)*l_i + lid_i+l_i_half] = (float)dog[(l+1)*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
-                l_center[(lid_j+l_j_half)*l_i + lid_i+l_i_half] = (float)dog[(l  )*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
-                l_prev  [(lid_j+l_j_half)*l_i + lid_i+l_i_half] = (float)dog[(l-1)*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if (i < dim0-IMG_BORDER && j < dim1-IMG_BORDER) {
-            const int l_i_half = l_i/2;
-            float p = l_center[y*l_i + x];
-
-            if (fabs((float)p) > threshold &&
-                ((p > 0 && p > LCPTR(y-1, x-1) && p > LCPTR(y-1, x) &&
-                  p > LCPTR(y-1, x+1) && p > LCPTR(y, x-1) && p > LCPTR(y,   x+1)  &&
-                  p > LCPTR(y+1, x-1) && p > LCPTR(y+1, x) && p > LCPTR(y+1, x+1)  &&
-                  p > LPPTR(y-1, x-1) && p > LPPTR(y-1, x) && p > LPPTR(y-1, x+1)  &&
-                  p > LPPTR(y,   x-1) && p > LPPTR(y  , x) && p > LPPTR(y,   x+1)  &&
-                  p > LPPTR(y+1, x-1) && p > LPPTR(y+1, x) && p > LPPTR(y+1, x+1)  &&
-                  p > LNPTR(y-1, x-1) && p > LNPTR(y-1, x) && p > LNPTR(y-1, x+1)  &&
-                  p > LNPTR(y,   x-1) && p > LNPTR(y  , x) && p > LNPTR(y,   x+1)  &&
-                  p > LNPTR(y+1, x-1) && p > LNPTR(y+1, x) && p > LNPTR(y+1, x+1)) ||
-                 (p < 0 && p < LCPTR(y-1, x-1) && p < LCPTR(y-1, x) &&
-                  p < LCPTR(y-1, x+1) && p < LCPTR(y, x-1) && p < LCPTR(y,   x+1)  &&
-                  p < LCPTR(y+1, x-1) && p < LCPTR(y+1, x) && p < LCPTR(y+1, x+1)  &&
-                  p < LPPTR(y-1, x-1) && p < LPPTR(y-1, x) && p < LPPTR(y-1, x+1)  &&
-                  p < LPPTR(y,   x-1) && p < LPPTR(y  , x) && p < LPPTR(y,   x+1)  &&
-                  p < LPPTR(y+1, x-1) && p < LPPTR(y+1, x) && p < LPPTR(y+1, x+1)  &&
-                  p < LNPTR(y-1, x-1) && p < LNPTR(y-1, x) && p < LNPTR(y-1, x+1)  &&
-                  p < LNPTR(y,   x-1) && p < LNPTR(y  , x) && p < LNPTR(y,   x+1)  &&
-                  p < LNPTR(y+1, x-1) && p < LNPTR(y+1, x) && p < LNPTR(y+1, x+1)))) {
-
-                unsigned idx = atomic_inc(counter);
-                if (idx < max_feat)
-                {
-                    x_out[idx] = (float)j;
-                    y_out[idx] = (float)i;
-                    layer_out[idx] = l;
-                }
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-}
-
-#undef LCPTR
-#undef LPPTR
-#undef LNPTR
-#define CPTR(Y, X) (center[(Y) * dim0 + (X)])
-#define PPTR(Y, X) (prev[(Y) * dim0 + (X)])
-#define NPTR(Y, X) (next[(Y) * dim0 + (X)])
-
-// Interpolates a scale-space extremum's location and scale to subpixel
-// accuracy to form an image feature. Rejects features with low contrast.
-// Based on Section 4 of Lowe's paper.
-__kernel void interpolateExtrema(
-    __global float* x_out,
-    __global float* y_out,
-    __global unsigned* layer_out,
-    __global float* response_out,
-    __global float* size_out,
-    __global unsigned* counter,
-    __global const float* x_in,
-    __global const float* y_in,
-    __global const unsigned* layer_in,
-    const unsigned extrema_feat,
-    __global const T* dog_octave,
-    KParam iDoG,
-    const unsigned max_feat,
-    const unsigned octave,
-    const unsigned n_layers,
-    const float contrast_thr,
-    const float edge_thr,
-    const float sigma,
-    const float img_scale)
-{
-    const unsigned f = get_global_id(0);
-
-    if (f < extrema_feat)
-    {
-        const float first_deriv_scale = img_scale*0.5f;
-        const float second_deriv_scale = img_scale;
-        const float cross_deriv_scale = img_scale*0.25f;
-
-        float xl = 0, xy = 0, xx = 0, contr = 0;
-        int i = 0;
-
-        unsigned x = x_in[f];
-        unsigned y = y_in[f];
-        unsigned layer = layer_in[f];
-
-        const int dim0 = iDoG.dims[0];
-        const int dim1 = iDoG.dims[1];
-        const int imel = dim0 * dim1;
-
-        __global const T* prev   = dog_octave + (int)((layer-1)*imel);
-        __global const T* center = dog_octave + (int)((layer  )*imel);
-        __global const T* next   = dog_octave + (int)((layer+1)*imel);
-
-        for(i = 0; i < MAX_INTERP_STEPS; i++) {
-            float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
-                           (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
-                           (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
-
-            float d2  = CPTR(x, y) * 2.f;
-            float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
-            float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
-            float dss = (NPTR(x, y  ) + PPTR(x, y  ) - d2) * second_deriv_scale;
-            float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
-                         CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
-            float dxs = (NPTR(x+1, y) - NPTR(x-1, y) -
-                         PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale;
-            float dys = (NPTR(x, y+1) - NPTR(x, y-1) -
-                         PPTR(x, y+1) + PPTR(x, y-1)) * cross_deriv_scale;
-
-            float H[9] = {dxx, dxy, dxs,
-                          dxy, dyy, dys,
-                          dxs, dys, dss};
-
-            float X[3];
-            gaussianElimination(H, dD, X, 3);
-
-            xl = -X[2];
-            xy = -X[1];
-            xx = -X[0];
-
-            if(fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f)
-                break;
-
-            x += round(xx);
-            y += round(xy);
-            layer += round(xl);
-
-            if(layer < 1 || layer > n_layers ||
-               x < IMG_BORDER || x >= dim1 - IMG_BORDER ||
-               y < IMG_BORDER || y >= dim0 - IMG_BORDER)
-                return;
-        }
-
-        // ensure convergence of interpolation
-        if (i >= MAX_INTERP_STEPS)
-            return;
-
-        float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
-                       (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
-                       (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
-        float X[3] = {xx, xy, xl};
-
-        float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
-
-        contr = center[x*dim0+y]*img_scale + P * 0.5f;
-        if(fabs(contr) < (contrast_thr / n_layers))
-            return;
-
-        // principal curvatures are computed using the trace and det of Hessian
-        float d2  = CPTR(x, y) * 2.f;
-        float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
-        float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
-        float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
-                     CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
-
-        float tr = dxx + dyy;
-        float det = dxx * dyy - dxy * dxy;
-
-        // add FLT_EPSILON for double-precision compatibility
-        if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
-            return;
-
-        unsigned ridx = atomic_inc(counter);
-
-        if (ridx < max_feat)
-        {
-            x_out[ridx] = (x + xx) * (1 << octave);
-            y_out[ridx] = (y + xy) * (1 << octave);
-            layer_out[ridx] = layer;
-            response_out[ridx] = fabs(contr);
-            size_out[ridx] = sigma*pow(2.f, octave + (layer + xl) / n_layers);
-        }
-    }
-}
-
-#undef CPTR
-#undef PPTR
-#undef NPTR
-
-// Remove duplicate keypoints
-__kernel void removeDuplicates(
-    __global float* x_out,
-    __global float* y_out,
-    __global unsigned* layer_out,
-    __global float* response_out,
-    __global float* size_out,
-    __global unsigned* counter,
-    __global const float* x_in,
-    __global const float* y_in,
-    __global const unsigned* layer_in,
-    __global const float* response_in,
-    __global const float* size_in,
-    const unsigned total_feat)
-{
-    const unsigned f = get_global_id(0);
-
-    if (f < total_feat) {
-        const float prec_fctr = 1e4f;
-
-        bool cond = (f < total_feat-1)
-                    ? !(round(x_in[f]*prec_fctr) == round(x_in[f+1]*prec_fctr) &&
-                        round(y_in[f]*prec_fctr) == round(y_in[f+1]*prec_fctr) &&
-                        layer_in[f] == layer_in[f+1] &&
-                        round(response_in[f]*prec_fctr) == round(response_in[f+1]*prec_fctr) &&
-                        round(size_in[f]*prec_fctr) == round(size_in[f+1]*prec_fctr))
-                    : true;
-
-        if (cond) {
-            unsigned idx = atomic_inc(counter);
-
-            x_out[idx] = x_in[f];
-            y_out[idx] = y_in[f];
-            layer_out[idx] = layer_in[f];
-            response_out[idx] = response_in[f];
-            size_out[idx] = size_in[f];
-        }
-    }
-
-}
-
-#define IPTR(Y, X) (img[(Y) * dim0 + X])
-
-// Computes a canonical orientation for each image feature in an array.  Based
-// on Section 5 of Lowe's paper.  This function adds features to the array when
-// there is more than one dominant orientation at a given feature location.
-__kernel void calcOrientation(
-    __global float* x_out,
-    __global float* y_out,
-    __global unsigned* layer_out,
-    __global float* response_out,
-    __global float* size_out,
-    __global float* ori_out,
-    __global unsigned* counter,
-    __global const float* x_in,
-    __global const float* y_in,
-    __global const unsigned* layer_in,
-    __global const float* response_in,
-    __global const float* size_in,
-    const unsigned total_feat,
-    __global const T* gauss_octave,
-    KParam iGauss,
-    const unsigned max_feat,
-    const unsigned octave,
-    const int double_input,
-    __local float* l_mem)
-{
-    const int lid_x = get_local_id(0);
-    const int lid_y = get_local_id(1);
-    const int lsz_x = get_local_size(0);
-
-    const unsigned f = get_global_id(1);
-
-    const int n = ORI_HIST_BINS;
-
-    __local float* hist = l_mem;
-    __local float* temphist = l_mem + n*8;
-
-    // Initialize temporary histogram
-    for (int i = lid_x; i < n; i += lsz_x) {
-        hist[lid_y*n + i] = 0.f;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float real_x, real_y, response, size;
-    unsigned layer;
-
-    if (f < total_feat) {
-        // Load keypoint information
-        real_x = x_in[f];
-        real_y = y_in[f];
-        layer = layer_in[f];
-        response = response_in[f];
-        size = size_in[f];
-
-        const int pt_x = (int)round(real_x / (1 << octave));
-        const int pt_y = (int)round(real_y / (1 << octave));
-
-        // Calculate auxiliary parameters
-        const float scl_octv = size*0.5f / (1 << octave);
-        const int radius = (int)round(ORI_RADIUS * scl_octv);
-        const float sigma = ORI_SIG_FCTR * scl_octv;
-        const int len = (radius*2+1);
-        const float exp_denom = 2.f * sigma * sigma;
-
-        const int dim0 = iGauss.dims[0];
-        const int dim1 = iGauss.dims[1];
-
-        // Calculate layer offset
-        const int layer_offset = layer * dim0 * dim1;
-        __global const T* img = gauss_octave + layer_offset;
-
-        // Calculate orientation histogram
-        for (int l = lid_x; l < len*len; l += lsz_x) {
-            int i = l / len - radius;
-            int j = l % len - radius;
-
-            int y = pt_y + i;
-            int x = pt_x + j;
-            if (y < 1 || y >= dim0 - 1 ||
-                x < 1 || x >= dim1 - 1)
-                continue;
-
-            float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
-            float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
-            float mag = sqrt(dx*dx+dy*dy);
-            float ori = atan2(dy,dx);
-            float w = exp(-(i*i + j*j)/exp_denom);
-
-            int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
-            bin = bin < n ? bin : 0;
-            bin = (bin < 0) ? 0 : (bin >= n) ? n-1 : bin;
-
-            fatomic_add(&hist[lid_y*n+bin], w*mag);
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for (int i = 0; i < SMOOTH_ORI_PASSES; i++) {
-        for (int j = lid_x; j < n; j += lsz_x) {
-            temphist[lid_y*n+j] = hist[lid_y*n+j];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        for (int j = lid_x; j < n; j += lsz_x) {
-            float prev = (j == 0) ? temphist[lid_y*n+n-1] : temphist[lid_y*n+j-1];
-            float next = (j+1 == n) ? temphist[lid_y*n] : temphist[lid_y*n+j+1];
-            hist[lid_y*n+j] = 0.25f * prev + 0.5f * temphist[lid_y*n+j] + 0.25f * next;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    for (int i = lid_x; i < n; i += lsz_x)
-        temphist[lid_y*n+i] = hist[lid_y*n+i];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lid_x < 16)
-        temphist[lid_y*n+lid_x] = fmax(hist[lid_y*n+lid_x], hist[lid_y*n+lid_x+16]);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (lid_x < 8)
-        temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], temphist[lid_y*n+lid_x+8]);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (lid_x < 4) {
-        temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], hist[lid_y*n+lid_x+32]);
-        temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], temphist[lid_y*n+lid_x+4]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (lid_x < 2)
-        temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], temphist[lid_y*n+lid_x+2]);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (lid_x < 1)
-        temphist[lid_y*n+lid_x] = fmax(temphist[lid_y*n+lid_x], temphist[lid_y*n+lid_x+1]);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    float omax = temphist[lid_y*n];
-
-    if (f < total_feat) {
-        float mag_thr = (float)(omax * ORI_PEAK_RATIO);
-        int l, r;
-        float bin;
-        for (int j = lid_x; j < n; j+=lsz_x) {
-            l = (j == 0) ? n - 1 : j - 1;
-            r = (j + 1) % n;
-            if (hist[lid_y*n+j] > hist[lid_y*n+l] &&
-                hist[lid_y*n+j] > hist[lid_y*n+r] &&
-                hist[lid_y*n+j] >= mag_thr) {
-                unsigned idx = atomic_inc(counter);
-
-                if (idx < max_feat) {
-                    float bin = j + 0.5f * (hist[lid_y*n+l] - hist[lid_y*n+r]) /
-                                (hist[lid_y*n+l] - 2.0f*hist[lid_y*n+j] + hist[lid_y*n+r]);
-                    bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
-                    float ori = 360.f - ((360.f/n) * bin);
-
-                    float new_real_x = real_x;
-                    float new_real_y = real_y;
-                    float new_size = size;
-
-                    if (double_input != 0) {
-                        float scale = 0.5f;
-                        new_real_x *= scale;
-                        new_real_y *= scale;
-                        new_size *= scale;
-                    }
-
-                    x_out[idx] = new_real_x;
-                    y_out[idx] = new_real_y;
-                    layer_out[idx] = layer;
-                    response_out[idx] = response;
-                    size_out[idx] = new_size;
-                    ori_out[idx] = ori;
-                }
-            }
-        }
-    }
-}
-
-// Computes feature descriptors for features in an array.  Based on Section 6
-// of Lowe's paper.
-__kernel void computeDescriptor(
-    __global float* desc_out,
-    const unsigned desc_len,
-    const unsigned histsz,
-    __global const float* x_in,
-    __global const float* y_in,
-    __global const unsigned* layer_in,
-    __global const float* response_in,
-    __global const float* size_in,
-    __global const float* ori_in,
-    const unsigned total_feat,
-    __global const T* gauss_octave,
-    KParam iGauss,
-    const int d,
-    const int n,
-    const float scale,
-    const float sigma,
-    const int n_layers,
-    __local float* l_mem)
-{
-    const int lid_x = get_local_id(0);
-    const int lid_y = get_local_id(1);
-    const int lsz_x = get_local_size(0);
-
-    const int f = get_global_id(1);
-
-    __local float* desc = l_mem;
-    __local float* accum = l_mem + desc_len * histsz;
-
-    const int histlen = d*d*n;
-
-    for (int i = lid_x; i < histlen*histsz; i += lsz_x)
-        desc[lid_y*histlen+i] = 0.f;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (f < total_feat) {
-        const unsigned layer = layer_in[f];
-        float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
-        ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
-        const float size = size_in[f];
-        const int fx = round(x_in[f] * scale);
-        const int fy = round(y_in[f] * scale);
-
-        // Points img to correct Gaussian pyramid layer
-        const int dim0 = iGauss.dims[0];
-        const int dim1 = iGauss.dims[1];
-        __global const T* img = gauss_octave + (layer * dim0 * dim1);
-
-        float cos_t = cos(ori);
-        float sin_t = sin(ori);
-        float bins_per_rad = n / (PI_VAL * 2.f);
-        float exp_denom = d * d * 0.5f;
-        float hist_width = DESCR_SCL_FCTR * sigma * pow(2.f, (float)layer/n_layers);
-        int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
-
-        int len = radius*2+1;
-        const int hist_off = (lid_x % histsz) * desc_len;
-
-        // Calculate orientation histogram
-        for (int l = lid_x; l < len*len; l += lsz_x) {
-            int i = l / len - radius;
-            int j = l % len - radius;
-
-            int y = fy + i;
-            int x = fx + j;
-
-            float x_rot = (j * cos_t - i * sin_t) / hist_width;
-            float y_rot = (j * sin_t + i * cos_t) / hist_width;
-            float xbin = x_rot + d/2 - 0.5f;
-            float ybin = y_rot + d/2 - 0.5f;
-
-            if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d &&
-                y > 0 && y < dim0 - 1 && x > 0 && x < dim1 - 1) {
-                float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
-                float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
-                float grad_mag = sqrt(dx*dx + dy*dy);
-                float grad_ori = atan2(dy, dx) - ori;
-                while (grad_ori < 0.0f)
-                    grad_ori += PI_VAL*2;
-                while (grad_ori >= PI_VAL*2)
-                    grad_ori -= PI_VAL*2;
-
-                float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
-                float obin = grad_ori * bins_per_rad;
-                float mag = grad_mag*w;
-
-                int x0 = floor(xbin);
-                int y0 = floor(ybin);
-                int o0 = floor(obin);
-                xbin -= x0;
-                ybin -= y0;
-                obin -= o0;
-
-                for (int yl = 0; yl <= 1; yl++) {
-                    int yb = y0 + yl;
-                    if (yb >= 0 && yb < d) {
-                        float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
-                        for (int xl = 0; xl <= 1; xl++) {
-                            int xb = x0 + xl;
-                            if (xb >= 0 && xb < d) {
-                                float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
-                                for (int ol = 0; ol <= 1; ol++) {
-                                    int ob = (o0 + ol) % n;
-                                    float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
-                                    fatomic_add(&desc[hist_off + lid_y*desc_len + (yb*d + xb)*n + ob], v_o);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Combine the per-thread partial histograms (reduces contention on fatomic_add above)
-    for (int l = lid_x; l < desc_len*4; l += lsz_x)
-        desc[l] += desc[l+4*desc_len];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int l = lid_x; l < desc_len*2; l += lsz_x)
-        desc[l    ] += desc[l+2*desc_len];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int l = lid_x; l < desc_len; l += lsz_x)
-        desc[l] += desc[l+desc_len];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
-
-    for (int i = lid_x; i < d*d*n; i += lsz_x)
-        desc[lid_y*desc_len+i] = min(desc[lid_y*desc_len+i], DESCR_MAG_THR);
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
-
-    if (f < total_feat) {
-        // Calculate final descriptor values
-        for (int k = lid_x; k < d*d*n; k += lsz_x)
-            desc_out[f*desc_len+k] = round(min(255.f, desc[lid_y*desc_len+k] * INT_DESCR_FCTR));
-    }
-}
-
-#undef IPTR
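
For reference, the calcOrientation kernel above builds a 36-bin circular
histogram of gradient orientations, smooths it, and refines each dominant
peak with a parabolic fit. The following is a minimal host-side C++ sketch
of those two steps only; the function names smoothHistogram and refinePeak
are illustrative and do not appear in the deleted sources.

#include <vector>

// Circularly smooth an orientation histogram with a [0.25, 0.5, 0.25]
// kernel, repeated `passes` times (mirrors SMOOTH_ORI_PASSES above).
std::vector<float> smoothHistogram(std::vector<float> hist, int passes)
{
    const int n = (int)hist.size();
    std::vector<float> tmp(n);
    for (int p = 0; p < passes; ++p) {
        tmp = hist;
        for (int j = 0; j < n; ++j) {
            float prev = tmp[(j + n - 1) % n];
            float next = tmp[(j + 1) % n];
            hist[j] = 0.25f * prev + 0.5f * tmp[j] + 0.25f * next;
        }
    }
    return hist;
}

// Refine a local maximum at bin j by fitting a parabola through its two
// neighbours; returns a fractional bin position wrapped into [0, n).
float refinePeak(const std::vector<float>& hist, int j)
{
    const int n = (int)hist.size();
    float l = hist[(j + n - 1) % n];
    float r = hist[(j + 1) % n];
    float bin = j + 0.5f * (l - r) / (l - 2.f * hist[j] + r);
    if (bin < 0.f)     bin += n;
    else if (bin >= n) bin -= n;
    return bin;
}
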
diff --git a/src/backend/opencl/kernel/sift_nonfree.hpp b/src/backend/opencl/kernel/sift_nonfree.hpp
deleted file mode 100644
index 9c36a3f..0000000
--- a/src/backend/opencl/kernel/sift_nonfree.hpp
+++ /dev/null
@@ -1,784 +0,0 @@
-/*******************************************************
- * Copyright (c) 2015, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-// The source code contained in this file is based on the original code by
-// Rob Hess. Please note that SIFT is an algorithm patented and protected
-// by US law, before using this code or any binary forms generated from it,
-// verify that you have permission to do so. The original license by Rob Hess
-// can be read below:
-//
-// Copyright (c) 2006-2012, Rob Hess <rob at iqengines.com>
-// All rights reserved.
-//
-// The following patent has been issued for methods embodied in this
-// software: "Method and apparatus for identifying scale invariant features
-// in an image and use of same for locating an object in an image," David
-// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
-// filed March 8, 1999. Asignee: The University of British Columbia. For
-// further details, contact David Lowe (lowe at cs.ubc.ca) or the
-// University-Industry Liaison Office of the University of British
-// Columbia.
-//
-// Note that restrictions imposed by this patent (and possibly others)
-// exist independently of and may be in conflict with the freedoms granted
-// in this license, which refers to copyright of the program, not patents
-// for any methods that it implements.  Both copyright and patent law must
-// be obeyed to legally use and redistribute this program and it is not the
-// purpose of this license to induce you to infringe any patents or other
-// property right claims or to contest validity of any such claims.  If you
-// redistribute or use the program, then this license merely protects you
-// from committing copyright infringement.  It does not protect you from
-// committing patent infringement.  So, before you do anything with this
-// program, make sure that you have permission to do so not merely in terms
-// of copyright, but also in terms of patent law.
-//
-// Please note that this license is not to be understood as a guarantee
-// either.  If you use the program according to this license, but in
-// conflict with patent law, it does not mean that the licensor will refund
-// you for any losses that you incur if you are sued for your patent
-// infringement.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//     * Redistributions of source code must retain the above copyright and
-//       patent notices, this list of conditions and the following
-//       disclaimer.
-//     * Redistributions in binary form must reproduce the above copyright
-//       notice, this list of conditions and the following disclaimer in
-//       the documentation and/or other materials provided with the
-//       distribution.
-//     * Neither the name of Oregon State University nor the names of its
-//       contributors may be used to endorse or promote products derived
-//       from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include <af/defines.h>
-#include <program.hpp>
-#include <dispatch.hpp>
-#include <err_opencl.hpp>
-#include <debug_opencl.hpp>
-
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-
-#include <boost/compute/core.hpp>
-#include <boost/compute/iterator/buffer_iterator.hpp>
-#include <boost/compute/algorithm/gather.hpp>
-#include <boost/compute/algorithm/iota.hpp>
-#include <boost/compute/algorithm/sort_by_key.hpp>
-
-#pragma GCC diagnostic pop
-
-#include <convolve_common.hpp>
-#include <kernel/convolve_separable.hpp>
-#include <kernel/fast.hpp>
-#include <kernel/resize.hpp>
-#include <kernel/sort_index.hpp>
-#include <kernel_headers/sift_nonfree.hpp>
-#include <memory.hpp>
-#include <vector>
-
-using cl::Buffer;
-using cl::Program;
-using cl::Kernel;
-using cl::EnqueueArgs;
-using cl::LocalSpaceArg;
-using cl::NDRange;
-using std::vector;
-
-namespace opencl
-{
-
-namespace kernel
-{
-
-static const int SIFT_THREADS   = 256;
-static const int SIFT_THREADS_X = 32;
-static const int SIFT_THREADS_Y = 8;
-
-// assumed gaussian blur for input image
-static const float InitSigma = 0.5f;
-
-// width of border in which to ignore keypoints
-static const int ImgBorder = 5;
-
-// default width of descriptor histogram array
-static const int DescrWidth = 4;
-
-// default number of bins per histogram in descriptor array
-static const int DescrHistBins = 8;
-
-// default number of bins in histogram for orientation assignment
-static const int OriHistBins = 36;
-
-static const float PI_VAL = 3.14159265358979323846f;
-
-template<typename T>
-void gaussian1D(T* out, const int dim, double sigma=0.0)
-{
-    if(!(sigma>0)) sigma = 0.25*dim;
-
-    T sum = (T)0;
-    for(int i=0;i<dim;i++)
-    {
-        int x = i-(dim-1)/2;
-        T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
-        out[i] = el;
-        sum   += el;
-    }
-
-    for(int k=0;k<dim;k++)
-        out[k] /= sum;
-}
-
-template<typename T>
-Param gaussFilter(float sigma)
-{
-    // Using 6-sigma rule
-    unsigned gauss_len = std::min((unsigned)round(sigma * 6 + 1) | 1, 31u);
-
-    T* h_gauss = new T[gauss_len];
-    gaussian1D(h_gauss, gauss_len, sigma);
-
-    Param gauss_filter;
-    gauss_filter.info.offset = 0;
-    gauss_filter.info.dims[0] = gauss_len;
-    gauss_filter.info.strides[0] = 1;
-
-    for (int k = 1; k < 4; k++) {
-        gauss_filter.info.dims[k] = 1;
-        gauss_filter.info.strides[k] = gauss_filter.info.dims[k-1] * gauss_filter.info.strides[k-1];
-    }
-
-    dim_t gauss_elem = gauss_filter.info.strides[3] * gauss_filter.info.dims[3];
-    gauss_filter.data = bufferAlloc(gauss_elem * sizeof(T));
-    getQueue().enqueueWriteBuffer(*gauss_filter.data, CL_TRUE, 0, gauss_elem * sizeof(T), h_gauss);
-
-    delete[] h_gauss;
-
-    return gauss_filter;
-}
-
-template<typename T, typename convAccT>
-void convSepFull(Param& dst, Param src, Param filter)
-{
-    Param tmp;
-    tmp.info.offset = 0;
-    for (int k = 0; k < 4; k++) {
-        tmp.info.dims[k] = src.info.dims[k];
-        tmp.info.strides[k] = src.info.strides[k];
-    }
-
-    const dim_t src_el = src.info.dims[3] * src.info.strides[3];
-    tmp.data = bufferAlloc(src_el * sizeof(T));
-
-    convSep<T, convAccT, 0, false>(tmp, src, filter);
-    convSep<T, convAccT, 1, false>(dst, tmp, filter);
-
-    bufferFree(tmp.data);
-}
-
-template<typename T, typename convAccT>
-Param createInitialImage(
-    Param img,
-    const float init_sigma,
-    const bool double_input)
-{
-    Param init_img;
-    init_img.info.offset = 0;
-    init_img.info.dims[0] = (double_input) ? img.info.dims[0] * 2 : img.info.dims[0];
-    init_img.info.dims[1] = (double_input) ? img.info.dims[1] * 2 : img.info.dims[1];
-    init_img.info.strides[0] = 1;
-    init_img.info.strides[1] = init_img.info.dims[0];
-
-    for (int k = 2; k < 4; k++) {
-        init_img.info.dims[k] = 1;
-        init_img.info.strides[k] = init_img.info.dims[k-1] * init_img.info.strides[k-1];
-    }
-
-    dim_t init_img_el = init_img.info.strides[3] * init_img.info.dims[3];
-    init_img.data = bufferAlloc(init_img_el * sizeof(T));
-
-    float s = (double_input) ? sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4)
-                             : sqrt(init_sigma * init_sigma - InitSigma * InitSigma);
-
-    const Param filter = gaussFilter<convAccT>(s);
-
-    if (double_input)
-        resize<T, AF_INTERP_BILINEAR>(init_img, img);
-
-    convSepFull<T, convAccT>(init_img, (double_input) ? init_img : img, filter);
-
-    bufferFree(filter.data);
-
-    return init_img;
-}
-
-template<typename T, typename convAccT>
-std::vector<Param> buildGaussPyr(
-    Param init_img,
-    const unsigned n_octaves,
-    const unsigned n_layers,
-    const float init_sigma)
-{
-    // Precompute Gaussian sigmas using the following formula:
-    // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
-    std::vector<float> sig_layers(n_layers + 3);
-    sig_layers[0] = init_sigma;
-    float k = std::pow(2.0f, 1.0f / n_layers);
-    for (unsigned i = 1; i < n_layers + 3; i++) {
-        float sig_prev = std::pow(k, i-1) * init_sigma;
-        float sig_total = sig_prev * k;
-        sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
-    }
-
-    // Gaussian Pyramid
-    std::vector<Param> gauss_pyr(n_octaves);
-    std::vector<Param> tmp_pyr(n_octaves * (n_layers+3));
-    for (unsigned o = 0; o < n_octaves; o++) {
-        gauss_pyr[o].info.offset = 0;
-        gauss_pyr[o].info.dims[0] = (o == 0) ? init_img.info.dims[0] : gauss_pyr[o-1].info.dims[0] / 2;
-        gauss_pyr[o].info.dims[1] = (o == 0) ? init_img.info.dims[1] : gauss_pyr[o-1].info.dims[1] / 2;
-        gauss_pyr[o].info.dims[2] = n_layers+3;
-        gauss_pyr[o].info.dims[3] = 1;
-
-        gauss_pyr[o].info.strides[0] = 1;
-        gauss_pyr[o].info.strides[1] = gauss_pyr[o].info.dims[0] * gauss_pyr[o].info.strides[0];
-        gauss_pyr[o].info.strides[2] = gauss_pyr[o].info.dims[1] * gauss_pyr[o].info.strides[1];
-        gauss_pyr[o].info.strides[3] = gauss_pyr[o].info.dims[2] * gauss_pyr[o].info.strides[2];
-
-        const unsigned nel = gauss_pyr[o].info.dims[3] * gauss_pyr[o].info.strides[3];
-        gauss_pyr[o].data = bufferAlloc(nel * sizeof(T));
-
-        for (unsigned l = 0; l < n_layers+3; l++) {
-            unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1;
-            unsigned idx = o*(n_layers+3) + l;
-
-            tmp_pyr[idx].info.offset = 0;
-            if (o == 0 && l == 0) {
-                for (int k = 0; k < 4; k++) {
-                    tmp_pyr[idx].info.dims[k] = init_img.info.dims[k];
-                    tmp_pyr[idx].info.strides[k] = init_img.info.strides[k];
-                }
-                tmp_pyr[idx].data = init_img.data;
-            }
-            else if (l == 0) {
-                tmp_pyr[idx].info.dims[0] = tmp_pyr[src_idx].info.dims[0] / 2;
-                tmp_pyr[idx].info.dims[1] = tmp_pyr[src_idx].info.dims[1] / 2;
-                tmp_pyr[idx].info.strides[0] = 1;
-                tmp_pyr[idx].info.strides[1] = tmp_pyr[idx].info.dims[0];
-
-                for (int k = 2; k < 4; k++) {
-                    tmp_pyr[idx].info.dims[k] = 1;
-                    tmp_pyr[idx].info.strides[k] = tmp_pyr[idx].info.dims[k-1] * tmp_pyr[idx].info.strides[k-1];
-                }
-
-                dim_t lvl_el = tmp_pyr[idx].info.strides[3] * tmp_pyr[idx].info.dims[3];
-                tmp_pyr[idx].data = bufferAlloc(lvl_el * sizeof(T));
-
-                resize<T, AF_INTERP_BILINEAR>(tmp_pyr[idx], tmp_pyr[src_idx]);
-            }
-            else {
-                for (int k = 0; k < 4; k++) {
-                    tmp_pyr[idx].info.dims[k] = tmp_pyr[src_idx].info.dims[k];
-                    tmp_pyr[idx].info.strides[k] = tmp_pyr[src_idx].info.strides[k];
-                }
-                dim_t lvl_el = tmp_pyr[idx].info.strides[3] * tmp_pyr[idx].info.dims[3];
-                tmp_pyr[idx].data = bufferAlloc(lvl_el * sizeof(T));
-
-                Param filter = gaussFilter<convAccT>(sig_layers[l]);
-
-                convSepFull<T, convAccT>(tmp_pyr[idx], tmp_pyr[src_idx], filter);
-
-                bufferFree(filter.data);
-            }
-
-            const unsigned imel = tmp_pyr[idx].info.dims[3] * tmp_pyr[idx].info.strides[3];
-            const unsigned offset = imel * l;
-
-            getQueue().enqueueCopyBuffer(*tmp_pyr[idx].data, *gauss_pyr[o].data, 0, offset*sizeof(T), imel * sizeof(T));
-        }
-    }
-
-    for (unsigned o = 0; o < n_octaves; o++) {
-        for (unsigned l = 0; l < n_layers+3; l++) {
-            unsigned idx = o*(n_layers+3) + l;
-            bufferFree(tmp_pyr[idx].data);
-        }
-    }
-
-    return gauss_pyr;
-}
-
-template<typename T>
-std::vector<Param> buildDoGPyr(
-    std::vector<Param> gauss_pyr,
-    const unsigned n_octaves,
-    const unsigned n_layers,
-    Kernel* suKernel)
-{
-    // DoG Pyramid
-    std::vector<Param> dog_pyr(n_octaves);
-    for (unsigned o = 0; o < n_octaves; o++) {
-        for (int k = 0; k < 4; k++) {
-            dog_pyr[o].info.dims[k] = (k == 2) ? gauss_pyr[o].info.dims[k]-1 : gauss_pyr[o].info.dims[k];
-            dog_pyr[o].info.strides[k] = (k == 0) ? 1 : dog_pyr[o].info.dims[k-1] * dog_pyr[o].info.strides[k-1];
-        }
-        dog_pyr[o].info.offset = 0;
-
-        dog_pyr[o].data = bufferAlloc(dog_pyr[o].info.dims[3] * dog_pyr[o].info.strides[3] * sizeof(T));
-
-        const unsigned nel = dog_pyr[o].info.dims[1] * dog_pyr[o].info.strides[1];
-        const unsigned dog_layers = n_layers+2;
-
-        const int blk_x = divup(nel, SIFT_THREADS);
-        const NDRange local(SIFT_THREADS, 1);
-        const NDRange global(blk_x * SIFT_THREADS, 1);
-
-        auto suOp = make_kernel<Buffer, Buffer, unsigned, unsigned> (*suKernel);
-
-        suOp(EnqueueArgs(getQueue(), global, local),
-             *dog_pyr[o].data, *gauss_pyr[o].data, nel, dog_layers);
-        CL_DEBUG_FINISH(getQueue());
-    }
-
-    return dog_pyr;
-}
-
-template <typename T>
-void update_permutation(compute::buffer_iterator<T>& keys, compute::vector<int>& permutation, compute::command_queue& queue)
-{
-    // temporary storage for keys
-    compute::vector<T> temp(permutation.size(), 0, queue);
-
-    // permute the keys with the current reordering
-    compute::gather(permutation.begin(), permutation.end(), keys, temp.begin(), queue);
-
-    // stable_sort the permuted keys and update the permutation
-    compute::sort_by_key(temp.begin(), temp.end(), permutation.begin(), queue);
-}
-
-template <typename T>
-void apply_permutation(compute::buffer_iterator<T>& keys, compute::vector<int>& permutation, compute::command_queue& queue)
-{
-    // copy keys to temporary vector
-    compute::vector<T> temp(keys, keys+permutation.size(), queue);
-
-    // permute the keys
-    compute::gather(permutation.begin(), permutation.end(), temp.begin(), keys, queue);
-}
-
-template<typename T, typename convAccT>
-void sift(unsigned* out_feat,
-          unsigned* out_dlen,
-          Param& x_out,
-          Param& y_out,
-          Param& score_out,
-          Param& ori_out,
-          Param& size_out,
-          Param& desc_out,
-          Param img,
-          const unsigned n_layers,
-          const float contrast_thr,
-          const float edge_thr,
-          const float init_sigma,
-          const bool double_input,
-          const float img_scale,
-          const float feature_ratio)
-{
-    try {
-        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
-        static std::map<int, Program*> siftProgs;
-        static std::map<int, Kernel*>  suKernel;
-        static std::map<int, Kernel*>  deKernel;
-        static std::map<int, Kernel*>  ieKernel;
-        static std::map<int, Kernel*>  coKernel;
-        static std::map<int, Kernel*>  rdKernel;
-        static std::map<int, Kernel*>  cdKernel;
-
-        int device = getActiveDeviceId();
-
-        std::call_once( compileFlags[device], [device] () {
-
-                std::ostringstream options;
-                options << " -D T=" << dtype_traits<T>::getName();
-
-                if (std::is_same<T, double>::value ||
-                    std::is_same<T, cdouble>::value) {
-                    options << " -D USE_DOUBLE";
-                }
-
-                cl::Program prog;
-                buildProgram(prog, sift_nonfree_cl, sift_nonfree_cl_len, options.str());
-                siftProgs[device] = new Program(prog);
-
-                suKernel[device] = new Kernel(*siftProgs[device], "sub");
-                deKernel[device] = new Kernel(*siftProgs[device], "detectExtrema");
-                ieKernel[device] = new Kernel(*siftProgs[device], "interpolateExtrema");
-                coKernel[device] = new Kernel(*siftProgs[device], "calcOrientation");
-                rdKernel[device] = new Kernel(*siftProgs[device], "removeDuplicates");
-                cdKernel[device] = new Kernel(*siftProgs[device], "computeDescriptor");
-            });
-
-        const unsigned min_dim = (double_input) ? min(img.info.dims[0]*2, img.info.dims[1]*2)
-                                                : min(img.info.dims[0], img.info.dims[1]);
-        const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
-
-        Param init_img = createInitialImage<T, convAccT>(img, init_sigma, double_input);
-
-        std::vector<Param> gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
-
-        std::vector<Param> dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers, suKernel[device]);
-
-        std::vector<cl::Buffer*> d_x_pyr(n_octaves, NULL);
-        std::vector<cl::Buffer*> d_y_pyr(n_octaves, NULL);
-        std::vector<cl::Buffer*> d_response_pyr(n_octaves, NULL);
-        std::vector<cl::Buffer*> d_size_pyr(n_octaves, NULL);
-        std::vector<cl::Buffer*> d_ori_pyr(n_octaves, NULL);
-        std::vector<cl::Buffer*> d_desc_pyr(n_octaves, NULL);
-        std::vector<unsigned> feat_pyr(n_octaves, 0);
-        unsigned total_feat = 0;
-
-        const unsigned d = DescrWidth;
-        const unsigned n = DescrHistBins;
-        const unsigned desc_len = d*d*n;
-
-        cl::Buffer* d_count = bufferAlloc(sizeof(unsigned));
-
-        for (unsigned o = 0; o < n_octaves; o++) {
-            if (dog_pyr[o].info.dims[0]-2*ImgBorder < 1 ||
-                dog_pyr[o].info.dims[1]-2*ImgBorder < 1)
-                continue;
-
-            const unsigned imel = dog_pyr[o].info.dims[0] * dog_pyr[o].info.dims[1];
-            const unsigned max_feat = ceil(imel * feature_ratio);
-
-            cl::Buffer* d_extrema_x     = bufferAlloc(max_feat * sizeof(float));
-            cl::Buffer* d_extrema_y     = bufferAlloc(max_feat * sizeof(float));
-            cl::Buffer* d_extrema_layer = bufferAlloc(max_feat * sizeof(unsigned));
-
-            unsigned extrema_feat = 0;
-            getQueue().enqueueWriteBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &extrema_feat);
-
-            int dim0 = dog_pyr[o].info.dims[0];
-            int dim1 = dog_pyr[o].info.dims[1];
-
-            const int blk_x = divup(dim0-2*ImgBorder, SIFT_THREADS_X);
-            const int blk_y = divup(dim1-2*ImgBorder, SIFT_THREADS_Y);
-            const NDRange local(SIFT_THREADS_X, SIFT_THREADS_Y);
-            const NDRange global(blk_x * SIFT_THREADS_X, blk_y * SIFT_THREADS_Y);
-
-            float extrema_thr = 0.5f * contrast_thr / n_layers;
-
-            auto deOp = make_kernel<Buffer, Buffer, Buffer, Buffer,
-                                    Buffer, KParam, unsigned, float,
-                                    LocalSpaceArg> (*deKernel[device]);
-
-            deOp(EnqueueArgs(getQueue(), global, local),
-                 *d_extrema_x, *d_extrema_y, *d_extrema_layer, *d_count,
-                 *dog_pyr[o].data, dog_pyr[o].info, max_feat, extrema_thr,
-                 cl::Local((SIFT_THREADS_X+2) * (SIFT_THREADS_Y+2) * 3 * sizeof(float)));
-            CL_DEBUG_FINISH(getQueue());
-
-            getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &extrema_feat);
-            extrema_feat = min(extrema_feat, max_feat);
-
-            if (extrema_feat == 0) {
-                bufferFree(d_extrema_x);
-                bufferFree(d_extrema_y);
-                bufferFree(d_extrema_layer);
-
-                continue;
-            }
-
-            unsigned interp_feat = 0;
-            getQueue().enqueueWriteBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &interp_feat);
-
-            cl::Buffer* d_interp_x = bufferAlloc(extrema_feat * sizeof(float));
-            cl::Buffer* d_interp_y = bufferAlloc(extrema_feat * sizeof(float));
-            cl::Buffer* d_interp_layer = bufferAlloc(extrema_feat * sizeof(unsigned));
-            cl::Buffer* d_interp_response = bufferAlloc(extrema_feat * sizeof(float));
-            cl::Buffer* d_interp_size = bufferAlloc(extrema_feat * sizeof(float));
-
-            const int blk_x_interp = divup(extrema_feat, SIFT_THREADS);
-            const NDRange local_interp(SIFT_THREADS, 1);
-            const NDRange global_interp(blk_x_interp * SIFT_THREADS, 1);
-
-            auto ieOp = make_kernel<Buffer, Buffer, Buffer,
-                                    Buffer, Buffer, Buffer,
-                                    Buffer, Buffer, Buffer, unsigned,
-                                    Buffer, KParam, unsigned, unsigned, unsigned,
-                                    float, float, float, float> (*ieKernel[device]);
-
-            ieOp(EnqueueArgs(getQueue(), global_interp, local_interp),
-                 *d_interp_x, *d_interp_y, *d_interp_layer,
-                 *d_interp_response, *d_interp_size, *d_count,
-                 *d_extrema_x, *d_extrema_y, *d_extrema_layer, extrema_feat,
-                 *dog_pyr[o].data, dog_pyr[o].info, extrema_feat, o, n_layers,
-                 contrast_thr, edge_thr, init_sigma, img_scale);
-            CL_DEBUG_FINISH(getQueue());
-
-            getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &interp_feat);
-            interp_feat = min(interp_feat, extrema_feat);
-
-            if (interp_feat == 0) {
-                bufferFree(d_interp_x);
-                bufferFree(d_interp_y);
-                bufferFree(d_interp_layer);
-                bufferFree(d_interp_response);
-                bufferFree(d_interp_size);
-
-                continue;
-            }
-
-            compute::command_queue queue(getQueue()());
-            compute::context context(getContext()());
-
-            compute::buffer buf_interp_x((*d_interp_x)(), true);
-            compute::buffer buf_interp_y((*d_interp_y)(), true);
-            compute::buffer buf_interp_layer((*d_interp_layer)(), true);
-            compute::buffer buf_interp_response((*d_interp_response)(), true);
-            compute::buffer buf_interp_size((*d_interp_size)(), true);
-
-            compute::buffer_iterator<float> interp_x_begin = compute::make_buffer_iterator<float>(buf_interp_x, 0);
-            compute::buffer_iterator<float> interp_y_begin = compute::make_buffer_iterator<float>(buf_interp_y, 0);
-            compute::buffer_iterator<unsigned> interp_layer_begin = compute::make_buffer_iterator<unsigned>(buf_interp_layer, 0);
-            compute::buffer_iterator<float> interp_response_begin = compute::make_buffer_iterator<float>(buf_interp_response, 0);
-            compute::buffer_iterator<float> interp_size_begin = compute::make_buffer_iterator<float>(buf_interp_size, 0);
-
-            compute::vector<int> permutation(interp_feat, context);
-            compute::iota(permutation.begin(), permutation.end(), 0, queue);
-
-            update_permutation<float>(interp_x_begin, permutation, queue);
-            update_permutation<float>(interp_y_begin, permutation, queue);
-            update_permutation<unsigned>(interp_layer_begin, permutation, queue);
-            update_permutation<float>(interp_response_begin, permutation, queue);
-            update_permutation<float>(interp_size_begin, permutation, queue);
-
-            apply_permutation<float>(interp_x_begin, permutation, queue);
-            apply_permutation<float>(interp_y_begin, permutation, queue);
-            apply_permutation<unsigned>(interp_layer_begin, permutation, queue);
-            apply_permutation<float>(interp_response_begin, permutation, queue);
-            apply_permutation<float>(interp_size_begin, permutation, queue);
-
-            unsigned nodup_feat = 0;
-            getQueue().enqueueWriteBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &nodup_feat);
-
-            cl::Buffer* d_nodup_x = bufferAlloc(interp_feat * sizeof(float));
-            cl::Buffer* d_nodup_y = bufferAlloc(interp_feat * sizeof(float));
-            cl::Buffer* d_nodup_layer = bufferAlloc(interp_feat * sizeof(unsigned));
-            cl::Buffer* d_nodup_response = bufferAlloc(interp_feat * sizeof(float));
-            cl::Buffer* d_nodup_size = bufferAlloc(interp_feat * sizeof(float));
-
-            const int blk_x_nodup = divup(extrema_feat, SIFT_THREADS);
-            const NDRange local_nodup(SIFT_THREADS, 1);
-            const NDRange global_nodup(blk_x_nodup * SIFT_THREADS, 1);
-
-            auto rdOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, Buffer,
-                                    Buffer, Buffer, Buffer, Buffer, Buffer,
-                                    unsigned> (*rdKernel[device]);
-
-            rdOp(EnqueueArgs(getQueue(), global_nodup, local_nodup),
-                 *d_nodup_x, *d_nodup_y, *d_nodup_layer,
-                 *d_nodup_response, *d_nodup_size, *d_count,
-                 *d_interp_x, *d_interp_y, *d_interp_layer,
-                 *d_interp_response, *d_interp_size, interp_feat);
-            CL_DEBUG_FINISH(getQueue());
-
-            getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &nodup_feat);
-            nodup_feat = min(nodup_feat, interp_feat);
-
-            bufferFree(d_interp_x);
-            bufferFree(d_interp_y);
-            bufferFree(d_interp_layer);
-            bufferFree(d_interp_response);
-            bufferFree(d_interp_size);
-
-            unsigned oriented_feat = 0;
-            getQueue().enqueueWriteBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &oriented_feat);
-            const unsigned max_oriented_feat = nodup_feat * 3;
-
-            cl::Buffer* d_oriented_x        = bufferAlloc(max_oriented_feat * sizeof(float));
-            cl::Buffer* d_oriented_y        = bufferAlloc(max_oriented_feat * sizeof(float));
-            cl::Buffer* d_oriented_layer    = bufferAlloc(max_oriented_feat * sizeof(unsigned));
-            cl::Buffer* d_oriented_response = bufferAlloc(max_oriented_feat * sizeof(float));
-            cl::Buffer* d_oriented_size     = bufferAlloc(max_oriented_feat * sizeof(float));
-            cl::Buffer* d_oriented_ori      = bufferAlloc(max_oriented_feat * sizeof(float));
-
-            const int blk_x_ori = divup(nodup_feat, SIFT_THREADS_Y);
-            const NDRange local_ori(SIFT_THREADS_X, SIFT_THREADS_Y);
-            const NDRange global_ori(SIFT_THREADS_X, blk_x_ori * SIFT_THREADS_Y);
-
-            auto coOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer,
-                                    Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
-                                    Buffer, KParam, unsigned, unsigned, int,
-                                    LocalSpaceArg> (*coKernel[device]);
-
-            coOp(EnqueueArgs(getQueue(), global_ori, local_ori),
-                 *d_oriented_x, *d_oriented_y, *d_oriented_layer,
-                 *d_oriented_response, *d_oriented_size, *d_oriented_ori, *d_count,
-                 *d_nodup_x, *d_nodup_y, *d_nodup_layer,
-                 *d_nodup_response, *d_nodup_size, nodup_feat,
-                 *gauss_pyr[o].data, gauss_pyr[o].info, max_oriented_feat, o, (int)double_input,
-                 cl::Local(OriHistBins * SIFT_THREADS_Y * 2 * sizeof(float)));
-            CL_DEBUG_FINISH(getQueue());
-
-            getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &oriented_feat);
-            oriented_feat = min(oriented_feat, max_oriented_feat);
-
-            if (oriented_feat == 0) {
-                bufferFree(d_oriented_x);
-                bufferFree(d_oriented_y);
-                bufferFree(d_oriented_layer);
-                bufferFree(d_oriented_response);
-                bufferFree(d_oriented_size);
-
-                continue;
-            }
-
-            cl::Buffer* d_desc = bufferAlloc(oriented_feat * desc_len * sizeof(float));
-
-            float scale = 1.f/(1 << o);
-            if (double_input) scale *= 2.f;
-
-            const int blk_x_desc = divup(oriented_feat, 1);
-            const NDRange local_desc(SIFT_THREADS, 1);
-            const NDRange global_desc(SIFT_THREADS, blk_x_desc);
-
-            const unsigned histsz = 8;
-
-            auto cdOp = make_kernel<Buffer, unsigned, unsigned,
-                                    Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
-                                    Buffer, KParam, int, int, float, float, int,
-                                    LocalSpaceArg> (*cdKernel[device]);
-
-            cdOp(EnqueueArgs(getQueue(), global_desc, local_desc),
-                 *d_desc, desc_len, histsz,
-                 *d_oriented_x, *d_oriented_y, *d_oriented_layer,
-                 *d_oriented_response, *d_oriented_size, *d_oriented_ori, oriented_feat,
-                 *gauss_pyr[o].data, gauss_pyr[o].info, d, n, scale, init_sigma, n_layers,
-                 cl::Local(desc_len * (histsz+1) * sizeof(float)));
-            CL_DEBUG_FINISH(getQueue());
-
-            total_feat += oriented_feat;
-            feat_pyr[o] = oriented_feat;
-
-            if (oriented_feat > 0) {
-                d_x_pyr[o] = d_oriented_x;
-                d_y_pyr[o] = d_oriented_y;
-                d_response_pyr[o] = d_oriented_response;
-                d_ori_pyr[o] = d_oriented_ori;
-                d_size_pyr[o] = d_oriented_size;
-                d_desc_pyr[o] = d_desc;
-            }
-        }
-
-        bufferFree(d_count);
-
-        // If no features are found, set found features to 0 and return
-        if (total_feat == 0) {
-            *out_feat = 0;
-            return;
-        }
-
-        // Allocate output memory
-        x_out.info.dims[0] = total_feat;
-        x_out.info.strides[0] = 1;
-        y_out.info.dims[0] = total_feat;
-        y_out.info.strides[0] = 1;
-        score_out.info.dims[0] = total_feat;
-        score_out.info.strides[0] = 1;
-        ori_out.info.dims[0] = total_feat;
-        ori_out.info.strides[0] = 1;
-        size_out.info.dims[0] = total_feat;
-        size_out.info.strides[0] = 1;
-
-        desc_out.info.dims[0] = desc_len;
-        desc_out.info.strides[0] = 1;
-        desc_out.info.dims[1] = total_feat;
-        desc_out.info.strides[1] = desc_out.info.dims[0];
-
-        for (int k = 1; k < 4; k++) {
-            x_out.info.dims[k] = 1;
-            x_out.info.strides[k] = x_out.info.dims[k - 1] * x_out.info.strides[k - 1];
-            y_out.info.dims[k] = 1;
-            y_out.info.strides[k] = y_out.info.dims[k - 1] * y_out.info.strides[k - 1];
-            score_out.info.dims[k] = 1;
-            score_out.info.strides[k] = score_out.info.dims[k - 1] * score_out.info.strides[k - 1];
-            ori_out.info.dims[k] = 1;
-            ori_out.info.strides[k] = ori_out.info.dims[k - 1] * ori_out.info.strides[k - 1];
-            size_out.info.dims[k] = 1;
-            size_out.info.strides[k] = size_out.info.dims[k - 1] * size_out.info.strides[k - 1];
-            if (k > 1) {
-                desc_out.info.dims[k] = 1;
-                desc_out.info.strides[k] = desc_out.info.dims[k - 1] * desc_out.info.strides[k - 1];
-            }
-        }
-
-        if (total_feat > 0) {
-            size_t out_sz  = total_feat * sizeof(float);
-            x_out.data     = bufferAlloc(out_sz);
-            y_out.data     = bufferAlloc(out_sz);
-            score_out.data = bufferAlloc(out_sz);
-            ori_out.data   = bufferAlloc(out_sz);
-            size_out.data  = bufferAlloc(out_sz);
-
-            size_t desc_sz = total_feat * desc_len * sizeof(unsigned);
-            desc_out.data  = bufferAlloc(desc_sz);
-        }
-
-        unsigned offset = 0;
-        for (unsigned i = 0; i < n_octaves; i++) {
-            if (feat_pyr[i] == 0)
-                continue;
-
-            getQueue().enqueueCopyBuffer(*d_x_pyr[i], *x_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
-            getQueue().enqueueCopyBuffer(*d_y_pyr[i], *y_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
-            getQueue().enqueueCopyBuffer(*d_response_pyr[i], *score_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
-            getQueue().enqueueCopyBuffer(*d_ori_pyr[i], *ori_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
-            getQueue().enqueueCopyBuffer(*d_size_pyr[i], *size_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
-
-            getQueue().enqueueCopyBuffer(*d_desc_pyr[i], *desc_out.data, 0, offset*desc_len*sizeof(unsigned), feat_pyr[i] * desc_len * sizeof(unsigned));
-
-            bufferFree(d_x_pyr[i]);
-            bufferFree(d_y_pyr[i]);
-            bufferFree(d_response_pyr[i]);
-            bufferFree(d_ori_pyr[i]);
-            bufferFree(d_size_pyr[i]);
-            bufferFree(d_desc_pyr[i]);
-
-            offset += feat_pyr[i];
-        }
-
-        // Sets number of output features and descriptor length
-        *out_feat = total_feat;
-        *out_dlen = desc_len;
-    } catch (cl::Error err) {
-        CL_TO_AF_ERROR(err);
-        throw;
-    }
-}
-
-} //namespace kernel
-
-} //namespace opencl
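
The buildGaussPyr routine above derives each per-layer blur increment from
sigma_total^2 = sigma_prev^2 + sigma_i^2 with k = 2^(1/n_layers). A
standalone C++ sketch of that precomputation follows; the names are
illustrative, and for init_sigma = 1.6 with n_layers = 3 the first
increment works out to sqrt(2.016^2 - 1.6^2), roughly 1.23.

#include <cmath>
#include <cstdio>
#include <vector>

// Blur increments for one octave: layer i is obtained by convolving layer
// i-1 with a Gaussian of sigma sig[i], so the accumulated blur of layer i
// equals init_sigma * k^i.
std::vector<float> layerSigmas(unsigned n_layers, float init_sigma)
{
    std::vector<float> sig(n_layers + 3);
    sig[0] = init_sigma;
    const float k = std::pow(2.f, 1.f / n_layers);
    for (unsigned i = 1; i < n_layers + 3; ++i) {
        float sig_prev  = std::pow(k, (float)(i - 1)) * init_sigma;
        float sig_total = sig_prev * k;
        sig[i] = std::sqrt(sig_total * sig_total - sig_prev * sig_prev);
    }
    return sig;
}

int main()
{
    for (float s : layerSigmas(3, 1.6f))
        std::printf("%.3f\n", s);  // 1.600, 1.226, 1.545, ...
    return 0;
}
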
diff --git a/test/sift_nonfree.cpp b/test/sift_nonfree.cpp
deleted file mode 100644
index f5bd8a4..0000000
--- a/test/sift_nonfree.cpp
+++ /dev/null
@@ -1,336 +0,0 @@
-/*******************************************************
- * Copyright (c) 2015, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#include <gtest/gtest.h>
-#include <arrayfire.h>
-#include <af/dim4.hpp>
-#include <af/traits.hpp>
-#include <af/compatible.h>
-#include <string>
-#include <vector>
-#include <cmath>
-#include <testHelpers.hpp>
-#include <typeinfo>
-
-using std::string;
-using std::vector;
-using af::dim4;
-
-typedef struct
-{
-    float f[5];
-    unsigned d[128];
-} feat_desc_t;
-
-typedef struct
-{
-    float f[5];
-} feat_t;
-
-typedef struct
-{
-    float d[128];
-} desc_t;
-
-bool feat_cmp(feat_desc_t i, feat_desc_t j)
-{
-    for (int k = 0; k < 5; k++)
-        if (round(i.f[k]*1e1f) != round(j.f[k]*1e1f))
-            return (round(i.f[k]*1e1f) < round(j.f[k]*1e1f));
-
-    return false;  // equal keys must compare false to keep a strict weak ordering
-}
-
-void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat)
-{
-    feat.resize(nfeat);
-    for (size_t i = 0; i < feat.size(); i++) {
-        feat[i].f[0] = x[i];
-        feat[i].f[1] = y[i];
-        feat[i].f[2] = score[i];
-        feat[i].f[3] = ori[i];
-        feat[i].f[4] = size[i];
-        for (unsigned j = 0; j < 128; j++)
-            feat[i].d[j] = desc[i * 128 + j];
-    }
-}
-
-void array_to_feat_desc(vector<feat_desc_t>& feat, float* x, float* y, float* score, float* ori, float* size, vector<vector<float> >& desc, unsigned nfeat)
-{
-    feat.resize(nfeat);
-    for (size_t i = 0; i < feat.size(); i++) {
-        feat[i].f[0] = x[i];
-        feat[i].f[1] = y[i];
-        feat[i].f[2] = score[i];
-        feat[i].f[3] = ori[i];
-        feat[i].f[4] = size[i];
-        for (unsigned j = 0; j < 128; j++)
-            feat[i].d[j] = desc[i][j];
-    }
-}
-
-void array_to_feat(vector<feat_t>& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat)
-{
-    feat.resize(nfeat);
-    for (unsigned i = 0; i < feat.size(); i++) {
-        feat[i].f[0] = x[i];
-        feat[i].f[1] = y[i];
-        feat[i].f[2] = score[i];
-        feat[i].f[3] = ori[i];
-        feat[i].f[4] = size[i];
-    }
-}
-
-void split_feat_desc(vector<feat_desc_t>& fd, vector<feat_t>& f, vector<desc_t>& d)
-{
-    f.resize(fd.size());
-    d.resize(fd.size());
-    for (size_t i = 0; i < fd.size(); i++) {
-        f[i].f[0] = fd[i].f[0];
-        f[i].f[1] = fd[i].f[1];
-        f[i].f[2] = fd[i].f[2];
-        f[i].f[3] = fd[i].f[3];
-        f[i].f[4] = fd[i].f[4];
-        for (unsigned j = 0; j < 128; j++)
-            d[i].d[j] = fd[i].d[j];
-    }
-}
-
-unsigned popcount(unsigned x)
-{
-    x = x - ((x >> 1) & 0x55555555);
-    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
-    x = (x + (x >> 4)) & 0x0F0F0F0F;
-    x = x + (x >> 8);
-    x = x + (x >> 16);
-    return x & 0x0000003F;
-}
-
-bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f)
-{
-    bool ret = true;
-    float sum = 0.0f;
-
-    for (dim_t i = 0; i < ndesc; i++) {
-        sum = 0.0f;
-        for (dim_t l = 0; l < desc_len; l++) {
-            dim_t idx = i * desc_len + l;
-            float x = (cpu[idx] - gpu[idx]);
-            sum += x*x;
-            if (fabs(x) > (float)unit_thr) {
-                ret = false;
-                std::cout<<std::endl<<"@compareEuclidean: unit mismatch."<<std::endl;
-                std::cout<<"(cpu,gpu,cpu-gpu)["<<i<<","<<l<<"] : {"<<cpu[idx]<<","<<gpu[idx]<<","<<cpu[idx]-gpu[idx]<<"}"<<std::endl;
-                std::cout<<std::endl;
-                break;
-            }
-        }
-        if (sqrt(sum) > euc_thr) {
-            ret = false;
-            std::cout<<std::endl<<"@compareEuclidean: distance mismatch."<<std::endl;
-            std::cout<<"Euclidean distance: "<<sqrt(sum)<<std::endl;
-        }
-        if (ret == false)
-            return ret;
-    }
-
-    return ret;
-}
-
-template<typename T>
-class SIFT : public ::testing::Test
-{
-    public:
-        virtual void SetUp() {}
-};
-
-typedef ::testing::Types<float, double> TestTypes;
-
-TYPED_TEST_CASE(SIFT, TestTypes);
-
-template<typename T>
-void siftTest(string pTestFile)
-{
-#ifdef AF_BUILD_SIFT
-    if (noDoubleTests<T>()) return;
-
-    vector<dim4>           inDims;
-    vector<string>         inFiles;
-    vector<vector<float> > goldFeat;
-    vector<vector<float> > goldDesc;
-
-    readImageFeaturesDescriptors<float>(pTestFile, inDims, inFiles, goldFeat, goldDesc);
-
-    size_t testCount = inDims.size();
-
-    for (size_t testId=0; testId<testCount; ++testId) {
-        af_array inArray_f32  = 0;
-        af_array inArray      = 0;
-        af_array desc         = 0;
-        af_features feat;
-
-        inFiles[testId].insert(0,string(TEST_DIR"/sift/"));
-
-        ASSERT_EQ(AF_SUCCESS, af_load_image(&inArray_f32, inFiles[testId].c_str(), false));
-        ASSERT_EQ(AF_SUCCESS, conv_image<T>(&inArray, inArray_f32));
-
-        ASSERT_EQ(AF_SUCCESS, af_sift(&feat, &desc, inArray, 3, 0.04f, 10.0f, 1.6f, true, 1.f/256.f, 0.05f));
-
-        dim_t n = 0;
-        af_array x, y, score, orientation, size;
-
-        ASSERT_EQ(AF_SUCCESS, af_get_features_num(&n, feat));
-        ASSERT_EQ(AF_SUCCESS, af_get_features_xpos(&x, feat));
-        ASSERT_EQ(AF_SUCCESS, af_get_features_ypos(&y, feat));
-        ASSERT_EQ(AF_SUCCESS, af_get_features_score(&score, feat));
-        ASSERT_EQ(AF_SUCCESS, af_get_features_orientation(&orientation, feat));
-        ASSERT_EQ(AF_SUCCESS, af_get_features_size(&size, feat));
-
-        float * outX           = new float[n];
-        float * outY           = new float[n];
-        float * outScore       = new float[n];
-        float * outOrientation = new float[n];
-        float * outSize        = new float[n];
-        dim_t descSize;
-        dim_t descDims[4];
-        ASSERT_EQ(AF_SUCCESS, af_get_elements(&descSize, desc));
-        ASSERT_EQ(AF_SUCCESS, af_get_dims(&descDims[0], &descDims[1], &descDims[2], &descDims[3], desc));
-        float * outDesc     = new float[descSize];
-        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outX, x));
-        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outY, y));
-        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outScore, score));
-        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outOrientation, orientation));
-        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outSize, size));
-        ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outDesc, desc));
-
-        vector<feat_desc_t> out_feat_desc;
-        array_to_feat_desc(out_feat_desc, outX, outY, outScore, outOrientation, outSize, outDesc, n);
-
-        vector<feat_desc_t> gold_feat_desc;
-        array_to_feat_desc(gold_feat_desc, &goldFeat[0].front(), &goldFeat[1].front(), &goldFeat[2].front(), &goldFeat[3].front(), &goldFeat[4].front(), goldDesc, goldFeat[0].size());
-
-        std::stable_sort(out_feat_desc.begin(), out_feat_desc.end(), feat_cmp);
-        std::stable_sort(gold_feat_desc.begin(), gold_feat_desc.end(), feat_cmp);
-
-        vector<feat_t> out_feat;
-        vector<desc_t> v_out_desc;
-        vector<feat_t> gold_feat;
-        vector<desc_t> v_gold_desc;
-
-        split_feat_desc(out_feat_desc, out_feat, v_out_desc);
-        split_feat_desc(gold_feat_desc, gold_feat, v_gold_desc);
-
-        for (int elIter = 0; elIter < (int)n; elIter++) {
-            ASSERT_LE(fabs(out_feat[elIter].f[0] - gold_feat[elIter].f[0]), 1e-3) << "at: " << elIter << std::endl;
-            ASSERT_LE(fabs(out_feat[elIter].f[1] - gold_feat[elIter].f[1]), 1e-3) << "at: " << elIter << std::endl;
-            ASSERT_LE(fabs(out_feat[elIter].f[2] - gold_feat[elIter].f[2]), 1e-3) << "at: " << elIter << std::endl;
-            ASSERT_LE(fabs(out_feat[elIter].f[3] - gold_feat[elIter].f[3]), 0.5f) << "at: " << elIter << std::endl;
-            ASSERT_LE(fabs(out_feat[elIter].f[4] - gold_feat[elIter].f[4]), 1e-3) << "at: " << elIter << std::endl;
-        }
-
-        EXPECT_TRUE(compareEuclidean(descDims[0], descDims[1], (float*)&v_out_desc[0], (float*)&v_gold_desc[0], 1.f, 2.f));
-
-        ASSERT_EQ(AF_SUCCESS, af_release_array(inArray));
-        ASSERT_EQ(AF_SUCCESS, af_release_array(inArray_f32));
-
-        ASSERT_EQ(AF_SUCCESS, af_release_array(x));
-        ASSERT_EQ(AF_SUCCESS, af_release_array(y));
-        ASSERT_EQ(AF_SUCCESS, af_release_array(score));
-        ASSERT_EQ(AF_SUCCESS, af_release_array(orientation));
-        ASSERT_EQ(AF_SUCCESS, af_release_array(size));
-        ASSERT_EQ(AF_SUCCESS, af_release_array(desc));
-
-        delete[] outX;
-        delete[] outY;
-        delete[] outScore;
-        delete[] outOrientation;
-        delete[] outSize;
-        delete[] outDesc;
-    }
-#endif
-}
-
-#define SIFT_INIT(desc, image) \
-    TYPED_TEST(SIFT, desc) \
-    {   \
-        siftTest<TypeParam>(string(TEST_DIR"/sift/"#image".test"));   \
-    }
-
-    SIFT_INIT(man, man);
-
-///////////////////////////////////// CPP ////////////////////////////////
-//
-TEST(SIFT, CPP)
-{
-#ifdef AF_BUILD_SIFT
-    if (noDoubleTests<float>()) return;
-
-    vector<dim4>           inDims;
-    vector<string>         inFiles;
-    vector<vector<float> > goldFeat;
-    vector<vector<float> > goldDesc;
-
-    readImageFeaturesDescriptors<float>(string(TEST_DIR"/sift/man.test"), inDims, inFiles, goldFeat, goldDesc);
-    inFiles[0].insert(0,string(TEST_DIR"/sift/"));
-
-    af::array in = af::loadImage(inFiles[0].c_str(), false);
-
-    af::features feat;
-    af::array desc;
-    af::sift(feat, desc, in, 3, 0.04f, 10.0f, 1.6f, true, 1.f/256.f, 0.05f);
-
-    float * outX           = new float[feat.getNumFeatures()];
-    float * outY           = new float[feat.getNumFeatures()];
-    float * outScore       = new float[feat.getNumFeatures()];
-    float * outOrientation = new float[feat.getNumFeatures()];
-    float * outSize        = new float[feat.getNumFeatures()];
-    float * outDesc        = new float[desc.elements()];
-    af::dim4 descDims = desc.dims();
-    feat.getX().host(outX);
-    feat.getY().host(outY);
-    feat.getScore().host(outScore);
-    feat.getOrientation().host(outOrientation);
-    feat.getSize().host(outSize);
-    desc.host(outDesc);
-
-    vector<feat_desc_t> out_feat_desc;
-    array_to_feat_desc(out_feat_desc, outX, outY, outScore, outOrientation, outSize, outDesc, feat.getNumFeatures());
-
-    vector<feat_desc_t> gold_feat_desc;
-    array_to_feat_desc(gold_feat_desc, &goldFeat[0].front(), &goldFeat[1].front(), &goldFeat[2].front(), &goldFeat[3].front(), &goldFeat[4].front(), goldDesc, goldFeat[0].size());
-
-    std::stable_sort(out_feat_desc.begin(), out_feat_desc.end(), feat_cmp);
-    std::stable_sort(gold_feat_desc.begin(), gold_feat_desc.end(), feat_cmp);
-
-    vector<feat_t> out_feat;
-    vector<desc_t> v_out_desc;
-    vector<feat_t> gold_feat;
-    vector<desc_t> v_gold_desc;
-
-    split_feat_desc(out_feat_desc, out_feat, v_out_desc);
-    split_feat_desc(gold_feat_desc, gold_feat, v_gold_desc);
-
-    for (int elIter = 0; elIter < (int)feat.getNumFeatures(); elIter++) {
-        ASSERT_LE(fabs(out_feat[elIter].f[0] - gold_feat[elIter].f[0]), 1e-3) << "at: " << elIter << std::endl;
-        ASSERT_LE(fabs(out_feat[elIter].f[1] - gold_feat[elIter].f[1]), 1e-3) << "at: " << elIter << std::endl;
-        ASSERT_LE(fabs(out_feat[elIter].f[2] - gold_feat[elIter].f[2]), 1e-3) << "at: " << elIter << std::endl;
-        ASSERT_LE(fabs(out_feat[elIter].f[3] - gold_feat[elIter].f[3]), 0.5f) << "at: " << elIter << std::endl;
-        ASSERT_LE(fabs(out_feat[elIter].f[4] - gold_feat[elIter].f[4]), 1e-3) << "at: " << elIter << std::endl;
-    }
-
-    EXPECT_TRUE(compareEuclidean(descDims[0], descDims[1], (float*)&v_out_desc[0], (float*)&v_gold_desc[0], 1.f, 2.f));
-
-    delete[] outX;
-    delete[] outY;
-    delete[] outScore;
-    delete[] outOrientation;
-    delete[] outSize;
-    delete[] outDesc;
-#endif
-}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git


