[arrayfire] 244/408: Added OpenCL backend for SIFT

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:06 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit b87ba86552633001a2683b9c3890b6c88dc57b97
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date:   Thu Aug 13 10:28:56 2015 -0400

    Added OpenCL backend for SIFT
---
 src/backend/opencl/kernel/sift.cl  | 780 +++++++++++++++++++++++++++++++++++
 src/backend/opencl/kernel/sift.hpp | 822 +++++++++++++++++++++++++++++++++++++
 src/backend/opencl/sift.cpp        |  75 ++++
 src/backend/opencl/sift.hpp        |  26 ++
 4 files changed, 1703 insertions(+)

diff --git a/src/backend/opencl/kernel/sift.cl b/src/backend/opencl/kernel/sift.cl
new file mode 100644
index 0000000..1b856b4
--- /dev/null
+++ b/src/backend/opencl/kernel/sift.cl
@@ -0,0 +1,780 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+// The source code contained in this file is based on the original code by
+// Rob Hess. Please note that SIFT is an algorithm patented and protected
+// by US law, before using this code or any binary forms generated from it,
+// verify that you have permission to do so. The original license by Rob Hess
+// can be read below:
+//
+// Copyright (c) 2006-2012, Rob Hess <rob at iqengines.com>
+// All rights reserved.
+//
+// The following patent has been issued for methods embodied in this
+// software: "Method and apparatus for identifying scale invariant features
+// in an image and use of same for locating an object in an image," David
+// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
+// filed March 8, 1999. Asignee: The University of British Columbia. For
+// further details, contact David Lowe (lowe at cs.ubc.ca) or the
+// University-Industry Liaison Office of the University of British
+// Columbia.
+//
+// Note that restrictions imposed by this patent (and possibly others)
+// exist independently of and may be in conflict with the freedoms granted
+// in this license, which refers to copyright of the program, not patents
+// for any methods that it implements.  Both copyright and patent law must
+// be obeyed to legally use and redistribute this program and it is not the
+// purpose of this license to induce you to infringe any patents or other
+// property right claims or to contest validity of any such claims.  If you
+// redistribute or use the program, then this license merely protects you
+// from committing copyright infringement.  It does not protect you from
+// committing patent infringement.  So, before you do anything with this
+// program, make sure that you have permission to do so not merely in terms
+// of copyright, but also in terms of patent law.
+//
+// Please note that this license is not to be understood as a guarantee
+// either.  If you use the program according to this license, but in
+// conflict with patent law, it does not mean that the licensor will refund
+// you for any losses that you incur if you are sued for your patent
+// infringement.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//     * Redistributions of source code must retain the above copyright and
+//       patent notices, this list of conditions and the following
+//       disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in
+//       the documentation and/or other materials provided with the
+//       distribution.
+//     * Neither the name of Oregon State University nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// width of border in which to ignore keypoints
+#define IMG_BORDER 5
+
+// maximum steps of keypoint interpolation before failure
+#define MAX_INTERP_STEPS 5
+
+// default number of bins in histogram for orientation assignment
+#define ORI_HIST_BINS 36
+
+// determines gaussian sigma for orientation assignment
+#define ORI_SIG_FCTR 1.5f
+
+// determines the radius of the region used in orientation assignment
+#define ORI_RADIUS (3 * ORI_SIG_FCTR)
+
+// number of passes of orientation histogram smoothing
+#define SMOOTH_ORI_PASSES 2
+
+// orientation magnitude relative to max that results in new feature
+#define ORI_PEAK_RATIO 0.8f
+
+// determines the size of a single descriptor orientation histogram
+#define DESCR_SCL_FCTR 3.f
+
+// threshold on magnitude of elements of descriptor vector
+#define DESCR_MAG_THR 0.2f
+
+// factor used to convert floating-point descriptor to unsigned char
+#define INT_DESCR_FCTR 512.f
+
+#define PI_VAL 3.14159265358979323846f
+
+void gaussianElimination(float* A, float* b, float* x, const int n)
+{
+    // forward elimination
+    for (int i = 0; i < n-1; i++) {
+        for (int j = i+1; j < n; j++) {
+            float s = A[j*n+i] / A[i*n+i];
+
+            //for (int k = i+1; k < n; k++)
+            for (int k = i; k < n; k++)
+                A[j*n+k] -= s * A[i*n+k];
+
+            b[j] -= s * b[i];
+        }
+    }
+
+    for (int i = 0; i < n; i++)
+        x[i] = 0;
+
+    // backward substitution
+    float sum = 0;
+    for (int i = 0; i <= n-2; i++) {
+        sum = b[i];
+        for (int j = i+1; j < n; j++)
+            sum -= A[i*n+j] * x[j];
+        x[i] = sum / A[i*n+i];
+    }
+}
+
+inline void fatomic_add(volatile __local float *source, const float operand) {
+    union {
+        unsigned int intVal;
+        float floatVal;
+    } newVal;
+    union {
+        unsigned int intVal;
+        float floatVal;
+    } prevVal;
+    do {
+        prevVal.floatVal = *source;
+        newVal.floatVal = prevVal.floatVal + operand;
+    } while (atomic_cmpxchg((volatile __local unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
+}
+
+inline void normalizeDesc(
+    __local float* desc,
+    __local float* accum,
+    const int histlen,
+    int tid_x,
+    int tid_y,
+    int bsz_y)
+{
+    for (int i = tid_y; i < histlen; i += bsz_y)
+        accum[tid_y] = desc[tid_x*histlen+i]*desc[tid_x*histlen+i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    float sum = 0.0f;
+    for (int i = 0; i < histlen; i++)
+        sum += desc[tid_x*histlen+i]*desc[tid_x*histlen+i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid_y < 64)
+        accum[tid_y] += accum[tid_y+64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_y < 32)
+        accum[tid_y] += accum[tid_y+32];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_y < 16)
+        accum[tid_y] += accum[tid_y+16];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_y < 8)
+        accum[tid_y] += accum[tid_y+8];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_y < 4)
+        accum[tid_y] += accum[tid_y+4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_y < 2)
+        accum[tid_y] += accum[tid_y+2];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_y < 1)
+        accum[tid_y] += accum[tid_y+1];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    float len_sq = accum[0];
+    float len_inv = 1.0f / sqrt(len_sq);
+
+    for (int i = tid_y; i < histlen; i += bsz_y) {
+        desc[tid_x*histlen+i] *= len_inv;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__kernel void sub(
+    __global T* out,
+    __global const T* in,
+    unsigned nel,
+    unsigned n_layers)
+{
+    unsigned i = get_global_id(0);
+
+    for (unsigned l = 0; l < n_layers; l++)
+        out[l*nel + i] = in[l*nel + i] - in[(l+1)*nel + i];
+}
+
+// Determines whether a pixel is a scale-space extremum by comparing it to its
+// 3x3x3 pixel neighborhood.
+__kernel void detectExtrema(
+    __global float* x_out,
+    __global float* y_out,
+    __global unsigned* layer_out,
+    __global unsigned* counter,
+    __global const T* dog,
+    KParam iDoG,
+    const unsigned max_feat,
+    const float threshold)
+{
+    // One pixel border for each side
+    const int l_i = 32+2;
+    const int l_j = 8+2;
+
+    __local float l_mem[(32+2)*(8+2)*3];
+    __local float* l_prev   = l_mem;
+    __local float* l_center = l_mem + (32+2)*(8+2);
+    __local float* l_next   = l_mem + (32+2)*(8+2)*2;
+    __local float* l_tmp;
+
+    const int dim0 = iDoG.dims[0];
+    const int dim1 = iDoG.dims[1];
+    const int imel = iDoG.dims[0]*iDoG.dims[1];
+
+    const int lid_i = get_local_id(0);
+    const int lid_j = get_local_id(1);
+    const int lsz_i = get_local_size(0);
+    const int lsz_j = get_local_size(1);
+    const int i = get_group_id(0) * lsz_i + lid_i+IMG_BORDER;
+    const int j = get_group_id(1) * lsz_j + lid_j+IMG_BORDER;
+
+    const int x = lid_i+1;
+    const int y = lid_j+1;
+
+    for (int l = 1; l < iDoG.dims[2]-1; l++) {
+        const int l_i_half = l_i/2;
+        const int l_j_half = l_j/2;
+        if (lid_i < l_i_half && lid_j < l_j_half && i < dim0-IMG_BORDER+1 && j < dim1-IMG_BORDER+1) {
+                l_next  [lid_j*l_i + lid_i] = dog[(l+1)*imel+(j-1)*dim0+i-1];
+                l_center[lid_j*l_i + lid_i] = dog[(l  )*imel+(j-1)*dim0+i-1];
+                l_prev  [lid_j*l_i + lid_i] = dog[(l-1)*imel+(j-1)*dim0+i-1];
+
+                l_next  [lid_j*l_i + lid_i+l_i_half] = dog[(l+1)*imel+(j-1)*dim0+i-1+l_i_half];
+                l_center[lid_j*l_i + lid_i+l_i_half] = dog[(l  )*imel+(j-1)*dim0+i-1+l_i_half];
+                l_prev  [lid_j*l_i + lid_i+l_i_half] = dog[(l-1)*imel+(j-1)*dim0+i-1+l_i_half];
+
+                l_next  [(lid_j+l_j_half)*l_i + lid_i] = dog[(l+1)*imel+(j-1+l_j_half)*dim0+i-1];
+                l_center[(lid_j+l_j_half)*l_i + lid_i] = dog[(l  )*imel+(j-1+l_j_half)*dim0+i-1];
+                l_prev  [(lid_j+l_j_half)*l_i + lid_i] = dog[(l-1)*imel+(j-1+l_j_half)*dim0+i-1];
+
+                l_next  [(lid_j+l_j_half)*l_i + lid_i+l_i_half] = dog[(l+1)*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
+                l_center[(lid_j+l_j_half)*l_i + lid_i+l_i_half] = dog[(l  )*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
+                l_prev  [(lid_j+l_j_half)*l_i + lid_i+l_i_half] = dog[(l-1)*imel+(j-1+l_j_half)*dim0+i-1+l_i_half];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if (i < dim0-IMG_BORDER && j < dim1-IMG_BORDER) {
+            const int l_i_half = l_i/2;
+            float p = l_center[y*l_i + x];
+
+            if (fabs((float)p) > threshold &&
+                ((p > 0                         && p > l_center[(y-1)*l_i + x-1] && p > l_center[(y-1)*l_i + x]   &&
+                  p > l_center[(y-1)*l_i + x+1] && p > l_center[y*l_i + (x-1)]   && p > l_center[y*l_i + x+1]     &&
+                  p > l_center[(y+1)*l_i + x-1] && p > l_center[(y+1)*l_i + x]   && p > l_center[(y+1)*l_i + x+1] &&
+                  p > l_prev[(y-1)*l_i + x-1]   && p > l_prev[(y-1)*l_i + x]     && p > l_prev[(y-1)*l_i + x+1]   &&
+                  p > l_prev[y*l_i + x-1]       && p > l_prev[y*l_i + x]         && p > l_prev[y*l_i + x+1]       &&
+                  p > l_prev[(y+1)*l_i + x-1]   && p > l_prev[(y+1)*l_i + x]     && p > l_prev[(y+1)*l_i + x+1]   &&
+                  p > l_next[(y-1)*l_i + x-1]   && p > l_next[(y-1)*l_i + x]     && p > l_next[(y-1)*l_i + x+1]   &&
+                  p > l_next[y*l_i + x-1]       && p > l_next[y*l_i + x]         && p > l_next[y*l_i + x+1]       &&
+                  p > l_next[(y+1)*l_i + x-1]   && p > l_next[(y+1)*l_i + x]     && p > l_next[(y+1)*l_i + x+1])  ||
+                 (p < 0                         && p < l_center[(y-1)*l_i + x-1] && p < l_center[(y-1)*l_i + x]   &&
+                  p < l_center[(y-1)*l_i + x+1] && p < l_center[y*l_i + (x-1)]   && p < l_center[y*l_i + x+1]     &&
+                  p < l_center[(y+1)*l_i + x-1] && p < l_center[(y+1)*l_i + x]   && p < l_center[(y+1)*l_i + x+1] &&
+                  p < l_prev[(y-1)*l_i + x-1]   && p < l_prev[(y-1)*l_i + x]     && p < l_prev[(y-1)*l_i + x+1]   &&
+                  p < l_prev[y*l_i + x-1]       && p < l_prev[y*l_i + x]         && p < l_prev[y*l_i + x+1]       &&
+                  p < l_prev[(y+1)*l_i + x-1]   && p < l_prev[(y+1)*l_i + x]     && p < l_prev[(y+1)*l_i + x+1]   &&
+                  p < l_next[(y-1)*l_i + x-1]   && p < l_next[(y-1)*l_i + x]     && p < l_next[(y-1)*l_i + x+1]   &&
+                  p < l_next[y*l_i + x-1]       && p < l_next[y*l_i + x]         && p < l_next[y*l_i + x+1]       &&
+                  p < l_next[(y+1)*l_i + x-1]   && p < l_next[(y+1)*l_i + x]     && p < l_next[(y+1)*l_i + x+1]))) {
+
+                unsigned idx = atomic_inc(counter);
+                if (idx < max_feat)
+                {
+                    x_out[idx] = (float)j;
+                    y_out[idx] = (float)i;
+                    layer_out[idx] = l;
+                }
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+// Interpolates a scale-space extremum's location and scale to subpixel
+// accuracy to form an image feature. Rejects features with low contrast.
+// Based on Section 4 of Lowe's paper.
+__kernel void interpolateExtrema(
+    __global float* x_out,
+    __global float* y_out,
+    __global unsigned* layer_out,
+    __global float* response_out,
+    __global float* size_out,
+    __global unsigned* counter,
+    __global const float* x_in,
+    __global const float* y_in,
+    __global const unsigned* layer_in,
+    const unsigned extrema_feat,
+    __global const T* dog_octave,
+    KParam iDoG,
+    const unsigned max_feat,
+    const unsigned octave,
+    const unsigned n_layers,
+    const float contrast_thr,
+    const float edge_thr,
+    const float sigma,
+    const float img_scale)
+{
+    const unsigned f = get_global_id(0);
+
+    if (f < extrema_feat)
+    {
+        const float first_deriv_scale = img_scale*0.5f;
+        const float second_deriv_scale = img_scale;
+        const float cross_deriv_scale = img_scale*0.25f;
+
+        float xl = 0, xy = 0, xx = 0, contr = 0;
+        int i = 0;
+
+        unsigned x = x_in[f];
+        unsigned y = y_in[f];
+        unsigned layer = layer_in[f];
+
+        const int dim0 = iDoG.dims[0];
+        const int dim1 = iDoG.dims[1];
+        const int imel = dim0 * dim1;
+
+        __global const T* prev   = dog_octave + (int)((layer-1)*imel);
+        __global const T* center = dog_octave + (int)((layer  )*imel);
+        __global const T* next   = dog_octave + (int)((layer+1)*imel);
+
+        for(i = 0; i < MAX_INTERP_STEPS; i++) {
+            float dD[3] = {(center[(x+1)*dim0+y] - center[(x-1)*dim0+y]) * first_deriv_scale,
+                           (center[x*dim0+y+1] - center[x*dim0+y-1]) * first_deriv_scale,
+                           (next[x*dim0+y] - prev[x*dim0+y]) * first_deriv_scale};
+
+            float d2 = center[x*dim0+y]*2.f;
+            float dxx = (center[(x+1)*dim0+y] + center[(x-1)*dim0+y] - d2)*second_deriv_scale;
+            float dyy = (center[x*dim0+y+1] + center[x*dim0+y-1] - d2)*second_deriv_scale;
+            float dss = (next[x*dim0+y] + prev[x*dim0+y] - d2)*second_deriv_scale;
+            float dxy = (center[(x+1)*dim0+y+1] - center[(x-1)*dim0+y+1] -
+                         center[(x+1)*dim0+y-1] + center[(x-1)*dim0+y-1])*cross_deriv_scale;
+            float dxs = (next[(x+1)*dim0+y] - next[(x-1)*dim0+y] -
+                         prev[(x+1)*dim0+y] + prev[(x-1)*dim0+y])*cross_deriv_scale;
+            float dys = (next[x*dim0+y+1] - next[x*dim0+y-1] -
+                         prev[x*dim0+y+1] + prev[x*dim0+y-1])*cross_deriv_scale;
+
+            float H[9] = {dxx, dxy, dxs,
+                          dxy, dyy, dys,
+                          dxs, dys, dss};
+
+            float X[3];
+            gaussianElimination(H, dD, X, 3);
+
+            xl = -X[2];
+            xy = -X[1];
+            xx = -X[0];
+
+            if(fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f)
+                break;
+
+            x += round(xx);
+            y += round(xy);
+            layer += round(xl);
+
+            if(layer < 1 || layer > n_layers ||
+               x < IMG_BORDER || x >= dim1 - IMG_BORDER ||
+               y < IMG_BORDER || y >= dim0 - IMG_BORDER)
+                return;
+        }
+
+        // ensure convergence of interpolation
+        if (i >= MAX_INTERP_STEPS)
+            return;
+
+        float dD[3] = {(center[(x+1)*dim0+y] - center[(x-1)*dim0+y]) * first_deriv_scale,
+                       (center[x*dim0+y+1] - center[x*dim0+y-1]) * first_deriv_scale,
+                       (next[x*dim0+y] - prev[(x-1)*dim0+y]) * first_deriv_scale};
+        float X[3] = {xx, xy, xl};
+
+        float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
+
+        contr = center[x*dim0+y]*img_scale + P * 0.5f;
+        if(fabs(contr) < (contrast_thr / n_layers))
+            return;
+
+        // principal curvatures are computed using the trace and det of Hessian
+        float d2 = center[x*dim0+y]*2.f;
+        float dxx = (center[(x+1)*dim0+y] + center[(x-1)*dim0+y] - d2) * second_deriv_scale;
+        float dyy = (center[x*dim0+y+1] + center[x*dim0+y-1] - d2) * second_deriv_scale;
+        float dxy = (center[(x+1)*dim0+y+1] - center[(x-1)*dim0+y+1] -
+                     center[(x+1)*dim0+y-1] + center[(x-1)*dim0+y-1]) * cross_deriv_scale;
+
+        float tr = dxx + dyy;
+        float det = dxx * dyy - dxy * dxy;
+
+        // add FLT_EPSILON for double-precision compatibility
+        if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
+            return;
+
+        unsigned ridx = atomic_inc(counter);
+
+        if (ridx < max_feat)
+        {
+            x_out[ridx] = (x + xx) * (1 << octave);
+            y_out[ridx] = (y + xy) * (1 << octave);
+            layer_out[ridx] = layer;
+            response_out[ridx] = fabs(contr);
+            size_out[ridx] = sigma*pow(2.f, octave + (layer + xl) / n_layers);
+        }
+    }
+}
+
+// Remove duplicate keypoints
+__kernel void removeDuplicates(
+    __global float* x_out,
+    __global float* y_out,
+    __global unsigned* layer_out,
+    __global float* response_out,
+    __global float* size_out,
+    __global unsigned* counter,
+    __global const float* x_in,
+    __global const float* y_in,
+    __global const unsigned* layer_in,
+    __global const float* response_in,
+    __global const float* size_in,
+    const unsigned total_feat)
+{
+    const unsigned f = get_global_id(0);
+
+    if (f < total_feat) {
+        const float prec_fctr = 1e4f;
+
+        bool cond = (f < total_feat-1)
+                    ? !(round(x_in[f]*prec_fctr) == round(x_in[f+1]*prec_fctr) &&
+                        round(y_in[f]*prec_fctr) == round(y_in[f+1]*prec_fctr) &&
+                        layer_in[f] == layer_in[f+1] &&
+                        round(response_in[f]*prec_fctr) == round(response_in[f+1]*prec_fctr) &&
+                        round(size_in[f]*prec_fctr) == round(size_in[f+1]*prec_fctr))
+                    : true;
+
+        if (cond) {
+            unsigned idx = atomic_inc(counter);
+
+            x_out[idx] = x_in[f];
+            y_out[idx] = y_in[f];
+            layer_out[idx] = layer_in[f];
+            response_out[idx] = response_in[f];
+            size_out[idx] = size_in[f];
+        }
+    }
+
+}
+
+// Computes a canonical orientation for each image feature in an array.  Based
+// on Section 5 of Lowe's paper.  This function adds features to the array when
+// there is more than one dominant orientation at a given feature location.
+__kernel void calcOrientation(
+    __global float* x_out,
+    __global float* y_out,
+    __global unsigned* layer_out,
+    __global float* response_out,
+    __global float* size_out,
+    __global float* ori_out,
+    __global unsigned* counter,
+    __global const float* x_in,
+    __global const float* y_in,
+    __global const unsigned* layer_in,
+    __global const float* response_in,
+    __global const float* size_in,
+    const unsigned total_feat,
+    __global const T* gauss_octave,
+    KParam iGauss,
+    const unsigned max_feat,
+    const unsigned octave,
+    const int double_input)
+{
+    const unsigned f = get_global_id(0);
+    const int tid_x = get_local_id(0);
+    const int tid_y = get_local_id(1);
+    const int bsz_y = get_local_size(1);
+
+    const int n = ORI_HIST_BINS;
+
+    const int hdim = ORI_HIST_BINS;
+    const int thdim = ORI_HIST_BINS;
+    __local float hist[ORI_HIST_BINS*8];
+    __local float temphist[ORI_HIST_BINS*8];
+
+    if (f < total_feat) {
+        // Load keypoint information
+        const float real_x = x_in[f];
+        const float real_y = y_in[f];
+        const unsigned layer = layer_in[f];
+        const float response = response_in[f];
+        const float size = size_in[f];
+
+        const int pt_x = (int)round(real_x / (1 << octave));
+        const int pt_y = (int)round(real_y / (1 << octave));
+
+        // Calculate auxiliary parameters
+        const float scl_octv = size*0.5f / (1 << octave);
+        const int radius = (int)round(ORI_RADIUS * scl_octv);
+        const float sigma = ORI_SIG_FCTR * scl_octv;
+        const int len = (radius*2+1);
+        const float exp_denom = 2.f * sigma * sigma;
+
+        // Initialize temporary histogram
+        for (int i = tid_y; i < ORI_HIST_BINS; i += bsz_y) {
+            hist[tid_x*hdim + i] = 0.f;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        const int dim0 = iGauss.dims[0];
+        const int dim1 = iGauss.dims[1];
+
+        // Calculate layer offset
+        const int layer_offset = layer * dim0 * dim1;
+        __global const T* img = gauss_octave + layer_offset;
+
+        // Calculate orientation histogram
+        for (int l = tid_y; l < len*len; l += bsz_y) {
+            int i = l / len - radius;
+            int j = l % len - radius;
+
+            int y = pt_y + i;
+            int x = pt_x + j;
+            if (y < 1 || y >= dim0 - 1 ||
+                x < 1 || x >= dim1 - 1)
+                continue;
+
+            float dx = (float)(img[(x+1)*dim0+y] - img[(x-1)*dim0+y]);
+            float dy = (float)(img[x*dim0+y-1] - img[x*dim0+y+1]);
+
+            float mag = sqrt(dx*dx+dy*dy);
+            float ori = atan2(dy,dx);
+            float w = exp(-(i*i + j*j)/exp_denom);
+
+            int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
+            bin = bin < n ? bin : 0;
+            bin = (bin < 0) ? 0 : (bin >= n) ? n-1 : bin;
+
+            fatomic_add(&hist[tid_x*hdim+bin], w*mag);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for (int i = 0; i < SMOOTH_ORI_PASSES; i++) {
+            for (int j = tid_y; j < n; j += bsz_y) {
+                temphist[tid_x*hdim+j] = hist[tid_x*hdim+j];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            for (int j = tid_y; j < n; j += bsz_y) {
+                float prev = (j == 0) ? temphist[tid_x*hdim+n-1] : temphist[tid_x*hdim+j-1];
+                float next = (j+1 == n) ? temphist[tid_x*hdim] : temphist[tid_x*hdim+j+1];
+                hist[tid_x*hdim+j] = 0.25f * prev + 0.5f * temphist[tid_x*hdim+j] + 0.25f * next;
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        for (int i = tid_y; i < n; i += bsz_y)
+            temphist[tid_x*hdim+i] = hist[tid_x*hdim+i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if (tid_y < 16)
+            temphist[tid_x*thdim+tid_y] = fmax(hist[tid_x*hdim+tid_y], hist[tid_x*hdim+tid_y+16]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (tid_y < 8)
+            temphist[tid_x*thdim+tid_y] = fmax(temphist[tid_x*thdim+tid_y], temphist[tid_x*thdim+tid_y+8]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (tid_y < 4) {
+            temphist[tid_x*thdim+tid_y] = fmax(temphist[tid_x*thdim+tid_y], hist[tid_x*hdim+tid_y+32]);
+            temphist[tid_x*thdim+tid_y] = fmax(temphist[tid_x*thdim+tid_y], temphist[tid_x*thdim+tid_y+4]);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (tid_y < 2)
+            temphist[tid_x*thdim+tid_y] = fmax(temphist[tid_x*thdim+tid_y], temphist[tid_x*thdim+tid_y+2]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (tid_y < 1)
+            temphist[tid_x*thdim+tid_y] = fmax(temphist[tid_x*thdim+tid_y], temphist[tid_x*thdim+tid_y+1]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        float omax = temphist[tid_x*thdim];
+
+        float mag_thr = (float)(omax * ORI_PEAK_RATIO);
+        int l, r;
+        float bin;
+        for (int j = tid_y; j < n; j+=bsz_y) {
+            l = (j == 0) ? n - 1 : j - 1;
+            r = (j + 1) % n;
+            if (hist[tid_x*hdim+j] > hist[tid_x*hdim+l] &&
+                hist[tid_x*hdim+j] > hist[tid_x*hdim+r] &&
+                hist[tid_x*hdim+j] >= mag_thr) {
+                unsigned idx = atomic_inc(counter);
+
+                if (idx < max_feat) {
+                    float bin = j + 0.5f * (hist[tid_x*hdim+l] - hist[tid_x*hdim+r]) /
+                                (hist[tid_x*hdim+l] - 2.0f*hist[tid_x*hdim+j] + hist[tid_x*hdim+r]);
+                    bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
+                    float ori = 360.f - ((360.f/n) * bin);
+
+                    float new_real_x = real_x;
+                    float new_real_y = real_y;
+                    float new_size = size;
+
+                    if (double_input != 0) {
+                        float scale = 0.5f;
+                        new_real_x *= scale;
+                        new_real_y *= scale;
+                        new_size *= scale;
+                    }
+
+                    x_out[idx] = new_real_x;
+                    y_out[idx] = new_real_y;
+                    layer_out[idx] = layer;
+                    response_out[idx] = response;
+                    size_out[idx] = new_size;
+                    ori_out[idx] = ori;
+                }
+            }
+        }
+    }
+}
+
+// Computes feature descriptors for features in an array.  Based on Section 6
+// of Lowe's paper.
+__kernel void computeDescriptor(
+    __global float* desc_out,
+    const unsigned desc_len,
+    __global const float* x_in,
+    __global const float* y_in,
+    __global const unsigned* layer_in,
+    __global const float* response_in,
+    __global const float* size_in,
+    __global const float* ori_in,
+    const unsigned total_feat,
+    __global const T* gauss_octave,
+    KParam iGauss,
+    const int d,
+    const int n,
+    const float scale,
+    const float sigma,
+    const int n_layers)
+{
+    const int f = get_global_id(0);
+    const int tid_x = get_local_id(0);
+    const int tid_y = get_local_id(1);
+    const int bsz_x = get_local_size(0);
+    const int bsz_y = get_local_size(1);
+
+    const int histsz = 8;
+    __local float desc[128*8];
+    __local float accum[128];
+
+    if (f < total_feat) {
+        const unsigned layer = layer_in[f];
+        float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
+        ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
+        const float size = size_in[f];
+        const int fx = round(x_in[f] * scale);
+        const int fy = round(y_in[f] * scale);
+
+        // Points img to correct Gaussian pyramid layer
+        const int dim0 = iGauss.dims[0];
+        const int dim1 = iGauss.dims[1];
+        __global const T* img = gauss_octave + (layer * dim0 * dim1);
+
+        float cos_t = cos(ori);
+        float sin_t = sin(ori);
+        float bins_per_rad = n / (PI_VAL * 2.f);
+        float exp_denom = d * d * 0.5f;
+        float hist_width = DESCR_SCL_FCTR * sigma * pow(2.f, layer/n_layers);
+        int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
+
+        int len = radius*2+1;
+        const int histlen = d*d*n;
+        const int hist_off = (tid_y % histsz) * 128;
+
+        for (int i = tid_y; i < histlen*histsz; i += bsz_y)
+            desc[tid_x*histlen+i] = 0.f;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Calculate orientation histogram
+        for (int l = tid_y; l < len*len; l += bsz_y) {
+            int i = l / len - radius;
+            int j = l % len - radius;
+
+            int y = fy + i;
+            int x = fx + j;
+
+            float x_rot = (j * cos_t - i * sin_t) / hist_width;
+            float y_rot = (j * sin_t + i * cos_t) / hist_width;
+            float xbin = x_rot + d/2 - 0.5f;
+            float ybin = y_rot + d/2 - 0.5f;
+
+            if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d &&
+                y > 0 && y < dim0 - 1 && x > 0 && x < dim1 - 1) {
+                float dx = img[(x+1)*dim0+y] - img[(x-1)*dim0+y];
+                float dy = img[x*dim0+(y-1)] - img[x*dim0+(y+1)];
+
+                float grad_mag = sqrt(dx*dx + dy*dy);
+                float grad_ori = atan2(dy, dx) - ori;
+                while (grad_ori < 0.0f)
+                    grad_ori += PI_VAL*2;
+                while (grad_ori >= PI_VAL*2)
+                    grad_ori -= PI_VAL*2;
+
+                float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
+                float obin = grad_ori * bins_per_rad;
+                float mag = grad_mag*w;
+
+                int x0 = floor(xbin);
+                int y0 = floor(ybin);
+                int o0 = floor(obin);
+                xbin -= x0;
+                ybin -= y0;
+                obin -= o0;
+
+                for (int yl = 0; yl <= 1; yl++) {
+                    int yb = y0 + yl;
+                    if (yb >= 0 && yb < d) {
+	                    float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
+	                    for (int xl = 0; xl <= 1; xl++) {
+	                        int xb = x0 + xl;
+	                        if (xb >= 0 && xb < d) {
+		                        float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
+		                        for (int ol = 0; ol <= 1; ol++) {
+		                            int ob = (o0 + ol) % n;
+		                            float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
+		                            fatomic_add(&desc[hist_off + tid_x*128 + (yb*d + xb)*n + ob], v_o);
+		                        }
+		                    }
+	                    }
+	                }
+                }
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Combine histograms (reduces previous atomicAdd overhead)
+        for (int l = tid_y; l < 128*4; l += bsz_y)
+            desc[l] += desc[l+4*128];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        for (int l = tid_y; l < 128*2; l += bsz_y)
+            desc[l    ] += desc[l+2*128];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        for (int l = tid_y; l < 128; l += bsz_y)
+            desc[l] += desc[l+128];
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        normalizeDesc(desc, accum, histlen, tid_x, tid_y, bsz_y);
+
+        for (int i = tid_y; i < d*d*n; i += bsz_y)
+            desc[tid_x*128+i] = min(desc[tid_x*128+i], DESCR_MAG_THR);
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        normalizeDesc(desc, accum, histlen, tid_x, tid_y, bsz_y);
+
+        // Calculate final descriptor values
+        for (int k = tid_y; k < d*d*n; k += bsz_y) {
+            desc_out[f*desc_len+k] = round(min(255.f, desc[tid_x*128+k] * INT_DESCR_FCTR));
+        }
+    }
+}
diff --git a/src/backend/opencl/kernel/sift.hpp b/src/backend/opencl/kernel/sift.hpp
new file mode 100644
index 0000000..1bb2dcc
--- /dev/null
+++ b/src/backend/opencl/kernel/sift.hpp
@@ -0,0 +1,822 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+// The source code contained in this file is based on the original code by
+// Rob Hess. Please note that SIFT is an algorithm patented and protected
+// by US law, before using this code or any binary forms generated from it,
+// verify that you have permission to do so. The original license by Rob Hess
+// can be read below:
+//
+// Copyright (c) 2006-2012, Rob Hess <rob at iqengines.com>
+// All rights reserved.
+//
+// The following patent has been issued for methods embodied in this
+// software: "Method and apparatus for identifying scale invariant features
+// in an image and use of same for locating an object in an image," David
+// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
+// filed March 8, 1999. Asignee: The University of British Columbia. For
+// further details, contact David Lowe (lowe at cs.ubc.ca) or the
+// University-Industry Liaison Office of the University of British
+// Columbia.
+//
+// Note that restrictions imposed by this patent (and possibly others)
+// exist independently of and may be in conflict with the freedoms granted
+// in this license, which refers to copyright of the program, not patents
+// for any methods that it implements.  Both copyright and patent law must
+// be obeyed to legally use and redistribute this program and it is not the
+// purpose of this license to induce you to infringe any patents or other
+// property right claims or to contest validity of any such claims.  If you
+// redistribute or use the program, then this license merely protects you
+// from committing copyright infringement.  It does not protect you from
+// committing patent infringement.  So, before you do anything with this
+// program, make sure that you have permission to do so not merely in terms
+// of copyright, but also in terms of patent law.
+//
+// Please note that this license is not to be understood as a guarantee
+// either.  If you use the program according to this license, but in
+// conflict with patent law, it does not mean that the licensor will refund
+// you for any losses that you incur if you are sued for your patent
+// infringement.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//     * Redistributions of source code must retain the above copyright and
+//       patent notices, this list of conditions and the following
+//       disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in
+//       the documentation and/or other materials provided with the
+//       distribution.
+//     * Neither the name of Oregon State University nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <af/defines.h>
+#include <program.hpp>
+#include <dispatch.hpp>
+#include <err_opencl.hpp>
+#include <debug_opencl.hpp>
+#include <boost/compute/core.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/algorithm/gather.hpp>
+#include <boost/compute/algorithm/iota.hpp>
+#include <boost/compute/algorithm/sort_by_key.hpp>
+#include <convolve_common.hpp>
+#include <kernel/convolve_separable.hpp>
+#include <kernel/fast.hpp>
+#include <kernel/resize.hpp>
+#include <kernel/sort_index.hpp>
+#include <kernel_headers/sift.hpp>
+#include <memory.hpp>
+#include <vector>
+
+using cl::Buffer;
+using cl::Program;
+using cl::Kernel;
+using cl::EnqueueArgs;
+using cl::LocalSpaceArg;
+using cl::NDRange;
+using std::vector;
+
+namespace opencl
+{
+
+namespace kernel
+{
+
+static const int SIFT_THREADS   = 256;
+static const int SIFT_THREADS_X = 16;
+static const int SIFT_THREADS_Y = 16;
+
+static const float InitSigma = 0.5f;
+static const int ImgBorder = 5;
+
+// default width of descriptor histogram array
+static const int DescrWidth = 4;
+
+// default number of bins per histogram in descriptor array
+static const int DescrHistBins = 8;
+
+static const float PI_VAL = 3.14159265358979323846f;
+
+template<typename T>
+void gaussian1D(T* out, const int dim, double sigma=0.0)
+{
+    if(!(sigma>0)) sigma = 0.25*dim;
+
+    T sum = (T)0;
+    for(int i=0;i<dim;i++)
+    {
+        int x = i-(dim-1)/2;
+        T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
+        out[i] = el;
+        sum   += el;
+    }
+
+    for(int k=0;k<dim;k++)
+        out[k] /= sum;
+}
+
+template<typename T>
+Param gaussFilter(float sigma)
+{
+    // Using 6-sigma rule
+    unsigned gauss_len = std::min((unsigned)round(sigma * 6 + 1) | 1, 31u);
+
+    T* h_gauss = new T[gauss_len];
+    gaussian1D(h_gauss, gauss_len, sigma);
+
+    Param gauss_filter;
+    gauss_filter.info.offset = 0;
+    gauss_filter.info.dims[0] = gauss_len;
+    gauss_filter.info.strides[0] = 1;
+
+    for (int k = 1; k < 4; k++) {
+        gauss_filter.info.dims[k] = 1;
+        gauss_filter.info.strides[k] = gauss_filter.info.dims[k-1] * gauss_filter.info.strides[k-1];
+    }
+
+    dim_t gauss_elem = gauss_filter.info.strides[3] * gauss_filter.info.dims[3];
+    gauss_filter.data = bufferAlloc(gauss_elem * sizeof(T));
+    getQueue().enqueueWriteBuffer(*gauss_filter.data, CL_TRUE, 0, gauss_elem * sizeof(T), h_gauss);
+    getQueue().enqueueWriteBuffer(*gauss_filter.data, CL_TRUE, 0, gauss_elem * sizeof(T), h_gauss);
+
+    delete[] h_gauss;
+
+    return gauss_filter;
+}
+
+template<typename T, typename convAccT>
+void convHelper(Param& dst, Param src, Param filter)
+{
+    Param tmp;
+    tmp.info.offset = 0;
+    for (int k = 0; k < 4; k++) {
+        tmp.info.dims[k] = src.info.dims[k];
+        tmp.info.strides[k] = src.info.strides[k];
+    }
+
+    const dim_t src_el = src.info.dims[3] * src.info.strides[3];
+    tmp.data = bufferAlloc(src_el * sizeof(T));
+
+    const dim_t fLen = filter.info.dims[0];
+    switch(fLen) {
+        case 3:
+            convolve2<T, convAccT, 0, false, 3 >(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 3 >(dst, tmp, filter);
+            break;
+        case 5:
+            convolve2<T, convAccT, 0, false, 5 >(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 5 >(dst, tmp, filter);
+            break;
+        case 7:
+            convolve2<T, convAccT, 0, false, 7 >(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 7 >(dst, tmp, filter);
+            break;
+        case 9:
+            convolve2<T, convAccT, 0, false, 9 >(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 9 >(dst, tmp, filter);
+            break;
+        case 11:
+            convolve2<T, convAccT, 0, false, 11>(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 11>(dst, tmp, filter);
+            break;
+        case 13:
+            convolve2<T, convAccT, 0, false, 13>(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 13>(dst, tmp, filter);
+            break;
+        case 15:
+            convolve2<T, convAccT, 0, false, 15>(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 15>(dst, tmp, filter);
+            break;
+        case 17:
+            convolve2<T, convAccT, 0, false, 17>(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 17>(dst, tmp, filter);
+            break;
+        case 19:
+            convolve2<T, convAccT, 0, false, 19>(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 19>(dst, tmp, filter);
+            break;
+        case 21:
+            convolve2<T, convAccT, 0, false, 21>(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 21>(dst, tmp, filter);
+            break;
+        case 23:
+            convolve2<T, convAccT, 0, false, 23>(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 23>(dst, tmp, filter);
+            break;
+        case 25:
+            convolve2<T, convAccT, 0, false, 25>(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 25>(dst, tmp, filter);
+            break;
+        case 27:
+            convolve2<T, convAccT, 0, false, 27>(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 27>(dst, tmp, filter);
+            break;
+        case 29:
+            convolve2<T, convAccT, 0, false, 29>(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 29>(dst, tmp, filter);
+            break;
+        case 31:
+            convolve2<T, convAccT, 0, false, 31>(tmp, src, filter);
+            convolve2<T, convAccT, 1, false, 31>(dst, tmp, filter);
+            break;
+    }
+
+    bufferFree(tmp.data);
+}
+
+template<typename T, typename convAccT>
+Param createInitialImage(
+    Param img,
+    const float init_sigma,
+    const bool double_input)
+{
+    Param init_img;
+    init_img.info.offset = 0;
+    init_img.info.dims[0] = (double_input) ? img.info.dims[0] * 2 : img.info.dims[0];
+    init_img.info.dims[1] = (double_input) ? img.info.dims[1] * 2 : img.info.dims[1];
+    init_img.info.strides[0] = 1;
+    init_img.info.strides[1] = init_img.info.dims[0];
+
+    for (int k = 2; k < 4; k++) {
+        init_img.info.dims[k] = 1;
+        init_img.info.strides[k] = init_img.info.dims[k-1] * init_img.info.strides[k-1];
+    }
+
+    dim_t init_img_el = init_img.info.strides[3] * init_img.info.dims[3];
+    init_img.data = bufferAlloc(init_img_el * sizeof(T));
+
+    float s = (double_input) ? sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4)
+                             : sqrt(init_sigma * init_sigma - InitSigma * InitSigma);
+
+    Param filter = gaussFilter<convAccT>(s);
+
+    if (double_input)
+        resize<T, AF_INTERP_BILINEAR>(init_img, img);
+
+    convHelper<T, convAccT>(init_img, (double_input) ? init_img : img, filter);
+
+    bufferFree(filter.data);
+
+    return init_img;
+}
+
+template<typename T, typename convAccT>
+std::vector<Param> buildGaussPyr(
+    Param init_img,
+    const unsigned n_octaves,
+    const unsigned n_layers,
+    const float init_sigma)
+{
+    // Precompute Gaussian sigmas using the following formula:
+    // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
+    std::vector<float> sig_layers(n_layers + 3);
+    sig_layers[0] = init_sigma;
+    float k = std::pow(2.0f, 1.0f / n_layers);
+    for (unsigned i = 1; i < n_layers + 3; i++) {
+        float sig_prev = std::pow(k, i-1) * init_sigma;
+        float sig_total = sig_prev * k;
+        sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
+    }
+
+    // Gaussian Pyramid
+    std::vector<Param> gauss_pyr(n_octaves);
+    std::vector<Param> tmp_pyr(n_octaves * (n_layers+3));
+    for (unsigned o = 0; o < n_octaves; o++) {
+        gauss_pyr[o].info.offset = 0;
+        gauss_pyr[o].info.dims[0] = (o == 0) ? init_img.info.dims[0] : gauss_pyr[o-1].info.dims[0] / 2;
+        gauss_pyr[o].info.dims[1] = (o == 0) ? init_img.info.dims[1] : gauss_pyr[o-1].info.dims[1] / 2;
+        gauss_pyr[o].info.dims[2] = n_layers+3;
+        gauss_pyr[o].info.dims[3] = 1;
+
+        gauss_pyr[o].info.strides[0] = 1;
+        gauss_pyr[o].info.strides[1] = gauss_pyr[o].info.dims[0] * gauss_pyr[o].info.strides[0];
+        gauss_pyr[o].info.strides[2] = gauss_pyr[o].info.dims[1] * gauss_pyr[o].info.strides[1];
+        gauss_pyr[o].info.strides[3] = gauss_pyr[o].info.dims[2] * gauss_pyr[o].info.strides[2];
+
+        const unsigned nel = gauss_pyr[o].info.dims[3] * gauss_pyr[o].info.strides[3];
+        gauss_pyr[o].data = bufferAlloc(nel * sizeof(T));
+
+        for (unsigned l = 0; l < n_layers+3; l++) {
+            unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1;
+            unsigned idx = o*(n_layers+3) + l;
+
+            tmp_pyr[o].info.offset = 0;
+            if (o == 0 && l == 0) {
+                for (int k = 0; k < 4; k++) {
+                    tmp_pyr[idx].info.dims[k] = init_img.info.dims[k];
+                    tmp_pyr[idx].info.strides[k] = init_img.info.strides[k];
+                }
+                tmp_pyr[idx].data = init_img.data;
+            }
+            else if (l == 0) {
+                tmp_pyr[idx].info.dims[0] = tmp_pyr[src_idx].info.dims[0] / 2;
+                tmp_pyr[idx].info.dims[1] = tmp_pyr[src_idx].info.dims[1] / 2;
+                tmp_pyr[idx].info.strides[0] = 1;
+                tmp_pyr[idx].info.strides[1] = tmp_pyr[idx].info.dims[0];
+
+                for (int k = 2; k < 4; k++) {
+                    tmp_pyr[idx].info.dims[k] = 1;
+                    tmp_pyr[idx].info.strides[k] = tmp_pyr[idx].info.dims[k-1] * tmp_pyr[idx].info.strides[k-1];
+                }
+
+                dim_t lvl_el = tmp_pyr[idx].info.strides[3] * tmp_pyr[idx].info.dims[3];
+                tmp_pyr[idx].data = bufferAlloc(lvl_el * sizeof(T));
+
+                resize<T, AF_INTERP_BILINEAR>(tmp_pyr[idx], tmp_pyr[src_idx]);
+            }
+            else {
+                for (int k = 0; k < 4; k++) {
+                    tmp_pyr[idx].info.dims[k] = tmp_pyr[src_idx].info.dims[k];
+                    tmp_pyr[idx].info.strides[k] = tmp_pyr[src_idx].info.strides[k];
+                }
+                dim_t lvl_el = tmp_pyr[idx].info.strides[3] * tmp_pyr[idx].info.dims[3];
+                tmp_pyr[idx].data = bufferAlloc(lvl_el * sizeof(T));
+
+                Param filter = gaussFilter<convAccT>(sig_layers[l]);
+
+                convHelper<T, convAccT>(tmp_pyr[idx], tmp_pyr[src_idx], filter);
+
+                bufferFree(filter.data);
+            }
+
+            const unsigned imel = tmp_pyr[idx].info.dims[3] * tmp_pyr[idx].info.strides[3];
+            const unsigned offset = imel * l;
+
+            getQueue().enqueueCopyBuffer(*tmp_pyr[idx].data, *gauss_pyr[o].data, 0, offset*sizeof(T), imel * sizeof(T));
+        }
+    }
+
+    for (unsigned o = 0; o < n_octaves; o++) {
+        for (unsigned l = 0; l < n_layers+3; l++) {
+            unsigned idx = o*(n_layers+3) + l;
+            bufferFree(tmp_pyr[idx].data);
+        }
+    }
+
+    return gauss_pyr;
+}
+
+template<typename T>
+std::vector<Param> buildDoGPyr(
+    std::vector<Param> gauss_pyr,
+    const unsigned n_octaves,
+    const unsigned n_layers,
+    Kernel* suKernel)
+{
+    // DoG Pyramid
+    std::vector<Param> dog_pyr(n_octaves);
+    for (unsigned o = 0; o < n_octaves; o++) {
+        for (int k = 0; k < 4; k++) {
+            dog_pyr[o].info.dims[k] = (k == 2) ? gauss_pyr[o].info.dims[k]-1 : gauss_pyr[o].info.dims[k];
+            dog_pyr[o].info.strides[k] = (k == 0) ? 1 : dog_pyr[o].info.dims[k-1] * dog_pyr[o].info.strides[k-1];
+        }
+        dog_pyr[o].info.offset = 0;
+
+        dog_pyr[o].data = bufferAlloc(dog_pyr[o].info.dims[3] * dog_pyr[o].info.strides[3] * sizeof(T));
+
+        const unsigned nel = dog_pyr[o].info.dims[1] * dog_pyr[o].info.strides[1];
+        const unsigned dog_layers = n_layers+2;
+
+        const int blk_x = divup(nel, SIFT_THREADS);
+        const NDRange local(SIFT_THREADS, 1);
+        const NDRange global(blk_x * SIFT_THREADS, 1);
+
+        auto suOp = make_kernel<Buffer, Buffer, unsigned, unsigned> (*suKernel);
+
+        suOp(EnqueueArgs(getQueue(), global, local),
+             *dog_pyr[o].data, *gauss_pyr[o].data, nel, dog_layers);
+        CL_DEBUG_FINISH(getQueue());
+    }
+
+    return dog_pyr;
+}
+
+template <typename T>
+void update_permutation(compute::buffer_iterator<T>& keys, compute::vector<int>& permutation, compute::command_queue& queue)
+{
+    // temporary storage for keys
+    compute::vector<T> temp(permutation.size(), 0, queue);
+
+    // permute the keys with the current reordering
+    compute::gather(permutation.begin(), permutation.end(), keys, temp.begin(), queue);
+
+    // stable_sort the permuted keys and update the permutation
+    compute::sort_by_key(temp.begin(), temp.end(), permutation.begin(), queue);
+}
+
+template <typename T>
+void apply_permutation(compute::buffer_iterator<T>& keys, compute::vector<int>& permutation, compute::command_queue& queue)
+{
+    // copy keys to temporary vector
+    compute::vector<T> temp(keys, keys+permutation.size(), queue);
+
+    // permute the keys
+    compute::gather(permutation.begin(), permutation.end(), temp.begin(), keys, queue);
+}
+
+template<typename T, typename convAccT>
+void sift(unsigned* out_feat,
+          unsigned* out_dlen,
+          Param& x_out,
+          Param& y_out,
+          Param& score_out,
+          Param& ori_out,
+          Param& size_out,
+          Param& desc_out,
+          Param img,
+          const unsigned n_layers,
+          const float contrast_thr,
+          const float edge_thr,
+          const float init_sigma,
+          const bool double_input,
+          const float img_scale,
+          const float feature_ratio)
+{
+    try {
+        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+        static std::map<int, Program*> siftProgs;
+        static std::map<int, Kernel*>  suKernel;
+        static std::map<int, Kernel*>  deKernel;
+        static std::map<int, Kernel*>  ieKernel;
+        static std::map<int, Kernel*>  coKernel;
+        static std::map<int, Kernel*>  rdKernel;
+        static std::map<int, Kernel*>  cdKernel;
+
+        int device = getActiveDeviceId();
+
+        std::call_once( compileFlags[device], [device] () {
+
+                std::ostringstream options;
+                options << " -D T=" << dtype_traits<T>::getName();
+
+                if (std::is_same<T, double>::value ||
+                    std::is_same<T, cdouble>::value) {
+                    options << " -D USE_DOUBLE";
+                }
+
+                cl::Program prog;
+                buildProgram(prog, sift_cl, sift_cl_len, options.str());
+                siftProgs[device] = new Program(prog);
+
+                suKernel[device] = new Kernel(*siftProgs[device], "sub");
+                deKernel[device] = new Kernel(*siftProgs[device], "detectExtrema");
+                ieKernel[device] = new Kernel(*siftProgs[device], "interpolateExtrema");
+                coKernel[device] = new Kernel(*siftProgs[device], "calcOrientation");
+                rdKernel[device] = new Kernel(*siftProgs[device], "removeDuplicates");
+                cdKernel[device] = new Kernel(*siftProgs[device], "computeDescriptor");
+            });
+
+        const unsigned min_dim = (double_input) ? min(img.info.dims[0]*2, img.info.dims[1]*2)
+                                                : min(img.info.dims[0], img.info.dims[1]);
+        const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
+
+        Param init_img = createInitialImage<T, convAccT>(img, init_sigma, double_input);
+
+        std::vector<Param> gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
+
+        std::vector<Param> dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers, suKernel[device]);
+
+        std::vector<cl::Buffer*> d_x_pyr(n_octaves, NULL);
+        std::vector<cl::Buffer*> d_y_pyr(n_octaves, NULL);
+        std::vector<cl::Buffer*> d_response_pyr(n_octaves, NULL);
+        std::vector<cl::Buffer*> d_size_pyr(n_octaves, NULL);
+        std::vector<cl::Buffer*> d_ori_pyr(n_octaves, NULL);
+        std::vector<cl::Buffer*> d_desc_pyr(n_octaves, NULL);
+        std::vector<unsigned> feat_pyr(n_octaves, 0);
+        unsigned total_feat = 0;
+
+        const unsigned d = DescrWidth;
+        const unsigned n = DescrHistBins;
+        const unsigned desc_len = d*d*n;
+
+        cl::Buffer* d_count = bufferAlloc(sizeof(unsigned));
+
+        for (unsigned o = 0; o < n_octaves; o++) {
+            if (dog_pyr[o].info.dims[0]-2*ImgBorder < 1 ||
+                dog_pyr[o].info.dims[1]-2*ImgBorder < 1)
+                continue;
+
+            const unsigned imel = dog_pyr[o].info.dims[0] * dog_pyr[o].info.dims[1];
+            const unsigned max_feat = ceil(imel * feature_ratio);
+
+            cl::Buffer* d_extrema_x     = bufferAlloc(max_feat * sizeof(float));
+            cl::Buffer* d_extrema_y     = bufferAlloc(max_feat * sizeof(float));
+            cl::Buffer* d_extrema_layer = bufferAlloc(max_feat * sizeof(unsigned));
+
+            unsigned extrema_feat = 0;
+            getQueue().enqueueWriteBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &extrema_feat);
+
+            int dim0 = dog_pyr[o].info.dims[0];
+            int dim1 = dog_pyr[o].info.dims[1];
+
+            const int blk_x = divup(dim0-2*ImgBorder, 32);
+            const int blk_y = divup(dim1-2*ImgBorder, 8);
+            const NDRange local(32, 8);
+            const NDRange global(blk_x * 32, blk_y * 8);
+
+            float extrema_thr = 0.5f * contrast_thr / n_layers;
+
+            auto deOp = make_kernel<Buffer, Buffer, Buffer, Buffer,
+                                    Buffer, KParam, unsigned, float> (*deKernel[device]);
+
+            deOp(EnqueueArgs(getQueue(), global, local),
+                 *d_extrema_x, *d_extrema_y, *d_extrema_layer, *d_count,
+                 *dog_pyr[o].data, dog_pyr[o].info, max_feat, extrema_thr);
+            CL_DEBUG_FINISH(getQueue());
+
+            getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &extrema_feat);
+            extrema_feat = min(extrema_feat, max_feat);
+
+            if (extrema_feat == 0) {
+                bufferFree(d_extrema_x);
+                bufferFree(d_extrema_y);
+                bufferFree(d_extrema_layer);
+
+                continue;
+            }
+
+            unsigned interp_feat = 0;
+            getQueue().enqueueWriteBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &interp_feat);
+
+            cl::Buffer* d_interp_x = bufferAlloc(extrema_feat * sizeof(float));
+            cl::Buffer* d_interp_y = bufferAlloc(extrema_feat * sizeof(float));
+            cl::Buffer* d_interp_layer = bufferAlloc(extrema_feat * sizeof(unsigned));
+            cl::Buffer* d_interp_response = bufferAlloc(extrema_feat * sizeof(float));
+            cl::Buffer* d_interp_size = bufferAlloc(extrema_feat * sizeof(float));
+
+            const int blk_x_interp = divup(extrema_feat, SIFT_THREADS);
+            const NDRange local_interp(SIFT_THREADS, 1);
+            const NDRange global_interp(blk_x_interp * SIFT_THREADS, 1);
+
+            auto ieOp = make_kernel<Buffer, Buffer, Buffer,
+                                    Buffer, Buffer, Buffer,
+                                    Buffer, Buffer, Buffer, unsigned,
+                                    Buffer, KParam, unsigned, unsigned, unsigned,
+                                    float, float, float, float> (*ieKernel[device]);
+
+            ieOp(EnqueueArgs(getQueue(), global_interp, local_interp),
+                 *d_interp_x, *d_interp_y, *d_interp_layer,
+                 *d_interp_response, *d_interp_size, *d_count,
+                 *d_extrema_x, *d_extrema_y, *d_extrema_layer, extrema_feat,
+                 *dog_pyr[o].data, dog_pyr[o].info, extrema_feat, o, n_layers,
+                 contrast_thr, edge_thr, init_sigma, img_scale);
+            CL_DEBUG_FINISH(getQueue());
+
+            getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &interp_feat);
+            interp_feat = min(interp_feat, extrema_feat);
+
+            if (interp_feat == 0) {
+                bufferFree(d_interp_x);
+                bufferFree(d_interp_y);
+                bufferFree(d_interp_layer);
+                bufferFree(d_interp_response);
+                bufferFree(d_interp_size);
+
+                continue;
+            }
+
+            compute::command_queue queue(getQueue()());
+            compute::context context(getContext()());
+
+            compute::buffer buf_interp_x((*d_interp_x)(), true);
+            compute::buffer buf_interp_y((*d_interp_y)(), true);
+            compute::buffer buf_interp_layer((*d_interp_layer)(), true);
+            compute::buffer buf_interp_response((*d_interp_response)(), true);
+            compute::buffer buf_interp_size((*d_interp_size)(), true);
+
+            compute::buffer_iterator<float> interp_x_begin = compute::make_buffer_iterator<float>(buf_interp_x, 0);
+            compute::buffer_iterator<float> interp_y_begin = compute::make_buffer_iterator<float>(buf_interp_y, 0);
+            compute::buffer_iterator<unsigned> interp_layer_begin = compute::make_buffer_iterator<unsigned>(buf_interp_layer, 0);
+            compute::buffer_iterator<float> interp_response_begin = compute::make_buffer_iterator<float>(buf_interp_response, 0);
+            compute::buffer_iterator<float> interp_size_begin = compute::make_buffer_iterator<float>(buf_interp_size, 0);
+
+            compute::vector<int> permutation(interp_feat, context);
+            compute::iota(permutation.begin(), permutation.end(), 0, queue);
+
+            update_permutation<float>(interp_x_begin, permutation, queue);
+            update_permutation<float>(interp_y_begin, permutation, queue);
+            update_permutation<unsigned>(interp_layer_begin, permutation, queue);
+            update_permutation<float>(interp_response_begin, permutation, queue);
+            update_permutation<float>(interp_size_begin, permutation, queue);
+
+            apply_permutation<float>(interp_x_begin, permutation, queue);
+            apply_permutation<float>(interp_y_begin, permutation, queue);
+            apply_permutation<unsigned>(interp_layer_begin, permutation, queue);
+            apply_permutation<float>(interp_response_begin, permutation, queue);
+            apply_permutation<float>(interp_size_begin, permutation, queue);
+
+            unsigned nodup_feat = 0;
+            getQueue().enqueueWriteBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &nodup_feat);
+
+            cl::Buffer* d_nodup_x = bufferAlloc(interp_feat * sizeof(float));
+            cl::Buffer* d_nodup_y = bufferAlloc(interp_feat * sizeof(float));
+            cl::Buffer* d_nodup_layer = bufferAlloc(interp_feat * sizeof(unsigned));
+            cl::Buffer* d_nodup_response = bufferAlloc(interp_feat * sizeof(float));
+            cl::Buffer* d_nodup_size = bufferAlloc(interp_feat * sizeof(float));
+
+            const int blk_x_nodup = divup(extrema_feat, SIFT_THREADS);
+            const NDRange local_nodup(SIFT_THREADS, 1);
+            const NDRange global_nodup(blk_x_nodup * SIFT_THREADS, 1);
+
+            auto rdOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, Buffer,
+                                    Buffer, Buffer, Buffer, Buffer, Buffer,
+                                    unsigned> (*rdKernel[device]);
+
+            rdOp(EnqueueArgs(getQueue(), global_nodup, local_nodup),
+                 *d_nodup_x, *d_nodup_y, *d_nodup_layer,
+                 *d_nodup_response, *d_nodup_size, *d_count,
+                 *d_interp_x, *d_interp_y, *d_interp_layer,
+                 *d_interp_response, *d_interp_size, interp_feat);
+            CL_DEBUG_FINISH(getQueue());
+
+
+            getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &nodup_feat);
+            nodup_feat = min(nodup_feat, interp_feat);
+
+            bufferFree(d_interp_x);
+            bufferFree(d_interp_y);
+            bufferFree(d_interp_layer);
+            bufferFree(d_interp_response);
+            bufferFree(d_interp_size);
+
+            unsigned oriented_feat = 0;
+            getQueue().enqueueWriteBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &oriented_feat);
+            const unsigned max_oriented_feat = nodup_feat * 3;
+
+            cl::Buffer* d_oriented_x        = bufferAlloc(max_oriented_feat * sizeof(float));
+            cl::Buffer* d_oriented_y        = bufferAlloc(max_oriented_feat * sizeof(float));
+            cl::Buffer* d_oriented_layer    = bufferAlloc(max_oriented_feat * sizeof(unsigned));
+            cl::Buffer* d_oriented_response = bufferAlloc(max_oriented_feat * sizeof(float));
+            cl::Buffer* d_oriented_size     = bufferAlloc(max_oriented_feat * sizeof(float));
+            cl::Buffer* d_oriented_ori      = bufferAlloc(max_oriented_feat * sizeof(float));
+
+            const int blk_x_ori = divup(nodup_feat, 8);
+            const NDRange local_ori(8, 32);
+            const NDRange global_ori(blk_x_ori * 8, 32);
+
+            auto coOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer,
+                                    Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
+                                    Buffer, KParam, unsigned, unsigned, int> (*coKernel[device]);
+
+            coOp(EnqueueArgs(getQueue(), global_ori, local_ori),
+                 *d_oriented_x, *d_oriented_y, *d_oriented_layer,
+                 *d_oriented_response, *d_oriented_size, *d_oriented_ori, *d_count,
+                 *d_nodup_x, *d_nodup_y, *d_nodup_layer,
+                 *d_nodup_response, *d_nodup_size, nodup_feat,
+                 *gauss_pyr[o].data, gauss_pyr[o].info, max_oriented_feat, o, (int)double_input);
+            CL_DEBUG_FINISH(getQueue());
+
+            getQueue().enqueueReadBuffer(*d_count, CL_TRUE, 0, sizeof(unsigned), &oriented_feat);
+            oriented_feat = min(oriented_feat, max_oriented_feat);
+
+            if (oriented_feat == 0) {
+                bufferFree(d_oriented_x);
+                bufferFree(d_oriented_y);
+                bufferFree(d_oriented_layer);
+                bufferFree(d_oriented_response);
+                bufferFree(d_oriented_size);
+
+                continue;
+            }
+
+            cl::Buffer* d_desc = bufferAlloc(oriented_feat * desc_len * sizeof(float));
+
+            float scale = 1.f/(1 << o);
+            if (double_input) scale *= 2.f;
+
+            const int blk_x_desc = divup(oriented_feat, 1);
+            const NDRange local_desc(1, SIFT_THREADS);
+            const NDRange global_desc(blk_x_desc, SIFT_THREADS);
+
+            auto cdOp = make_kernel<Buffer, unsigned,
+                                    Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
+                                    Buffer, KParam, int, int, float, float, int> (*cdKernel[device]);
+
+            cdOp(EnqueueArgs(getQueue(), global_desc, local_desc),
+                 *d_desc, desc_len,
+                 *d_oriented_x, *d_oriented_y, *d_oriented_layer,
+                 *d_oriented_response, *d_oriented_size, *d_oriented_ori, oriented_feat,
+                 *gauss_pyr[o].data, gauss_pyr[o].info, d, n, scale, init_sigma, n_layers);
+            CL_DEBUG_FINISH(getQueue());
+
+            total_feat += oriented_feat;
+            feat_pyr[o] = oriented_feat;
+
+            if (oriented_feat > 0) {
+                d_x_pyr[o] = d_oriented_x;
+                d_y_pyr[o] = d_oriented_y;
+                d_response_pyr[o] = d_oriented_response;
+                d_ori_pyr[o] = d_oriented_ori;
+                d_size_pyr[o] = d_oriented_size;
+                d_desc_pyr[o] = d_desc;
+            }
+        }
+
+        // If no features are found, set found features to 0 and return
+        if (total_feat == 0) {
+            *out_feat = 0;
+            return;
+        }
+
+        // Allocate output memory
+        x_out.info.dims[0] = total_feat;
+        x_out.info.strides[0] = 1;
+        y_out.info.dims[0] = total_feat;
+        y_out.info.strides[0] = 1;
+        score_out.info.dims[0] = total_feat;
+        score_out.info.strides[0] = 1;
+        ori_out.info.dims[0] = total_feat;
+        ori_out.info.strides[0] = 1;
+        size_out.info.dims[0] = total_feat;
+        size_out.info.strides[0] = 1;
+
+        desc_out.info.dims[0] = desc_len;
+        desc_out.info.strides[0] = 1;
+        desc_out.info.dims[1] = total_feat;
+        desc_out.info.strides[1] = desc_out.info.dims[0];
+
+        for (int k = 1; k < 4; k++) {
+            x_out.info.dims[k] = 1;
+            x_out.info.strides[k] = x_out.info.dims[k - 1] * x_out.info.strides[k - 1];
+            y_out.info.dims[k] = 1;
+            y_out.info.strides[k] = y_out.info.dims[k - 1] * y_out.info.strides[k - 1];
+            score_out.info.dims[k] = 1;
+            score_out.info.strides[k] = score_out.info.dims[k - 1] * score_out.info.strides[k - 1];
+            ori_out.info.dims[k] = 1;
+            ori_out.info.strides[k] = ori_out.info.dims[k - 1] * ori_out.info.strides[k - 1];
+            size_out.info.dims[k] = 1;
+            size_out.info.strides[k] = size_out.info.dims[k - 1] * size_out.info.strides[k - 1];
+            if (k > 1) {
+                desc_out.info.dims[k] = 1;
+                desc_out.info.strides[k] = desc_out.info.dims[k - 1] * desc_out.info.strides[k - 1];
+            }
+        }
+
+        if (total_feat > 0) {
+            size_t out_sz  = total_feat * sizeof(float);
+            x_out.data     = bufferAlloc(out_sz);
+            y_out.data     = bufferAlloc(out_sz);
+            score_out.data = bufferAlloc(out_sz);
+            ori_out.data   = bufferAlloc(out_sz);
+            size_out.data  = bufferAlloc(out_sz);
+
+            size_t desc_sz = total_feat * desc_len * sizeof(unsigned);
+            desc_out.data  = bufferAlloc(desc_sz);
+        }
+
+        unsigned offset = 0;
+        for (unsigned i = 0; i < n_octaves; i++) {
+            if (feat_pyr[i] == 0)
+                continue;
+
+            getQueue().enqueueCopyBuffer(*d_x_pyr[i], *x_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
+            getQueue().enqueueCopyBuffer(*d_y_pyr[i], *y_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
+            getQueue().enqueueCopyBuffer(*d_response_pyr[i], *score_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
+            getQueue().enqueueCopyBuffer(*d_ori_pyr[i], *ori_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
+            getQueue().enqueueCopyBuffer(*d_size_pyr[i], *size_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float));
+
+            getQueue().enqueueCopyBuffer(*d_desc_pyr[i], *desc_out.data, 0, offset*desc_len*sizeof(unsigned), feat_pyr[i] * desc_len * sizeof(unsigned));
+
+            bufferFree(d_x_pyr[i]);
+            bufferFree(d_y_pyr[i]);
+            bufferFree(d_response_pyr[i]);
+            bufferFree(d_ori_pyr[i]);
+            bufferFree(d_size_pyr[i]);
+            bufferFree(d_desc_pyr[i]);
+
+            offset += feat_pyr[i];
+        }
+
+        // Sets number of output features and descriptor length
+        *out_feat = total_feat;
+        *out_dlen = desc_len;
+    } catch (cl::Error err) {
+        CL_TO_AF_ERROR(err);
+        throw;
+    }
+}
+
+} //namespace kernel
+
+} //namespace opencl
diff --git a/src/backend/opencl/sift.cpp b/src/backend/opencl/sift.cpp
new file mode 100644
index 0000000..1973e3e
--- /dev/null
+++ b/src/backend/opencl/sift.cpp
@@ -0,0 +1,75 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <af/features.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_opencl.hpp>
+#include <handle.hpp>
+#include <kernel/sift.hpp>
+
+using af::dim4;
+using af::features;
+
+namespace opencl
+{
+
+template<typename T, typename convAccT>
+unsigned sift(Array<float>& x_out, Array<float>& y_out, Array<float>& score_out,
+              Array<float>& ori_out, Array<float>& size_out, Array<float>& desc_out,
+              const Array<T>& in, const unsigned n_layers,
+              const float contrast_thr, const float edge_thr,
+              const float init_sigma, const bool double_input,
+              const float img_scale, const float feature_ratio)
+{
+    unsigned nfeat_out;
+    unsigned desc_len;
+
+    Param x;
+    Param y;
+    Param score;
+    Param ori;
+    Param size;
+    Param desc;
+
+    kernel::sift<T,convAccT>(&nfeat_out, &desc_len, x, y, score, ori, size, desc,
+                             in, n_layers, contrast_thr, edge_thr, init_sigma,
+                             double_input, img_scale, feature_ratio);
+
+    if (nfeat_out > 0) {
+        const dim4 out_dims(nfeat_out);
+        const dim4 desc_dims(desc_len, nfeat_out);
+
+        x_out     = createParamArray<float>(x);
+        y_out     = createParamArray<float>(y);
+        score_out = createParamArray<float>(score);
+        ori_out   = createParamArray<float>(ori);
+        size_out  = createParamArray<float>(size);
+        desc_out  = createParamArray<float>(desc);
+    }
+
+    return nfeat_out;
+}
+
+
+#define INSTANTIATE(T, convAccT)                                                        \
+    template unsigned sift<T, convAccT>(Array<float>& x_out, Array<float>& y_out, \
+                                        Array<float>& score_out, Array<float>& ori_out, \
+                                        Array<float>& size_out, Array<float>& desc_out,  \
+                                        const Array<T>& in, const unsigned n_layers,                \
+                                        const float contrast_thr, const float edge_thr,             \
+                                        const float init_sigma, const bool double_input,            \
+                                        const float img_scale, const float feature_ratio);
+
+INSTANTIATE(float , float )
+INSTANTIATE(double, double)
+
+}
diff --git a/src/backend/opencl/sift.hpp b/src/backend/opencl/sift.hpp
new file mode 100644
index 0000000..96b422f
--- /dev/null
+++ b/src/backend/opencl/sift.hpp
@@ -0,0 +1,26 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/features.h>
+#include <Array.hpp>
+
+using af::features;
+
+namespace opencl
+{
+
+template<typename T, typename convAccT>
+unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
+              Array<float>& ori, Array<float>& size, Array<float>& desc,
+              const Array<T>& in, const unsigned n_layers,
+              const float contrast_thr, const float edge_thr,
+              const float init_sigma, const bool double_input,
+              const float img_scale, const float feature_ratio);
+
+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git



More information about the debian-science-commits mailing list