[arrayfire] 37/408: FEAT Added nearest neighbour with SSD, SAD and SHD

Mon Sep 21 19:11:12 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit e2ee96e396e477660fbe3c9386c8f1c4760aec9f
Author: Shehzan Mohammed <shehzan at arrayfire.com>
Date:   Wed Jun 24 17:20:58 2015 -0400

    FEAT Added nearest neighbour with SSD, SAD and SHD
---
 include/af/vision.h                                |  10 ++
 src/api/c/hamming.cpp                              |  51 +-----
 src/api/c/nearest_neighbour.cpp                    |  90 +++++++++++
 src/api/cpp/hamming.cpp                            |   2 +-
 src/api/cpp/{hamming.cpp => nearest_neighbour.cpp} |   9 +-
 src/backend/cpu/hamming.cpp                        | 103 ------------
 src/backend/cpu/nearest_neighbour.cpp              | 175 +++++++++++++++++++++
 .../cpu/{hamming.hpp => nearest_neighbour.hpp}     |   9 +-
 src/backend/cuda/hamming.cu                        |  62 --------
 .../kernel/{hamming.hpp => nearest_neighbour.hpp}  | 155 ++++++++++++------
 src/backend/cuda/math.hpp                          |   4 +
 src/backend/cuda/nearest_neighbour.cu              |  79 ++++++++++
 .../cuda/{hamming.hpp => nearest_neighbour.hpp}    |   9 +-
 src/backend/opencl/hamming.cpp                     | 143 -----------------
 .../kernel/{hamming.cl => nearest_neighbour.cl}    |  66 +++++---
 .../kernel/{hamming.hpp => nearest_neighbour.hpp}  |  52 +++---
 src/backend/opencl/nearest_neighbour.cpp           | 165 +++++++++++++++++++
 .../opencl/{hamming.hpp => nearest_neighbour.hpp}  |   9 +-
 18 files changed, 729 insertions(+), 464 deletions(-)

diff --git a/include/af/vision.h b/include/af/vision.h
index ed5e05f..d40a150 100644
--- a/include/af/vision.h
+++ b/include/af/vision.h
@@ -90,6 +90,11 @@ AFAPI void hammingMatcher(array& idx, array& dist,
                           const array& query, const array& train,
                           const dim_t dist_dim=0, const unsigned n_dist=1);
 
+AFAPI void nearestNeighbour(array& idx, array& dist,
+                            const array& query, const array& train,
+                            const dim_t dist_dim=0, const unsigned n_dist=1,
+                            const af_match_type dist_type = AF_SSD);
+
 /**
    C++ Interface for image template matching
 
@@ -190,6 +195,11 @@ extern "C" {
                                     const af_array query, const af_array train,
                                     const dim_t dist_dim, const unsigned n_dist);
 
+    AFAPI af_err af_nearest_neighbour(af_array* idx, af_array* dist,
+                                      const af_array query, const af_array train,
+                                      const dim_t dist_dim, const unsigned n_dist,
+                                      const af_match_type dist_type);
+
     /**
        C Interface for image template matching
 
diff --git a/src/api/c/hamming.cpp b/src/api/c/hamming.cpp
index 8a1dd91..057d6f9 100644
--- a/src/api/c/hamming.cpp
+++ b/src/api/c/hamming.cpp
@@ -7,59 +7,10 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <af/dim4.hpp>
 #include <af/defines.h>
 #include <af/vision.h>
-#include <handle.hpp>
-#include <err_common.hpp>
-#include <backend.hpp>
-#include <hamming.hpp>
-
-using af::dim4;
-using namespace detail;
-
-template<typename T>
-static void hamming_matcher(af_array* idx, af_array* dist, const af_array query, const af_array train, const dim_t dist_dim, const uint n_dist)
-{
-    Array<uint> oIdxArray = createEmptyArray<uint>(af::dim4());
-    Array<uint> oDistArray = createEmptyArray<uint>(af::dim4());
-
-    hamming_matcher<T>(oIdxArray, oDistArray, getArray<T>(query), getArray<T>(train), dist_dim, n_dist);
-
-    *idx  = getHandle<uint>(oIdxArray);
-    *dist = getHandle<uint>(oDistArray);
-}
 
 af_err af_hamming_matcher(af_array* idx, af_array* dist, const af_array query, const af_array train, const dim_t dist_dim, const uint n_dist)
 {
-    try {
-        ArrayInfo qInfo = getInfo(query);
-        ArrayInfo tInfo = getInfo(train);
-        af_dtype qType  = qInfo.getType();
-        af_dtype tType  = tInfo.getType();
-        af::dim4 qDims  = qInfo.dims();
-        af::dim4 tDims  = tInfo.dims();
-
-        uint train_samples = (dist_dim == 0) ? 1 : 0;
-
-        DIM_ASSERT(3, qDims[dist_dim] == tDims[dist_dim]);
-        DIM_ASSERT(3, qDims[2] == 1 && qDims[3] == 1);
-        DIM_ASSERT(3, qType == tType);
-        DIM_ASSERT(4, tDims[2] == 1 && tDims[3] == 1);
-        DIM_ASSERT(5, (dist_dim == 0 || dist_dim == 1));
-        DIM_ASSERT(6, n_dist > 0 && n_dist <= (uint)tDims[train_samples]);
-
-        af_array oIdx;
-        af_array oDist;
-        switch(qType) {
-            case u8:  hamming_matcher<uchar>(&oIdx, &oDist, query, train, dist_dim, n_dist); break;
-            case u32: hamming_matcher<uint >(&oIdx, &oDist, query, train, dist_dim, n_dist); break;
-            default : TYPE_ERROR(1, qType);
-        }
-        std::swap(*idx, oIdx);
-        std::swap(*dist, oDist);
-    }
-    CATCHALL;
-
-    return AF_SUCCESS;
+    return af_nearest_neighbour(idx, dist, query, train, dist_dim, n_dist, AF_SHD);
 }
diff --git a/src/api/c/nearest_neighbour.cpp b/src/api/c/nearest_neighbour.cpp
new file mode 100644
index 0000000..d47e0ae
--- /dev/null
+++ b/src/api/c/nearest_neighbour.cpp
@@ -0,0 +1,90 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <af/vision.h>
+#include <handle.hpp>
+#include <err_common.hpp>
+#include <backend.hpp>
+#include <nearest_neighbour.hpp>
+
+using af::dim4;
+using namespace detail;
+
+template<typename Ti, typename To>
+static void nearest_neighbour(af_array* idx, af_array* dist,
+        const af_array query, const af_array train,
+        const dim_t dist_dim, const uint n_dist,
+        const af_match_type dist_type)
+{
+    Array<uint> oIdxArray = createEmptyArray<uint>(af::dim4());
+    Array<To>  oDistArray = createEmptyArray<To>(af::dim4());
+
+    nearest_neighbour<Ti, To>(oIdxArray, oDistArray, getArray<Ti>(query), getArray<Ti>(train),
+                              dist_dim, n_dist, dist_type);
+
+    *idx  = getHandle<uint>(oIdxArray);
+    *dist = getHandle<To>(oDistArray);
+}
+
+af_err af_nearest_neighbour(af_array* idx, af_array* dist,
+        const af_array query, const af_array train,
+        const dim_t dist_dim, const uint n_dist,
+        const af_match_type dist_type)
+{
+    try {
+        ArrayInfo qInfo = getInfo(query);
+        ArrayInfo tInfo = getInfo(train);
+        af_dtype qType  = qInfo.getType();
+        af_dtype tType  = tInfo.getType();
+        af::dim4 qDims  = qInfo.dims();
+        af::dim4 tDims  = tInfo.dims();
+
+        uint train_samples = (dist_dim == 0) ? 1 : 0;
+
+        DIM_ASSERT(2, qDims[dist_dim] == tDims[dist_dim]);
+        DIM_ASSERT(2, qDims[2] == 1 && qDims[3] == 1);
+        DIM_ASSERT(3, tDims[2] == 1 && tDims[3] == 1);
+        DIM_ASSERT(4, (dist_dim == 0 || dist_dim == 1));
+        DIM_ASSERT(5, n_dist > 0 && n_dist <= (uint)tDims[train_samples]);
+        ARG_ASSERT(6, dist_type == AF_SAD || dist_type == AF_SSD || dist_type == AF_SHD);
+        TYPE_ASSERT(qType == tType);
+
+        // For Hamming, only u8, u32 and u64 allowed.
+        af_array oIdx;
+        af_array oDist;
+
+        if(dist_type == AF_SHD) {
+            TYPE_ASSERT(qType == u8 || qType == u32 || qType == u64);
+            switch(qType) {
+                case u8:  nearest_neighbour<uchar, uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                case u32: nearest_neighbour<uint , uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                case u64: nearest_neighbour<uintl, uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                default : TYPE_ERROR(1, qType);
+            }
+        } else {
+            switch(qType) {
+                case f32: nearest_neighbour<float , float >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                case f64: nearest_neighbour<double, double>(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                case s32: nearest_neighbour<int   , int   >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                case u32: nearest_neighbour<uint  , uint  >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                case s64: nearest_neighbour<intl  , intl  >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                case u64: nearest_neighbour<uintl , uintl >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                case u8:  nearest_neighbour<uchar , uint  >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+                default : TYPE_ERROR(1, qType);
+            }
+        }
+        std::swap(*idx, oIdx);
+        std::swap(*dist, oDist);
+    }
+    CATCHALL;
+
+    return AF_SUCCESS;
+}
diff --git a/src/api/cpp/hamming.cpp b/src/api/cpp/hamming.cpp
index f97c34c..4ef8d1e 100644
--- a/src/api/cpp/hamming.cpp
+++ b/src/api/cpp/hamming.cpp
@@ -20,7 +20,7 @@ void hammingMatcher(array& idx, array& dist,
 {
     af_array temp_idx  = 0;
     af_array temp_dist = 0;
-    AF_THROW(af_hamming_matcher(&temp_idx, &temp_dist, query.get(), train.get(), dist_dim, n_dist));
+    AF_THROW(af_nearest_neighbour(&temp_idx, &temp_dist, query.get(), train.get(), dist_dim, n_dist, AF_SHD));
     idx  = array(temp_idx);
     dist = array(temp_dist);
 }
diff --git a/src/api/cpp/hamming.cpp b/src/api/cpp/nearest_neighbour.cpp
similarity index 60%
copy from src/api/cpp/hamming.cpp
copy to src/api/cpp/nearest_neighbour.cpp
index f97c34c..5dae7ab 100644
--- a/src/api/cpp/hamming.cpp
+++ b/src/api/cpp/nearest_neighbour.cpp
@@ -14,13 +14,14 @@
 namespace af
 {
 
-void hammingMatcher(array& idx, array& dist,
-                     const array& query, const array& train,
-                     const dim_t dist_dim, const unsigned n_dist)
+void nearestNeighbour(array& idx, array& dist,
+                      const array& query, const array& train,
+                      const dim_t dist_dim, const unsigned n_dist,
+                      const af_match_type dist_type)
 {
     af_array temp_idx  = 0;
     af_array temp_dist = 0;
-    AF_THROW(af_hamming_matcher(&temp_idx, &temp_dist, query.get(), train.get(), dist_dim, n_dist));
+    AF_THROW(af_nearest_neighbour(&temp_idx, &temp_dist, query.get(), train.get(), dist_dim, n_dist, dist_type));
     idx  = array(temp_idx);
     dist = array(temp_dist);
 }
diff --git a/src/backend/cpu/hamming.cpp b/src/backend/cpu/hamming.cpp
deleted file mode 100644
index 544883b..0000000
--- a/src/backend/cpu/hamming.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/*******************************************************
- * Copyright (c) 2014, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#include <af/dim4.hpp>
-#include <af/defines.h>
-#include <ArrayInfo.hpp>
-#include <Array.hpp>
-#include <err_cpu.hpp>
-#include <handle.hpp>
-
-using af::dim4;
-
-namespace cpu
-{
-
-#if defined(_WIN32) || defined(_MSC_VER)
-
-#include <intrin.h>
-#define __builtin_popcount __popcnt
-
-#endif
-
-template<typename T>
-inline uint hamming_distance(T v1, T v2)
-{
-    return __builtin_popcount(v1 ^ v2);
-}
-
-template<typename T>
-void hamming_matcher(Array<uint>& idx, Array<uint>& dist,
-                     const Array<T>& query, const Array<T>& train,
-                     const uint dist_dim, const uint n_dist)
-{
-    uint sample_dim = (dist_dim == 0) ? 1 : 0;
-    const dim4 qDims = query.dims();
-    const dim4 tDims = train.dims();
-
-    if (n_dist > 1) {
-        CPU_NOT_SUPPORTED();
-    }
-
-    const unsigned distLength = qDims[dist_dim];
-    const unsigned nQuery = qDims[sample_dim];
-    const unsigned nTrain = tDims[sample_dim];
-
-    const dim4 outDims(n_dist, nQuery);
-
-    idx  = createEmptyArray<uint>(outDims);
-    dist = createEmptyArray<uint>(outDims);
-
-    const T* qPtr = query.get();
-    const T* tPtr = train.get();
-    uint* iPtr = idx.get();
-    uint* dPtr = dist.get();
-
-    for (unsigned i = 0; i < nQuery; i++) {
-        unsigned best_dist = limit_max<unsigned>();
-        unsigned best_idx  = 0;
-
-        for (unsigned j = 0; j < nTrain; j++) {
-            unsigned local_dist = 0;
-            for (unsigned k = 0; k < distLength; k++) {
-                size_t qIdx, tIdx;
-                if (sample_dim == 0) {
-                    qIdx = k * qDims[0] + i;
-                    tIdx = k * tDims[0] + j;
-                }
-                else {
-                    qIdx = i * qDims[0] + k;
-                    tIdx = j * tDims[0] + k;
-                }
-
-                local_dist += hamming_distance(qPtr[qIdx], tPtr[tIdx]);
-            }
-
-            if (local_dist < best_dist) {
-                best_dist = local_dist;
-                best_idx  = j;
-            }
-        }
-
-        size_t oIdx;
-        oIdx = i;
-        iPtr[oIdx] = best_idx;
-        dPtr[oIdx] = best_dist;
-    }
-}
-
-#define INSTANTIATE(T)\
-    template void hamming_matcher<T>(Array<uint>& idx, Array<uint>& dist,           \
-                                     const Array<T>& query, const Array<T>& train,  \
-                                     const uint dist_dim, const uint n_dist);
-
-INSTANTIATE(uchar)
-INSTANTIATE(uint)
-
-}
diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp
new file mode 100644
index 0000000..f706769
--- /dev/null
+++ b/src/backend/cpu/nearest_neighbour.cpp
@@ -0,0 +1,175 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_cpu.hpp>
+#include <handle.hpp>
+
+using af::dim4;
+
+namespace cpu
+{
+
+#if defined(_WIN32) || defined(_MSC_VER)
+
+#include <intrin.h>
+#define __builtin_popcount __popcnt
+
+#endif
+
+template<typename T, typename To, af_match_type dist_type>
+struct dist_op
+{
+    To operator()(T v1, T v2)
+    {
+        return v1 - v2;     // Garbage distance
+    }
+};
+
+template<typename T, typename To>
+struct dist_op<T, To, AF_SAD>
+{
+    To operator()(T v1, T v2)
+    {
+        return std::abs((double)v1 - (double)v2);
+    }
+};
+
+template<typename T, typename To>
+struct dist_op<T, To, AF_SSD>
+{
+    To operator()(T v1, T v2)
+    {
+        return (v1 - v2) * (v1 - v2);
+    }
+};
+
+template<typename To>
+struct dist_op<uint, To, AF_SHD>
+{
+    To operator()(uint v1, uint v2)
+    {
+        return __builtin_popcount(v1 ^ v2);
+    }
+};
+
+template<typename To>
+struct dist_op<uintl, To, AF_SHD>
+{
+    To operator()(uintl v1, uintl v2)
+    {
+        return __builtin_popcount(v1 ^ v2);
+    }
+};
+
+template<typename To>
+struct dist_op<uchar, To, AF_SHD>
+{
+    To operator()(uchar v1, uchar v2)
+    {
+        return __builtin_popcount(v1 ^ v2);
+    }
+};
+
+template<typename T, typename To, af_match_type dist_type>
+void nearest_neighbour_(Array<uint>& idx, Array<To>& dist,
+                        const Array<T>& query, const Array<T>& train,
+                        const uint dist_dim, const uint n_dist)
+{
+    uint sample_dim = (dist_dim == 0) ? 1 : 0;
+    const dim4 qDims = query.dims();
+    const dim4 tDims = train.dims();
+
+    if (n_dist > 1) {
+        CPU_NOT_SUPPORTED();
+    }
+
+    const unsigned distLength = qDims[dist_dim];
+    const unsigned nQuery = qDims[sample_dim];
+    const unsigned nTrain = tDims[sample_dim];
+
+    const dim4 outDims(n_dist, nQuery);
+
+    idx  = createEmptyArray<uint>(outDims);
+    dist = createEmptyArray<To  >(outDims);
+
+    const T* qPtr = query.get();
+    const T* tPtr = train.get();
+    uint* iPtr = idx.get();
+    To* dPtr = dist.get();
+
+    dist_op<T, To, dist_type> op;
+
+    for (unsigned i = 0; i < nQuery; i++) {
+        To best_dist = limit_max<To>();
+        unsigned best_idx  = 0;
+
+        for (unsigned j = 0; j < nTrain; j++) {
+            To local_dist = 0;
+            for (unsigned k = 0; k < distLength; k++) {
+                size_t qIdx, tIdx;
+                if (sample_dim == 0) {
+                    qIdx = k * qDims[0] + i;
+                    tIdx = k * tDims[0] + j;
+                }
+                else {
+                    qIdx = i * qDims[0] + k;
+                    tIdx = j * tDims[0] + k;
+                }
+
+                local_dist += op(qPtr[qIdx], tPtr[tIdx]);
+            }
+
+            if (local_dist < best_dist) {
+                best_dist = local_dist;
+                best_idx  = j;
+            }
+        }
+
+        size_t oIdx;
+        oIdx = i;
+        iPtr[oIdx] = best_idx;
+        dPtr[oIdx] = best_dist;
+    }
+}
+
+template<typename T, typename To>
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+                       const Array<T>& query, const Array<T>& train,
+                       const uint dist_dim, const uint n_dist,
+                       const af_match_type dist_type)
+{
+    switch(dist_type) {
+        case AF_SAD: nearest_neighbour_<T, To, AF_SAD>(idx, dist, query, train, dist_dim, n_dist); break;
+        case AF_SSD: nearest_neighbour_<T, To, AF_SSD>(idx, dist, query, train, dist_dim, n_dist); break;
+        case AF_SHD: nearest_neighbour_<T, To, AF_SHD>(idx, dist, query, train, dist_dim, n_dist); break;
+        default: AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED);
+    }
+}
+
+#define INSTANTIATE(T, To)                                                              \
+    template void nearest_neighbour<T, To>(Array<uint>& idx, Array<To>& dist,           \
+                                         const Array<T>& query, const Array<T>& train,  \
+                                         const uint dist_dim, const uint n_dist,        \
+                                         const af_match_type dist_type);
+
+INSTANTIATE(float , float)
+INSTANTIATE(double, double)
+INSTANTIATE(int   , int)
+INSTANTIATE(uint  , uint)
+INSTANTIATE(intl  , intl)
+INSTANTIATE(uintl , uintl)
+INSTANTIATE(uchar , uint)
+
+INSTANTIATE(uintl, uint)    // For Hamming
+
+}
diff --git a/src/backend/cpu/hamming.hpp b/src/backend/cpu/nearest_neighbour.hpp
similarity index 56%
rename from src/backend/cpu/hamming.hpp
rename to src/backend/cpu/nearest_neighbour.hpp
index 3787a68..4cf1dc2 100644
--- a/src/backend/cpu/hamming.hpp
+++ b/src/backend/cpu/nearest_neighbour.hpp
@@ -12,9 +12,10 @@
 namespace cpu
 {
 
-template<typename T>
-void hamming_matcher(Array<uint>& idx, Array<uint>& dist,
-                     const Array<T>& query, const Array<T>& train,
-                     const uint dist_dim, const uint n_dist);
+template<typename T, typename To>
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+                       const Array<T>& query, const Array<T>& train,
+                       const uint dist_dim, const uint n_dist,
+                       const af_match_type dist_type = AF_SSD);
 
 }
diff --git a/src/backend/cuda/hamming.cu b/src/backend/cuda/hamming.cu
deleted file mode 100644
index 2022675..0000000
--- a/src/backend/cuda/hamming.cu
+++ /dev/null
@@ -1,62 +0,0 @@
-/*******************************************************
- * Copyright (c) 2014, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#include <af/dim4.hpp>
-#include <af/defines.h>
-#include <ArrayInfo.hpp>
-#include <Array.hpp>
-#include <err_cuda.hpp>
-#include <handle.hpp>
-#include <kernel/hamming.hpp>
-#include <kernel/transpose.hpp>
-
-using af::dim4;
-
-namespace cuda
-{
-
-template<typename T>
-void hamming_matcher(Array<uint>& idx, Array<uint>& dist,
-                     const Array<T>& query, const Array<T>& train,
-                     const uint dist_dim, const uint n_dist)
-{
-    uint sample_dim = (dist_dim == 0) ? 1 : 0;
-    const dim4 qDims = query.dims();
-    const dim4 tDims = train.dims();
-
-    const dim4 outDims(n_dist, qDims[sample_dim]);
-
-    idx  = createEmptyArray<uint>(outDims);
-    dist = createEmptyArray<uint>(outDims);
-
-    Array<T> queryT = query;
-    Array<T> trainT = train;
-
-    if (dist_dim == 0) {
-        const dim4 queryTDims = dim4(qDims[1], qDims[0], qDims[2], qDims[3]);
-        const dim4 trainTDims = dim4(tDims[1], tDims[0], tDims[2], tDims[3]);
-        queryT = createEmptyArray<T>(queryTDims);
-        trainT = createEmptyArray<T>(trainTDims);
-
-        kernel::transpose<T, false>(queryT, query, query.ndims());
-        kernel::transpose<T, false>(trainT, train, train.ndims());
-    }
-
-    kernel::hamming_matcher<T>(idx, dist, queryT, trainT, 1, n_dist);
-}
-
-#define INSTANTIATE(T)\
-    template void hamming_matcher<T>(Array<uint>& idx, Array<uint>& dist,           \
-                                     const Array<T>& query, const Array<T>& train,  \
-                                     const uint dist_dim, const uint n_dist);
-
-INSTANTIATE(uchar)
-INSTANTIATE(uint)
-
-}
diff --git a/src/backend/cuda/kernel/hamming.hpp b/src/backend/cuda/kernel/nearest_neighbour.hpp
similarity index 77%
rename from src/backend/cuda/kernel/hamming.hpp
rename to src/backend/cuda/kernel/nearest_neighbour.hpp
index 99fcf7f..fa87e9b 100644
--- a/src/backend/cuda/kernel/hamming.hpp
+++ b/src/backend/cuda/kernel/nearest_neighbour.hpp
@@ -13,6 +13,7 @@
 #include <debug_cuda.hpp>
 #include <memory.hpp>
 #include <platform.hpp>
+#include <backend.hpp>
 
 namespace cuda
 {
@@ -22,10 +23,65 @@ namespace kernel
 
 static const unsigned THREADS = 256;
 
-template<typename T, unsigned feat_len, bool use_shmem>
-__global__ void hamming_matcher_unroll(
+template<typename T, typename To, af_match_type dist_type>
+struct dist_op
+{
+    __DH__ To operator()(T v1, T v2)
+    {
+        return v1 - v2;
+    }
+};
+
+template<typename T, typename To>
+struct dist_op<T, To, AF_SAD>
+{
+    __device__ To operator()(T v1, T v2)
+    {
+        return abs((double)v1 - (double)v2);
+    }
+};
+
+template<typename T, typename To>
+struct dist_op<T, To, AF_SSD>
+{
+    __device__ To operator()(T v1, T v2)
+    {
+        return (v1 - v2) * (v1 - v2);
+    }
+};
+
+template<typename To>
+struct dist_op<uint, To, AF_SHD>
+{
+    __device__ To operator()(uint v1, uint v2)
+    {
+        return __popc(v1 ^ v2);
+    }
+};
+
+template<typename To>
+struct dist_op<uintl, To, AF_SHD>
+{
+    __device__ To operator()(uintl v1, uintl v2)
+    {
+        return __popc(v1 ^ v2);
+    }
+};
+
+template<typename To>
+struct dist_op<uchar, To, AF_SHD>
+{
+    __device__ To operator()(uchar v1, uchar v2)
+    {
+        return __popc(v1 ^ v2);
+    }
+};
+
+
+template<typename T, typename To, af_match_type dist_type, unsigned feat_len, bool use_shmem>
+__global__ void nearest_neighbour_unroll(
     unsigned* out_idx,
-    unsigned* out_dist,
+    To* out_dist,
     CParam<T> query,
     CParam<T> train,
     const unsigned max_dist)
@@ -36,7 +92,7 @@ __global__ void hamming_matcher_unroll(
     unsigned f = blockDim.x * blockIdx.x + threadIdx.x;
     unsigned tid = threadIdx.x;
 
-    __shared__ unsigned s_dist[THREADS];
+    __shared__ To       s_dist[THREADS];
     __shared__ unsigned s_idx[THREADS];
 
     extern __shared__ char smem[];
@@ -59,6 +115,8 @@ __global__ void hamming_matcher_unroll(
     }
     __syncthreads();
 
+    dist_op<T, To, dist_type> op;
+
     for (unsigned j = 0; j < nquery; j++) {
         s_dist[tid] = max_dist;
 
@@ -69,17 +127,17 @@ __global__ void hamming_matcher_unroll(
         }
         __syncthreads();
 
-        unsigned dist = 0;
+        To dist = 0;
         if (valid_feat) {
             #pragma unroll
             for (unsigned k = 0; k < feat_len; k++) {
                 // Calculate Hamming distance for 32-bits of descriptor and
                 // accumulates to dist
                 if (use_shmem) {
-                    dist += __popc(s_train[k * blockDim.x + tid] ^ s_query[k]);
+                    dist += op(s_train[k * blockDim.x + tid], s_query[k]);
                 }
                 else {
-                    dist += __popc(train.ptr[k * ntrain + f] ^ s_query[k]);
+                    dist += op(train.ptr[k * ntrain + f], s_query[k]);
                 }
             }
 
@@ -159,13 +217,13 @@ __global__ void hamming_matcher_unroll(
     }
 }
 
-template<typename T, bool use_shmem>
-__global__ void hamming_matcher(
+template<typename T, typename To, af_match_type dist_type, bool use_shmem>
+__global__ void nearest_neighbour(
     unsigned* out_idx,
-    unsigned* out_dist,
+    To* out_dist,
     CParam<T> query,
     CParam<T> train,
-    const unsigned max_dist,
+    const To max_dist,
     const unsigned feat_len)
 {
     unsigned nquery = query.dims[0];
@@ -174,7 +232,7 @@ __global__ void hamming_matcher(
     unsigned f = blockDim.x * blockIdx.x + threadIdx.x;
     unsigned tid = threadIdx.x;
 
-    __shared__ unsigned s_dist[THREADS];
+    __shared__ To s_dist[THREADS];
     __shared__ unsigned s_idx[THREADS];
 
     extern __shared__ char smem[];
@@ -196,6 +254,8 @@ __global__ void hamming_matcher(
     }
     __syncthreads();
 
+    dist_op<T, To, dist_type> op;
+
     for (unsigned j = 0; j < nquery; j++) {
         s_dist[tid] = max_dist;
 
@@ -206,16 +266,16 @@ __global__ void hamming_matcher(
         }
         __syncthreads();
 
-        unsigned dist = 0;
+        To dist = 0;
         if (valid_feat) {
             for (unsigned k = 0; k < feat_len; k++) {
                 // Calculate Hamming distance for 32-bits of descriptor and
                 // accumulates to dist
                 if (use_shmem) {
-                    dist += __popc(s_train[k * blockDim.x + tid] ^ s_query[k]);
+                    dist += op(s_train[k * blockDim.x + tid], s_query[k]);
                 }
                 else {
-                    dist += __popc(train.ptr[k * ntrain + f] ^ s_query[k]);
+                    dist += op(train.ptr[k * ntrain + f], s_query[k]);
                 }
             }
 
@@ -295,19 +355,20 @@ __global__ void hamming_matcher(
     }
 }
 
+template<typename To>
 __global__ void select_matches(
     Param<unsigned> idx,
-    Param<unsigned> dist,
+    Param<To> dist,
     const unsigned* in_idx,
-    const unsigned* in_dist,
+    const To* in_dist,
     const unsigned nfeat,
     const unsigned nelem,
-    const unsigned max_dist)
+    const To max_dist)
 {
     unsigned f = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned sid = threadIdx.x * blockDim.y + threadIdx.y;
 
-    __shared__ unsigned s_dist[THREADS];
+    __shared__ To s_dist[THREADS];
     __shared__ unsigned s_idx[THREADS];
 
     if (f < nfeat) {
@@ -315,9 +376,9 @@ __global__ void select_matches(
         __syncthreads();
 
         for (unsigned i = threadIdx.y; i < nelem; i += blockDim.y) {
-            unsigned dist = in_dist[f * nelem + i];
+            To dist = in_dist[f * nelem + i];
 
-            // Copy all best matches previously found in hamming_matcher() to
+            // Copy all best matches previously found in nearest_neighbour() to
             // shared memory
             if (dist < s_dist[sid]) {
                 s_dist[sid] = dist;
@@ -329,7 +390,7 @@ __global__ void select_matches(
         // Reduce best matches and find the best of them all
         for (unsigned i = blockDim.y / 2; i > 0; i >>= 1) {
             if (threadIdx.y < i) {
-                unsigned dist = s_dist[sid + i];
+                To dist = s_dist[sid + i];
                 if (dist < s_dist[sid]) {
                     s_dist[sid] = dist;
                     s_idx[sid]  = s_idx[sid + i];
@@ -346,16 +407,16 @@ __global__ void select_matches(
     }
 }
 
-template<typename T>
-void hamming_matcher(Param<uint> idx,
-                     Param<uint> dist,
-                     CParam<T> query,
-                     CParam<T> train,
-                     const dim_t dist_dim,
-                     const unsigned n_dist)
+template<typename T, typename To, af_match_type dist_type>
+void nearest_neighbour(Param<uint> idx,
+                       Param<To> dist,
+                       CParam<T> query,
+                       CParam<T> train,
+                       const dim_t dist_dim,
+                       const unsigned n_dist)
 {
     const unsigned feat_len = query.dims[dist_dim];
-    const unsigned max_dist = feat_len * 8 * sizeof(T);
+    const To max_dist = limit_max<To>();
 
     if (feat_len > THREADS) {
         CUDA_NOT_SUPPORTED();
@@ -381,7 +442,7 @@ void hamming_matcher(Param<uint> idx,
     unsigned nblk = blocks.x;
 
     unsigned* d_blk_idx  = memAlloc<unsigned>(nblk * nquery);
-    unsigned* d_blk_dist = memAlloc<unsigned>(nblk * nquery);
+    To* d_blk_dist = memAlloc<To>(nblk * nquery);
 
     // For each query vector, find training vector with smallest Hamming
     // distance per CUDA block
@@ -389,35 +450,35 @@ void hamming_matcher(Param<uint> idx,
         switch(feat_len) {
         // Optimized lengths (faster due to loop unrolling)
         case 1:
-            hamming_matcher_unroll<T,1,true><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,1,true><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         case 2:
-            hamming_matcher_unroll<T,2,true><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,2,true><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         case 4:
-            hamming_matcher_unroll<T,4,true><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,4,true><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         case 8:
-            hamming_matcher_unroll<T,8,true><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,8,true><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         case 16:
-            hamming_matcher_unroll<T,16,true><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,16,true><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         case 32:
-            hamming_matcher_unroll<T,32,true><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,32,true><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         case 64:
-            hamming_matcher_unroll<T,64,true><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,64,true><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         default:
-            hamming_matcher<T,true><<<blocks, threads, smem_sz>>>
+            nearest_neighbour<T,To,dist_type,true><<<blocks, threads, smem_sz>>>
                            (d_blk_idx, d_blk_dist, query, train, max_dist, feat_len);
         }
     }
@@ -425,35 +486,35 @@ void hamming_matcher(Param<uint> idx,
         switch(feat_len) {
         // Optimized lengths (faster due to loop unrolling)
         case 1:
-            hamming_matcher_unroll<T,1,false><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,1,false><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         case 2:
-            hamming_matcher_unroll<T,2,false><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,2,false><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         case 4:
-            hamming_matcher_unroll<T,4,false><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,4,false><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         case 8:
-            hamming_matcher_unroll<T,8,false><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,8,false><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         case 16:
-            hamming_matcher_unroll<T,16,false><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,16,false><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         case 32:
-            hamming_matcher_unroll<T,32,false><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,32,false><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         case 64:
-            hamming_matcher_unroll<T,64,false><<<blocks, threads, smem_sz>>>
+            nearest_neighbour_unroll<T,To,dist_type,64,false><<<blocks, threads, smem_sz>>>
                                   (d_blk_idx, d_blk_dist, query, train, max_dist);
             break;
         default:
-            hamming_matcher<T,false><<<blocks, threads, smem_sz>>>
+            nearest_neighbour<T,To,dist_type,false><<<blocks, threads, smem_sz>>>
                            (d_blk_idx, d_blk_dist, query, train, max_dist, feat_len);
         }
     }
diff --git a/src/backend/cuda/math.hpp b/src/backend/cuda/math.hpp
index 3f7dcd4..83e3a0c 100644
--- a/src/backend/cuda/math.hpp
+++ b/src/backend/cuda/math.hpp
@@ -105,6 +105,10 @@ namespace cuda
     template<> __device__  float  limit_min<float>()  { return -CUDART_INF_F; }
     template<> __device__  double limit_max<double>() { return  CUDART_INF; }
     template<> __device__  double limit_min<double>() { return -CUDART_INF; }
+    template<> __device__  intl   limit_max<intl>()   { return 0x7fffffffffffffff; }
+    template<> __device__  intl   limit_min<intl>()   { return 0x8000000000000000; }
+    template<> __device__  uintl  limit_max<uintl>()  { return 0xffffffffffffffff; }
+    template<> __device__  uintl  limit_min<uintl>()  { return 0; }
 #endif
 
 #define upcast cuComplexFloatToDouble
diff --git a/src/backend/cuda/nearest_neighbour.cu b/src/backend/cuda/nearest_neighbour.cu
new file mode 100644
index 0000000..1899c9d
--- /dev/null
+++ b/src/backend/cuda/nearest_neighbour.cu
@@ -0,0 +1,79 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_cuda.hpp>
+#include <handle.hpp>
+#include <kernel/nearest_neighbour.hpp>
+#include <kernel/transpose.hpp>
+
+using af::dim4;
+
+namespace cuda
+{
+
+template<typename T, typename To>
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+                     const Array<T>& query, const Array<T>& train,
+                     const uint dist_dim, const uint n_dist,
+                     const af_match_type dist_type)
+{
+    uint sample_dim = (dist_dim == 0) ? 1 : 0;
+    const dim4 qDims = query.dims();
+    const dim4 tDims = train.dims();
+
+    const dim4 outDims(n_dist, qDims[sample_dim]);
+
+    idx  = createEmptyArray<uint>(outDims);
+    dist = createEmptyArray<To>(outDims);
+
+    Array<T> queryT = query;
+    Array<T> trainT = train;
+
+    if (dist_dim == 0) {
+        const dim4 queryTDims = dim4(qDims[1], qDims[0], qDims[2], qDims[3]);
+        const dim4 trainTDims = dim4(tDims[1], tDims[0], tDims[2], tDims[3]);
+        queryT = createEmptyArray<T>(queryTDims);
+        trainT = createEmptyArray<T>(trainTDims);
+
+        kernel::transpose<T, false>(queryT, query, query.ndims());
+        kernel::transpose<T, false>(trainT, train, train.ndims());
+    }
+
+    switch(dist_type) {
+        case AF_SAD: kernel::nearest_neighbour<T, To, AF_SAD>(idx, dist, queryT, trainT, 1, n_dist);
+                     break;
+        case AF_SSD: kernel::nearest_neighbour<T, To, AF_SSD>(idx, dist, queryT, trainT, 1, n_dist);
+                     break;
+        case AF_SHD: kernel::nearest_neighbour<T, To, AF_SHD>(idx, dist, queryT, trainT, 1, n_dist);
+                     break;
+        default: AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED);
+    }
+}
+
+#define INSTANTIATE(T, To)                                                              \
+    template void nearest_neighbour<T, To>(Array<uint>& idx, Array<To>& dist,           \
+                                         const Array<T>& query, const Array<T>& train,  \
+                                         const uint dist_dim, const uint n_dist,        \
+                                         const af_match_type dist_type);
+
+INSTANTIATE(float , float)
+INSTANTIATE(double, double)
+INSTANTIATE(int   , int)
+INSTANTIATE(uint  , uint)
+INSTANTIATE(intl  , intl)
+INSTANTIATE(uintl , uintl)
+INSTANTIATE(uchar , uint)
+
+INSTANTIATE(uintl, uint)    // For Hamming
+
+}
diff --git a/src/backend/cuda/hamming.hpp b/src/backend/cuda/nearest_neighbour.hpp
similarity index 57%
rename from src/backend/cuda/hamming.hpp
rename to src/backend/cuda/nearest_neighbour.hpp
index 37d3944..443f972 100644
--- a/src/backend/cuda/hamming.hpp
+++ b/src/backend/cuda/nearest_neighbour.hpp
@@ -14,9 +14,10 @@ using af::features;
 namespace cuda
 {
 
-template<typename T>
-void hamming_matcher(Array<uint>& idx, Array<uint>& dist,
-                     const Array<T>& query, const Array<T>& train,
-                     const uint dist_dim, const uint n_dist);
+template<typename T, typename To>
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+                       const Array<T>& query, const Array<T>& train,
+                       const uint dist_dim, const uint n_dist,
+                       const af_match_type dist_type = AF_SSD);
 
 }
diff --git a/src/backend/opencl/hamming.cpp b/src/backend/opencl/hamming.cpp
deleted file mode 100644
index 5d4bd59..0000000
--- a/src/backend/opencl/hamming.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/*******************************************************
- * Copyright (c) 2014, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#include <af/dim4.hpp>
-#include <af/defines.h>
-#include <ArrayInfo.hpp>
-#include <Array.hpp>
-#include <err_opencl.hpp>
-#include <handle.hpp>
-#include <kernel/hamming.hpp>
-#include <kernel/transpose.hpp>
-
-using af::dim4;
-using cl::Device;
-
-namespace opencl
-{
-
-static const unsigned THREADS = 256;
-
-template<typename T>
-void hamming_matcher(Array<uint>& idx, Array<uint>& dist,
-                     const Array<T>& query, const Array<T>& train,
-                     const uint dist_dim, const uint n_dist)
-{
-    uint sample_dim = (dist_dim == 0) ? 1 : 0;
-    const dim4 qDims = query.dims();
-    const dim4 tDims = train.dims();
-
-    const dim4 outDims(n_dist, qDims[sample_dim]);
-
-    idx  = createEmptyArray<uint>(outDims);
-    dist = createEmptyArray<uint>(outDims);
-
-    const unsigned feat_len = qDims[dist_dim];
-
-    // Determine maximum feat_len capable of using shared memory (faster)
-    cl_ulong avail_lmem = getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
-    size_t lmem_predef = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
-    size_t ltrain_sz = THREADS * feat_len * sizeof(T);
-    bool use_lmem = (avail_lmem >= (lmem_predef + ltrain_sz)) ? true : false;
-    size_t lmem_sz = (use_lmem) ? lmem_predef + ltrain_sz : lmem_predef;
-
-    Array<T> queryT = query;
-    Array<T> trainT = train;
-
-    if (dist_dim == 0) {
-        const dim4 queryTDims = dim4(qDims[1], qDims[0], qDims[2], qDims[3]);
-        const dim4 trainTDims = dim4(tDims[1], tDims[0], tDims[2], tDims[3]);
-        queryT = createEmptyArray<T>(queryTDims);
-        trainT = createEmptyArray<T>(trainTDims);
-
-        bool queryIs32Multiple = false;
-        if (qDims[0] % 32 == 0 && qDims[1] % 32 == 0)
-            queryIs32Multiple = true;
-
-        bool trainIs32Multiple = false;
-        if (tDims[0] % 32 == 0 && tDims[1] % 32 == 0)
-        trainIs32Multiple = true;
-
-        if (queryIs32Multiple)
-            kernel::transpose<T, false, true >(queryT, query);
-        else
-            kernel::transpose<T, false, false>(queryT, query);
-
-        if (trainIs32Multiple)
-            kernel::transpose<T, false, true >(trainT, train);
-        else
-            kernel::transpose<T, false, false>(trainT, train);
-    }
-
-    if (use_lmem) {
-        switch (feat_len) {
-        case 1:
-            kernel::hamming_matcher<T, true , 1 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        case 2:
-            kernel::hamming_matcher<T, true , 2 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        case 4:
-            kernel::hamming_matcher<T, true , 4 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        case 8:
-            kernel::hamming_matcher<T, true , 8 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        case 16:
-            kernel::hamming_matcher<T, true , 16>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        case 32:
-            kernel::hamming_matcher<T, true , 32>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        case 64:
-            kernel::hamming_matcher<T, true , 64>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        default:
-            kernel::hamming_matcher<T, true , 0 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        }
-    } else {
-        switch (feat_len) {
-        case 1:
-            kernel::hamming_matcher<T, false, 1 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        case 2:
-            kernel::hamming_matcher<T, false, 2 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        case 4:
-            kernel::hamming_matcher<T, false, 4 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        case 8:
-            kernel::hamming_matcher<T, false, 8 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        case 16:
-            kernel::hamming_matcher<T, false, 16>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        case 32:
-            kernel::hamming_matcher<T, false, 32>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        case 64:
-            kernel::hamming_matcher<T, false, 64>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        default:
-            kernel::hamming_matcher<T, false, 0 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
-            break;
-        }
-    }
-}
-
-#define INSTANTIATE(T)\
-    template void hamming_matcher<T>(Array<uint>& idx, Array<uint>& dist,           \
-                                     const Array<T>& query, const Array<T>& train,  \
-                                     const uint dist_dim, const uint n_dist);
-
-INSTANTIATE(uchar)
-INSTANTIATE(uint)
-
-}
diff --git a/src/backend/opencl/kernel/hamming.cl b/src/backend/opencl/kernel/nearest_neighbour.cl
similarity index 89%
rename from src/backend/opencl/kernel/hamming.cl
rename to src/backend/opencl/kernel/nearest_neighbour.cl
index 68ce6fd..024fdc6 100644
--- a/src/backend/opencl/kernel/hamming.cl
+++ b/src/backend/opencl/kernel/nearest_neighbour.cl
@@ -20,15 +20,39 @@ __inline unsigned popcount(unsigned x)
 }
 #endif
 
+#ifdef USE_DOUBLE
+To _sad_(T v1, T v2)
+{
+    return fabs(v1 - v2);
+}
+#else
+To _sad_(T v1, T v2)
+{
+    return fabs((float)v1 - (float)v2);
+}
+#endif
+
+To _ssd_(T v1, T v2)
+{
+    return (v1 - v2) * (v1 - v2);
+}
+
+#ifdef __SHD__
+unsigned _shd_(T v1, T v2)
+{
+    return popcount(v1 ^ v2);
+}
+#endif
+
 __kernel
-void hamming_matcher_unroll(
+void nearest_neighbour_unroll(
     __global unsigned* out_idx,
-    __global unsigned* out_dist,
+    __global To* out_dist,
     __global const T* query,
     KParam qInfo,
     __global const T* train,
     KParam tInfo,
-    const unsigned max_dist,
+    const To max_dist,
     __local T* lmem)
 {
     unsigned nquery = qInfo.dims[0];
@@ -37,7 +61,7 @@ void hamming_matcher_unroll(
     unsigned f = get_global_id(0);
     unsigned tid = get_local_id(0);
 
-    __local unsigned l_dist[THREADS];
+    __local To l_dist[THREADS];
     __local unsigned l_idx[THREADS];
 
     __local T* l_query = lmem;
@@ -69,16 +93,16 @@ void hamming_matcher_unroll(
         }
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        unsigned dist = 0;
+        To dist = 0;
         if (valid_feat) {
             #pragma unroll
             for (int k = 0; k < (int)FEAT_LEN; k++) {
                 // Calculate Hamming distance for 32-bits of descriptor and
                 // accumulates to dist
 #ifdef USE_LOCAL_MEM
-                dist += popcount(l_train[k * get_local_size(0) + tid] ^ l_query[k]);
+                dist += DISTOP(l_train[k * get_local_size(0) + tid], l_query[k]);
 #else
-                dist += popcount(train[k * ntrain + f] ^ l_query[k]);
+                dist += DISTOP(train[k * ntrain + f], l_query[k]);
 #endif
             }
         }
@@ -161,14 +185,14 @@ void hamming_matcher_unroll(
 }
 
 __kernel
-void hamming_matcher(
+void nearest_neighbour(
     __global unsigned* out_idx,
-    __global unsigned* out_dist,
+    __global To* out_dist,
     __global const T* query,
     KParam qInfo,
     __global const T* train,
     KParam tInfo,
-    const unsigned max_dist,
+    const To max_dist,
     const unsigned feat_len,
     __local T* lmem)
 {
@@ -178,7 +202,7 @@ void hamming_matcher(
     unsigned f = get_global_id(0);
     unsigned tid = get_local_id(0);
 
-    __local unsigned l_dist[THREADS];
+    __local To l_dist[THREADS];
     __local unsigned l_idx[THREADS];
 
     __local T* l_query = lmem;
@@ -209,15 +233,15 @@ void hamming_matcher(
         }
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        unsigned dist = 0;
+        To dist = 0;
         if (valid_feat) {
             for (int k = 0; k < (int)feat_len; k++) {
                 // Calculate Hamming distance for 32-bits of descriptor and
                 // accumulates to dist
 #ifdef USE_LOCAL_MEM
-                dist += popcount(l_train[k * get_local_size(0) + tid] ^ l_query[k]);
+                dist += DISTOP(l_train[k * get_local_size(0) + tid], l_query[k]);
 #else
-                dist += popcount(train[k * ntrain + f] ^ l_query[k]);
+                dist += DISTOP(train[k * ntrain + f], l_query[k]);
 #endif
             }
         }
@@ -302,18 +326,18 @@ void hamming_matcher(
 __kernel
 void select_matches(
     __global unsigned* idx,
-    __global unsigned* dist,
+    __global To* dist,
     __global const unsigned* in_idx,
-    __global const unsigned* in_dist,
+    __global const To* in_dist,
     const unsigned nfeat,
     const unsigned nelem,
-    const unsigned max_dist)
+    const To max_dist)
 {
     unsigned f = get_global_id(0);
     unsigned lsz1 = get_local_size(1);
     unsigned sid = get_local_id(0) * lsz1 + get_local_id(1);
 
-    __local unsigned l_dist[THREADS];
+    __local To l_dist[THREADS];
     __local unsigned l_idx[THREADS];
 
     bool valid_feat = (f < nfeat);
@@ -327,9 +351,9 @@ void select_matches(
 
     for (unsigned i = get_local_id(1); i < nelem_max; i += get_local_size(1)) {
         if (valid_feat && i < nelem) {
-            unsigned dist = in_dist[f * nelem + i];
+            To dist = in_dist[f * nelem + i];
 
-            // Copy all best matches previously found in hamming_matcher() to
+            // Copy all best matches previously found in nearest_neighbour() to
             // shared memory
             if (dist < l_dist[sid]) {
                 l_dist[sid] = dist;
@@ -342,7 +366,7 @@ void select_matches(
     for (unsigned i = get_local_size(1) / 2; i > 0; i >>= 1) {
         if (get_local_id(1) < i) {
             if (valid_feat) {
-                unsigned dist = l_dist[sid + i];
+                To dist = l_dist[sid + i];
                 if (dist < l_dist[sid]) {
                     l_dist[sid] = dist;
                     l_idx[sid]  = l_idx[sid + i];
diff --git a/src/backend/opencl/kernel/hamming.hpp b/src/backend/opencl/kernel/nearest_neighbour.hpp
similarity index 70%
rename from src/backend/opencl/kernel/hamming.hpp
rename to src/backend/opencl/kernel/nearest_neighbour.hpp
index b89ee15..601dcc3 100644
--- a/src/backend/opencl/kernel/hamming.hpp
+++ b/src/backend/opencl/kernel/nearest_neighbour.hpp
@@ -12,8 +12,9 @@
 #include <dispatch.hpp>
 #include <err_opencl.hpp>
 #include <debug_opencl.hpp>
-#include <kernel_headers/hamming.hpp>
+#include <kernel_headers/nearest_neighbour.hpp>
 #include <memory.hpp>
+#include <math.hpp>
 
 using cl::LocalSpaceArg;
 
@@ -25,18 +26,18 @@ namespace kernel
 
 static const unsigned THREADS = 256;
 
-template<typename T, bool use_lmem, unsigned unroll_len>
-void hamming_matcher(Param idx,
-                     Param dist,
-                     Param query,
-                     Param train,
-                     const dim_t dist_dim,
-                     const unsigned n_dist,
-                     const size_t lmem_sz)
+template<typename T, typename To, af_match_type dist_type, bool use_lmem, unsigned unroll_len>
+void nearest_neighbour(Param idx,
+                       Param dist,
+                       Param query,
+                       Param train,
+                       const dim_t dist_dim,
+                       const unsigned n_dist,
+                       const size_t lmem_sz)
 {
     try {
         static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
-        static Program        hammingProgs[DeviceManager::MAX_DEVICES];
+        static Program        nearest_neighbourProgs[DeviceManager::MAX_DEVICES];
         static Kernel             huKernel[DeviceManager::MAX_DEVICES];
         static Kernel             hmKernel[DeviceManager::MAX_DEVICES];
         static Kernel             smKernel[DeviceManager::MAX_DEVICES];
@@ -44,7 +45,7 @@ void hamming_matcher(Param idx,
         int device = getActiveDeviceId();
 
         const unsigned feat_len = query.info.dims[dist_dim];
-        const unsigned max_dist = feat_len * 8 * sizeof(T);
+        const To max_dist = limit_max<To>();
 
         if (feat_len > THREADS) {
             OPENCL_NOT_SUPPORTED();
@@ -54,20 +55,29 @@ void hamming_matcher(Param idx,
 
                 std::ostringstream options;
                 options << " -D T=" << dtype_traits<T>::getName()
+                        << " -D To=" << dtype_traits<To>::getName()
                         << " -D THREADS=" << THREADS
                         << " -D FEAT_LEN=" << unroll_len;
 
+                switch(dist_type) {
+                    case AF_SAD: options <<" -D DISTOP=_sad_"; break;
+                    case AF_SSD: options <<" -D DISTOP=_ssd_"; break;
+                    case AF_SHD: options <<" -D DISTOP=_shd_ -D __SHD__";
+                                 break;
+                    default: break;
+                }
+
                 if (use_lmem)
                     options << " -D USE_LOCAL_MEM";
 
-                buildProgram(hammingProgs[device],
-                             hamming_cl,
-                             hamming_cl_len,
+                buildProgram(nearest_neighbourProgs[device],
+                             nearest_neighbour_cl,
+                             nearest_neighbour_cl_len,
                              options.str());
 
-                huKernel[device] = Kernel(hammingProgs[device], "hamming_matcher_unroll");
-                hmKernel[device] = Kernel(hammingProgs[device], "hamming_matcher");
-                smKernel[device] = Kernel(hammingProgs[device], "select_matches");
+                huKernel[device] = Kernel(nearest_neighbourProgs[device], "nearest_neighbour_unroll");
+                hmKernel[device] = Kernel(nearest_neighbourProgs[device], "nearest_neighbour");
+                smKernel[device] = Kernel(nearest_neighbourProgs[device], "select_matches");
             });
 
         const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;
@@ -80,7 +90,7 @@ void hamming_matcher(Param idx,
         const NDRange global(nblk * THREADS, 1);
 
         cl::Buffer *d_blk_idx  = bufferAlloc(nblk * nquery * sizeof(unsigned));
-        cl::Buffer *d_blk_dist = bufferAlloc(nblk * nquery * sizeof(unsigned));
+        cl::Buffer *d_blk_dist = bufferAlloc(nblk * nquery * sizeof(To));
 
         // For each query vector, find training vector with smallest Hamming
         // distance per CUDA block
@@ -88,7 +98,7 @@ void hamming_matcher(Param idx,
             auto huOp = make_kernel<Buffer, Buffer,
                                     Buffer, KParam,
                                     Buffer, KParam,
-                                    const unsigned,
+                                    const To,
                                     LocalSpaceArg> (huKernel[device]);
 
             huOp(EnqueueArgs(getQueue(), global, local),
@@ -100,7 +110,7 @@ void hamming_matcher(Param idx,
             auto hmOp = make_kernel<Buffer, Buffer,
                                     Buffer, KParam,
                                     Buffer, KParam,
-                                    const unsigned, const unsigned,
+                                    const To, const unsigned,
                                     LocalSpaceArg> (hmKernel[device]);
 
             hmOp(EnqueueArgs(getQueue(), global, local),
@@ -117,7 +127,7 @@ void hamming_matcher(Param idx,
         // best match
         auto smOp = make_kernel<Buffer, Buffer, Buffer, Buffer,
                                 const unsigned, const unsigned,
-                                const unsigned> (smKernel[device]);
+                                const To> (smKernel[device]);
 
         smOp(EnqueueArgs(getQueue(), global_sm, local_sm),
              *idx.data, *dist.data,
diff --git a/src/backend/opencl/nearest_neighbour.cpp b/src/backend/opencl/nearest_neighbour.cpp
new file mode 100644
index 0000000..cd86b7f
--- /dev/null
+++ b/src/backend/opencl/nearest_neighbour.cpp
@@ -0,0 +1,165 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_opencl.hpp>
+#include <handle.hpp>
+#include <kernel/nearest_neighbour.hpp>
+#include <kernel/transpose.hpp>
+
+using af::dim4;
+using cl::Device;
+
+namespace opencl
+{
+
+static const unsigned THREADS = 256;
+
+template<typename T, typename To, af_match_type dist_type>
+void nearest_neighbour_(Array<uint>& idx, Array<To>& dist,
+                     const Array<T>& query, const Array<T>& train,
+                     const uint dist_dim, const uint n_dist)
+{
+    uint sample_dim = (dist_dim == 0) ? 1 : 0;
+    const dim4 qDims = query.dims();
+    const dim4 tDims = train.dims();
+
+    const dim4 outDims(n_dist, qDims[sample_dim]);
+
+    idx  = createEmptyArray<uint>(outDims);
+    dist = createEmptyArray<To>(outDims);
+
+    const unsigned feat_len = qDims[dist_dim];
+
+    // Determine maximum feat_len capable of using shared memory (faster)
+    cl_ulong avail_lmem = getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+    size_t lmem_predef = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
+    size_t ltrain_sz = THREADS * feat_len * sizeof(T);
+    bool use_lmem = (avail_lmem >= (lmem_predef + ltrain_sz)) ? true : false;
+    size_t lmem_sz = (use_lmem) ? lmem_predef + ltrain_sz : lmem_predef;
+
+    Array<T> queryT = query;
+    Array<T> trainT = train;
+
+    if (dist_dim == 0) {
+        const dim4 queryTDims = dim4(qDims[1], qDims[0], qDims[2], qDims[3]);
+        const dim4 trainTDims = dim4(tDims[1], tDims[0], tDims[2], tDims[3]);
+        queryT = createEmptyArray<T>(queryTDims);
+        trainT = createEmptyArray<T>(trainTDims);
+
+        bool queryIs32Multiple = false;
+        if (qDims[0] % 32 == 0 && qDims[1] % 32 == 0)
+            queryIs32Multiple = true;
+
+        bool trainIs32Multiple = false;
+        if (tDims[0] % 32 == 0 && tDims[1] % 32 == 0)
+        trainIs32Multiple = true;
+
+        if (queryIs32Multiple)
+            kernel::transpose<T, false, true >(queryT, query);
+        else
+            kernel::transpose<T, false, false>(queryT, query);
+
+        if (trainIs32Multiple)
+            kernel::transpose<T, false, true >(trainT, train);
+        else
+            kernel::transpose<T, false, false>(trainT, train);
+    }
+
+    if (use_lmem) {
+        switch (feat_len) {
+        case 1:
+            kernel::nearest_neighbour<T, To, dist_type, true , 1 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        case 2:
+            kernel::nearest_neighbour<T, To, dist_type, true , 2 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        case 4:
+            kernel::nearest_neighbour<T, To, dist_type, true , 4 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        case 8:
+            kernel::nearest_neighbour<T, To, dist_type, true , 8 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        case 16:
+            kernel::nearest_neighbour<T, To, dist_type, true , 16>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        case 32:
+            kernel::nearest_neighbour<T, To, dist_type, true , 32>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        case 64:
+            kernel::nearest_neighbour<T, To, dist_type, true , 64>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        default:
+            kernel::nearest_neighbour<T, To, dist_type, true , 0 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        }
+    } else {
+        switch (feat_len) {
+        case 1:
+            kernel::nearest_neighbour<T, To, dist_type, false, 1 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        case 2:
+            kernel::nearest_neighbour<T, To, dist_type, false, 2 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        case 4:
+            kernel::nearest_neighbour<T, To, dist_type, false, 4 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        case 8:
+            kernel::nearest_neighbour<T, To, dist_type, false, 8 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        case 16:
+            kernel::nearest_neighbour<T, To, dist_type, false, 16>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        case 32:
+            kernel::nearest_neighbour<T, To, dist_type, false, 32>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        case 64:
+            kernel::nearest_neighbour<T, To, dist_type, false, 64>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        default:
+            kernel::nearest_neighbour<T, To, dist_type, false, 0 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+            break;
+        }
+    }
+}
+
+template<typename T, typename To>
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+                       const Array<T>& query, const Array<T>& train,
+                       const uint dist_dim, const uint n_dist,
+                       const af_match_type dist_type)
+{
+    switch(dist_type) {
+        case AF_SAD: nearest_neighbour_<T, To, AF_SAD>(idx, dist, query, train, dist_dim, n_dist); break;
+        case AF_SSD: nearest_neighbour_<T, To, AF_SSD>(idx, dist, query, train, dist_dim, n_dist); break;
+        case AF_SHD: nearest_neighbour_<T, To, AF_SHD>(idx, dist, query, train, dist_dim, n_dist); break;
+        default: AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED);
+    }
+}
+
+#define INSTANTIATE(T, To)                                                              \
+    template void nearest_neighbour<T, To>(Array<uint>& idx, Array<To>& dist,           \
+                                         const Array<T>& query, const Array<T>& train,  \
+                                         const uint dist_dim, const uint n_dist,        \
+                                         const af_match_type dist_type);
+
+INSTANTIATE(float , float)
+INSTANTIATE(double, double)
+INSTANTIATE(int   , int)
+INSTANTIATE(uint  , uint)
+INSTANTIATE(intl  , intl)
+INSTANTIATE(uintl , uintl)
+INSTANTIATE(uchar , uint)
+
+INSTANTIATE(uintl, uint)    // For Hamming
+
+}
diff --git a/src/backend/opencl/hamming.hpp b/src/backend/opencl/nearest_neighbour.hpp
similarity index 58%
rename from src/backend/opencl/hamming.hpp
rename to src/backend/opencl/nearest_neighbour.hpp
index 18689ef..787e4a4 100644
--- a/src/backend/opencl/hamming.hpp
+++ b/src/backend/opencl/nearest_neighbour.hpp
@@ -14,9 +14,10 @@ using af::features;
 namespace opencl
 {
 
-template<typename T>
-void hamming_matcher(Array<uint>& idx, Array<uint>& dist,
-                     const Array<T>& query, const Array<T>& train,
-                     const uint dist_dim, const uint n_dist);
+template<typename T, typename To>
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+                       const Array<T>& query, const Array<T>& train,
+                       const uint dist_dim, const uint n_dist,
+                       const af_match_type dist_type = AF_SSD);
 
 }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git