[arrayfire] 37/408: FEAT Added nearest neighbour with SSD, SAD and SHD
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:12 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit e2ee96e396e477660fbe3c9386c8f1c4760aec9f
Author: Shehzan Mohammed <shehzan at arrayfire.com>
Date: Wed Jun 24 17:20:58 2015 -0400
FEAT Added nearest neighbour with SSD, SAD and SHD
---
include/af/vision.h | 10 ++
src/api/c/hamming.cpp | 51 +-----
src/api/c/nearest_neighbour.cpp | 90 +++++++++++
src/api/cpp/hamming.cpp | 2 +-
src/api/cpp/{hamming.cpp => nearest_neighbour.cpp} | 9 +-
src/backend/cpu/hamming.cpp | 103 ------------
src/backend/cpu/nearest_neighbour.cpp | 175 +++++++++++++++++++++
.../cpu/{hamming.hpp => nearest_neighbour.hpp} | 9 +-
src/backend/cuda/hamming.cu | 62 --------
.../kernel/{hamming.hpp => nearest_neighbour.hpp} | 155 ++++++++++++------
src/backend/cuda/math.hpp | 4 +
src/backend/cuda/nearest_neighbour.cu | 79 ++++++++++
.../cuda/{hamming.hpp => nearest_neighbour.hpp} | 9 +-
src/backend/opencl/hamming.cpp | 143 -----------------
.../kernel/{hamming.cl => nearest_neighbour.cl} | 66 +++++---
.../kernel/{hamming.hpp => nearest_neighbour.hpp} | 52 +++---
src/backend/opencl/nearest_neighbour.cpp | 165 +++++++++++++++++++
.../opencl/{hamming.hpp => nearest_neighbour.hpp} | 9 +-
18 files changed, 729 insertions(+), 464 deletions(-)
diff --git a/include/af/vision.h b/include/af/vision.h
index ed5e05f..d40a150 100644
--- a/include/af/vision.h
+++ b/include/af/vision.h
@@ -90,6 +90,11 @@ AFAPI void hammingMatcher(array& idx, array& dist,
const array& query, const array& train,
const dim_t dist_dim=0, const unsigned n_dist=1);
+AFAPI void nearestNeighbour(array& idx, array& dist,
+ const array& query, const array& train,
+ const dim_t dist_dim=0, const unsigned n_dist=1,
+ const af_match_type dist_type = AF_SSD);
+
/**
C++ Interface for image template matching
@@ -190,6 +195,11 @@ extern "C" {
const af_array query, const af_array train,
const dim_t dist_dim, const unsigned n_dist);
+ AFAPI af_err af_nearest_neighbour(af_array* idx, af_array* dist,
+ const af_array query, const af_array train,
+ const dim_t dist_dim, const unsigned n_dist,
+ const af_match_type dist_type);
+
/**
C Interface for image template matching
diff --git a/src/api/c/hamming.cpp b/src/api/c/hamming.cpp
index 8a1dd91..057d6f9 100644
--- a/src/api/c/hamming.cpp
+++ b/src/api/c/hamming.cpp
@@ -7,59 +7,10 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-#include <af/dim4.hpp>
#include <af/defines.h>
#include <af/vision.h>
-#include <handle.hpp>
-#include <err_common.hpp>
-#include <backend.hpp>
-#include <hamming.hpp>
-
-using af::dim4;
-using namespace detail;
-
-template<typename T>
-static void hamming_matcher(af_array* idx, af_array* dist, const af_array query, const af_array train, const dim_t dist_dim, const uint n_dist)
-{
- Array<uint> oIdxArray = createEmptyArray<uint>(af::dim4());
- Array<uint> oDistArray = createEmptyArray<uint>(af::dim4());
-
- hamming_matcher<T>(oIdxArray, oDistArray, getArray<T>(query), getArray<T>(train), dist_dim, n_dist);
-
- *idx = getHandle<uint>(oIdxArray);
- *dist = getHandle<uint>(oDistArray);
-}
af_err af_hamming_matcher(af_array* idx, af_array* dist, const af_array query, const af_array train, const dim_t dist_dim, const uint n_dist)
{
- try {
- ArrayInfo qInfo = getInfo(query);
- ArrayInfo tInfo = getInfo(train);
- af_dtype qType = qInfo.getType();
- af_dtype tType = tInfo.getType();
- af::dim4 qDims = qInfo.dims();
- af::dim4 tDims = tInfo.dims();
-
- uint train_samples = (dist_dim == 0) ? 1 : 0;
-
- DIM_ASSERT(3, qDims[dist_dim] == tDims[dist_dim]);
- DIM_ASSERT(3, qDims[2] == 1 && qDims[3] == 1);
- DIM_ASSERT(3, qType == tType);
- DIM_ASSERT(4, tDims[2] == 1 && tDims[3] == 1);
- DIM_ASSERT(5, (dist_dim == 0 || dist_dim == 1));
- DIM_ASSERT(6, n_dist > 0 && n_dist <= (uint)tDims[train_samples]);
-
- af_array oIdx;
- af_array oDist;
- switch(qType) {
- case u8: hamming_matcher<uchar>(&oIdx, &oDist, query, train, dist_dim, n_dist); break;
- case u32: hamming_matcher<uint >(&oIdx, &oDist, query, train, dist_dim, n_dist); break;
- default : TYPE_ERROR(1, qType);
- }
- std::swap(*idx, oIdx);
- std::swap(*dist, oDist);
- }
- CATCHALL;
-
- return AF_SUCCESS;
+ return af_nearest_neighbour(idx, dist, query, train, dist_dim, n_dist, AF_SHD);
}
diff --git a/src/api/c/nearest_neighbour.cpp b/src/api/c/nearest_neighbour.cpp
new file mode 100644
index 0000000..d47e0ae
--- /dev/null
+++ b/src/api/c/nearest_neighbour.cpp
@@ -0,0 +1,90 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <af/vision.h>
+#include <handle.hpp>
+#include <err_common.hpp>
+#include <backend.hpp>
+#include <nearest_neighbour.hpp>
+
+using af::dim4;
+using namespace detail;
+
+template<typename Ti, typename To>
+static void nearest_neighbour(af_array* idx, af_array* dist,
+ const af_array query, const af_array train,
+ const dim_t dist_dim, const uint n_dist,
+ const af_match_type dist_type)
+{
+ Array<uint> oIdxArray = createEmptyArray<uint>(af::dim4());
+ Array<To> oDistArray = createEmptyArray<To>(af::dim4());
+
+ nearest_neighbour<Ti, To>(oIdxArray, oDistArray, getArray<Ti>(query), getArray<Ti>(train),
+ dist_dim, n_dist, dist_type);
+
+ *idx = getHandle<uint>(oIdxArray);
+ *dist = getHandle<To>(oDistArray);
+}
+
+af_err af_nearest_neighbour(af_array* idx, af_array* dist,
+ const af_array query, const af_array train,
+ const dim_t dist_dim, const uint n_dist,
+ const af_match_type dist_type)
+{
+ try {
+ ArrayInfo qInfo = getInfo(query);
+ ArrayInfo tInfo = getInfo(train);
+ af_dtype qType = qInfo.getType();
+ af_dtype tType = tInfo.getType();
+ af::dim4 qDims = qInfo.dims();
+ af::dim4 tDims = tInfo.dims();
+
+ uint train_samples = (dist_dim == 0) ? 1 : 0;
+
+ DIM_ASSERT(2, qDims[dist_dim] == tDims[dist_dim]);
+ DIM_ASSERT(2, qDims[2] == 1 && qDims[3] == 1);
+ DIM_ASSERT(3, tDims[2] == 1 && tDims[3] == 1);
+ DIM_ASSERT(4, (dist_dim == 0 || dist_dim == 1));
+ DIM_ASSERT(5, n_dist > 0 && n_dist <= (uint)tDims[train_samples]);
+ ARG_ASSERT(6, dist_type == AF_SAD || dist_type == AF_SSD || dist_type == AF_SHD);
+ TYPE_ASSERT(qType == tType);
+
+ // For Hamming, only u8, u32 and u64 allowed.
+ af_array oIdx;
+ af_array oDist;
+
+ if(dist_type == AF_SHD) {
+ TYPE_ASSERT(qType == u8 || qType == u32 || qType == u64);
+ switch(qType) {
+ case u8: nearest_neighbour<uchar, uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+ case u32: nearest_neighbour<uint , uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+ case u64: nearest_neighbour<uintl, uint>(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+ default : TYPE_ERROR(1, qType);
+ }
+ } else {
+ switch(qType) {
+ case f32: nearest_neighbour<float , float >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+ case f64: nearest_neighbour<double, double>(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+ case s32: nearest_neighbour<int , int >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+ case u32: nearest_neighbour<uint , uint >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+ case s64: nearest_neighbour<intl , intl >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+ case u64: nearest_neighbour<uintl , uintl >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+ case u8: nearest_neighbour<uchar , uint >(&oIdx, &oDist, query, train, dist_dim, n_dist, dist_type); break;
+ default : TYPE_ERROR(1, qType);
+ }
+ }
+ std::swap(*idx, oIdx);
+ std::swap(*dist, oDist);
+ }
+ CATCHALL;
+
+ return AF_SUCCESS;
+}
diff --git a/src/api/cpp/hamming.cpp b/src/api/cpp/hamming.cpp
index f97c34c..4ef8d1e 100644
--- a/src/api/cpp/hamming.cpp
+++ b/src/api/cpp/hamming.cpp
@@ -20,7 +20,7 @@ void hammingMatcher(array& idx, array& dist,
{
af_array temp_idx = 0;
af_array temp_dist = 0;
- AF_THROW(af_hamming_matcher(&temp_idx, &temp_dist, query.get(), train.get(), dist_dim, n_dist));
+ AF_THROW(af_nearest_neighbour(&temp_idx, &temp_dist, query.get(), train.get(), dist_dim, n_dist, AF_SHD));
idx = array(temp_idx);
dist = array(temp_dist);
}
diff --git a/src/api/cpp/hamming.cpp b/src/api/cpp/nearest_neighbour.cpp
similarity index 60%
copy from src/api/cpp/hamming.cpp
copy to src/api/cpp/nearest_neighbour.cpp
index f97c34c..5dae7ab 100644
--- a/src/api/cpp/hamming.cpp
+++ b/src/api/cpp/nearest_neighbour.cpp
@@ -14,13 +14,14 @@
namespace af
{
-void hammingMatcher(array& idx, array& dist,
- const array& query, const array& train,
- const dim_t dist_dim, const unsigned n_dist)
+void nearestNeighbour(array& idx, array& dist,
+ const array& query, const array& train,
+ const dim_t dist_dim, const unsigned n_dist,
+ const af_match_type dist_type)
{
af_array temp_idx = 0;
af_array temp_dist = 0;
- AF_THROW(af_hamming_matcher(&temp_idx, &temp_dist, query.get(), train.get(), dist_dim, n_dist));
+ AF_THROW(af_nearest_neighbour(&temp_idx, &temp_dist, query.get(), train.get(), dist_dim, n_dist, dist_type));
idx = array(temp_idx);
dist = array(temp_dist);
}
diff --git a/src/backend/cpu/hamming.cpp b/src/backend/cpu/hamming.cpp
deleted file mode 100644
index 544883b..0000000
--- a/src/backend/cpu/hamming.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/*******************************************************
- * Copyright (c) 2014, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#include <af/dim4.hpp>
-#include <af/defines.h>
-#include <ArrayInfo.hpp>
-#include <Array.hpp>
-#include <err_cpu.hpp>
-#include <handle.hpp>
-
-using af::dim4;
-
-namespace cpu
-{
-
-#if defined(_WIN32) || defined(_MSC_VER)
-
-#include <intrin.h>
-#define __builtin_popcount __popcnt
-
-#endif
-
-template<typename T>
-inline uint hamming_distance(T v1, T v2)
-{
- return __builtin_popcount(v1 ^ v2);
-}
-
-template<typename T>
-void hamming_matcher(Array<uint>& idx, Array<uint>& dist,
- const Array<T>& query, const Array<T>& train,
- const uint dist_dim, const uint n_dist)
-{
- uint sample_dim = (dist_dim == 0) ? 1 : 0;
- const dim4 qDims = query.dims();
- const dim4 tDims = train.dims();
-
- if (n_dist > 1) {
- CPU_NOT_SUPPORTED();
- }
-
- const unsigned distLength = qDims[dist_dim];
- const unsigned nQuery = qDims[sample_dim];
- const unsigned nTrain = tDims[sample_dim];
-
- const dim4 outDims(n_dist, nQuery);
-
- idx = createEmptyArray<uint>(outDims);
- dist = createEmptyArray<uint>(outDims);
-
- const T* qPtr = query.get();
- const T* tPtr = train.get();
- uint* iPtr = idx.get();
- uint* dPtr = dist.get();
-
- for (unsigned i = 0; i < nQuery; i++) {
- unsigned best_dist = limit_max<unsigned>();
- unsigned best_idx = 0;
-
- for (unsigned j = 0; j < nTrain; j++) {
- unsigned local_dist = 0;
- for (unsigned k = 0; k < distLength; k++) {
- size_t qIdx, tIdx;
- if (sample_dim == 0) {
- qIdx = k * qDims[0] + i;
- tIdx = k * tDims[0] + j;
- }
- else {
- qIdx = i * qDims[0] + k;
- tIdx = j * tDims[0] + k;
- }
-
- local_dist += hamming_distance(qPtr[qIdx], tPtr[tIdx]);
- }
-
- if (local_dist < best_dist) {
- best_dist = local_dist;
- best_idx = j;
- }
- }
-
- size_t oIdx;
- oIdx = i;
- iPtr[oIdx] = best_idx;
- dPtr[oIdx] = best_dist;
- }
-}
-
-#define INSTANTIATE(T)\
- template void hamming_matcher<T>(Array<uint>& idx, Array<uint>& dist, \
- const Array<T>& query, const Array<T>& train, \
- const uint dist_dim, const uint n_dist);
-
-INSTANTIATE(uchar)
-INSTANTIATE(uint)
-
-}
diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp
new file mode 100644
index 0000000..f706769
--- /dev/null
+++ b/src/backend/cpu/nearest_neighbour.cpp
@@ -0,0 +1,175 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_cpu.hpp>
+#include <handle.hpp>
+
+using af::dim4;
+
+namespace cpu
+{
+
+#if defined(_WIN32) || defined(_MSC_VER)
+
+#include <intrin.h>
+#define __builtin_popcount __popcnt
+
+#endif
+
+template<typename T, typename To, af_match_type dist_type>
+struct dist_op
+{
+ To operator()(T v1, T v2)
+ {
+ return v1 - v2; // Garbage distance
+ }
+};
+
+template<typename T, typename To>
+struct dist_op<T, To, AF_SAD>
+{
+ To operator()(T v1, T v2)
+ {
+ return std::abs((double)v1 - (double)v2);
+ }
+};
+
+template<typename T, typename To>
+struct dist_op<T, To, AF_SSD>
+{
+ To operator()(T v1, T v2)
+ {
+ return (v1 - v2) * (v1 - v2);
+ }
+};
+
+template<typename To>
+struct dist_op<uint, To, AF_SHD>
+{
+ To operator()(uint v1, uint v2)
+ {
+ return __builtin_popcount(v1 ^ v2);
+ }
+};
+
+template<typename To>
+struct dist_op<uintl, To, AF_SHD>
+{
+ To operator()(uintl v1, uintl v2)
+ {
+ return __builtin_popcount(v1 ^ v2);
+ }
+};
+
+template<typename To>
+struct dist_op<uchar, To, AF_SHD>
+{
+ To operator()(uchar v1, uchar v2)
+ {
+ return __builtin_popcount(v1 ^ v2);
+ }
+};
+
+template<typename T, typename To, af_match_type dist_type>
+void nearest_neighbour_(Array<uint>& idx, Array<To>& dist,
+ const Array<T>& query, const Array<T>& train,
+ const uint dist_dim, const uint n_dist)
+{
+ uint sample_dim = (dist_dim == 0) ? 1 : 0;
+ const dim4 qDims = query.dims();
+ const dim4 tDims = train.dims();
+
+ if (n_dist > 1) {
+ CPU_NOT_SUPPORTED();
+ }
+
+ const unsigned distLength = qDims[dist_dim];
+ const unsigned nQuery = qDims[sample_dim];
+ const unsigned nTrain = tDims[sample_dim];
+
+ const dim4 outDims(n_dist, nQuery);
+
+ idx = createEmptyArray<uint>(outDims);
+ dist = createEmptyArray<To >(outDims);
+
+ const T* qPtr = query.get();
+ const T* tPtr = train.get();
+ uint* iPtr = idx.get();
+ To* dPtr = dist.get();
+
+ dist_op<T, To, dist_type> op;
+
+ for (unsigned i = 0; i < nQuery; i++) {
+ To best_dist = limit_max<To>();
+ unsigned best_idx = 0;
+
+ for (unsigned j = 0; j < nTrain; j++) {
+ To local_dist = 0;
+ for (unsigned k = 0; k < distLength; k++) {
+ size_t qIdx, tIdx;
+ if (sample_dim == 0) {
+ qIdx = k * qDims[0] + i;
+ tIdx = k * tDims[0] + j;
+ }
+ else {
+ qIdx = i * qDims[0] + k;
+ tIdx = j * tDims[0] + k;
+ }
+
+ local_dist += op(qPtr[qIdx], tPtr[tIdx]);
+ }
+
+ if (local_dist < best_dist) {
+ best_dist = local_dist;
+ best_idx = j;
+ }
+ }
+
+ size_t oIdx;
+ oIdx = i;
+ iPtr[oIdx] = best_idx;
+ dPtr[oIdx] = best_dist;
+ }
+}
+
+template<typename T, typename To>
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+ const Array<T>& query, const Array<T>& train,
+ const uint dist_dim, const uint n_dist,
+ const af_match_type dist_type)
+{
+ switch(dist_type) {
+ case AF_SAD: nearest_neighbour_<T, To, AF_SAD>(idx, dist, query, train, dist_dim, n_dist); break;
+ case AF_SSD: nearest_neighbour_<T, To, AF_SSD>(idx, dist, query, train, dist_dim, n_dist); break;
+ case AF_SHD: nearest_neighbour_<T, To, AF_SHD>(idx, dist, query, train, dist_dim, n_dist); break;
+ default: AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED);
+ }
+}
+
+#define INSTANTIATE(T, To) \
+ template void nearest_neighbour<T, To>(Array<uint>& idx, Array<To>& dist, \
+ const Array<T>& query, const Array<T>& train, \
+ const uint dist_dim, const uint n_dist, \
+ const af_match_type dist_type);
+
+INSTANTIATE(float , float)
+INSTANTIATE(double, double)
+INSTANTIATE(int , int)
+INSTANTIATE(uint , uint)
+INSTANTIATE(intl , intl)
+INSTANTIATE(uintl , uintl)
+INSTANTIATE(uchar , uint)
+
+INSTANTIATE(uintl, uint) // For Hamming
+
+}
diff --git a/src/backend/cpu/hamming.hpp b/src/backend/cpu/nearest_neighbour.hpp
similarity index 56%
rename from src/backend/cpu/hamming.hpp
rename to src/backend/cpu/nearest_neighbour.hpp
index 3787a68..4cf1dc2 100644
--- a/src/backend/cpu/hamming.hpp
+++ b/src/backend/cpu/nearest_neighbour.hpp
@@ -12,9 +12,10 @@
namespace cpu
{
-template<typename T>
-void hamming_matcher(Array<uint>& idx, Array<uint>& dist,
- const Array<T>& query, const Array<T>& train,
- const uint dist_dim, const uint n_dist);
+template<typename T, typename To>
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+ const Array<T>& query, const Array<T>& train,
+ const uint dist_dim, const uint n_dist,
+ const af_match_type dist_type = AF_SSD);
}
diff --git a/src/backend/cuda/hamming.cu b/src/backend/cuda/hamming.cu
deleted file mode 100644
index 2022675..0000000
--- a/src/backend/cuda/hamming.cu
+++ /dev/null
@@ -1,62 +0,0 @@
-/*******************************************************
- * Copyright (c) 2014, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#include <af/dim4.hpp>
-#include <af/defines.h>
-#include <ArrayInfo.hpp>
-#include <Array.hpp>
-#include <err_cuda.hpp>
-#include <handle.hpp>
-#include <kernel/hamming.hpp>
-#include <kernel/transpose.hpp>
-
-using af::dim4;
-
-namespace cuda
-{
-
-template<typename T>
-void hamming_matcher(Array<uint>& idx, Array<uint>& dist,
- const Array<T>& query, const Array<T>& train,
- const uint dist_dim, const uint n_dist)
-{
- uint sample_dim = (dist_dim == 0) ? 1 : 0;
- const dim4 qDims = query.dims();
- const dim4 tDims = train.dims();
-
- const dim4 outDims(n_dist, qDims[sample_dim]);
-
- idx = createEmptyArray<uint>(outDims);
- dist = createEmptyArray<uint>(outDims);
-
- Array<T> queryT = query;
- Array<T> trainT = train;
-
- if (dist_dim == 0) {
- const dim4 queryTDims = dim4(qDims[1], qDims[0], qDims[2], qDims[3]);
- const dim4 trainTDims = dim4(tDims[1], tDims[0], tDims[2], tDims[3]);
- queryT = createEmptyArray<T>(queryTDims);
- trainT = createEmptyArray<T>(trainTDims);
-
- kernel::transpose<T, false>(queryT, query, query.ndims());
- kernel::transpose<T, false>(trainT, train, train.ndims());
- }
-
- kernel::hamming_matcher<T>(idx, dist, queryT, trainT, 1, n_dist);
-}
-
-#define INSTANTIATE(T)\
- template void hamming_matcher<T>(Array<uint>& idx, Array<uint>& dist, \
- const Array<T>& query, const Array<T>& train, \
- const uint dist_dim, const uint n_dist);
-
-INSTANTIATE(uchar)
-INSTANTIATE(uint)
-
-}
diff --git a/src/backend/cuda/kernel/hamming.hpp b/src/backend/cuda/kernel/nearest_neighbour.hpp
similarity index 77%
rename from src/backend/cuda/kernel/hamming.hpp
rename to src/backend/cuda/kernel/nearest_neighbour.hpp
index 99fcf7f..fa87e9b 100644
--- a/src/backend/cuda/kernel/hamming.hpp
+++ b/src/backend/cuda/kernel/nearest_neighbour.hpp
@@ -13,6 +13,7 @@
#include <debug_cuda.hpp>
#include <memory.hpp>
#include <platform.hpp>
+#include <backend.hpp>
namespace cuda
{
@@ -22,10 +23,65 @@ namespace kernel
static const unsigned THREADS = 256;
-template<typename T, unsigned feat_len, bool use_shmem>
-__global__ void hamming_matcher_unroll(
+template<typename T, typename To, af_match_type dist_type>
+struct dist_op
+{
+ __DH__ To operator()(T v1, T v2)
+ {
+ return v1 - v2;
+ }
+};
+
+template<typename T, typename To>
+struct dist_op<T, To, AF_SAD>
+{
+ __device__ To operator()(T v1, T v2)
+ {
+ return abs((double)v1 - (double)v2);
+ }
+};
+
+template<typename T, typename To>
+struct dist_op<T, To, AF_SSD>
+{
+ __device__ To operator()(T v1, T v2)
+ {
+ return (v1 - v2) * (v1 - v2);
+ }
+};
+
+template<typename To>
+struct dist_op<uint, To, AF_SHD>
+{
+ __device__ To operator()(uint v1, uint v2)
+ {
+ return __popc(v1 ^ v2);
+ }
+};
+
+template<typename To>
+struct dist_op<uintl, To, AF_SHD>
+{
+ __device__ To operator()(uintl v1, uintl v2)
+ {
+ return __popc(v1 ^ v2);
+ }
+};
+
+template<typename To>
+struct dist_op<uchar, To, AF_SHD>
+{
+ __device__ To operator()(uchar v1, uchar v2)
+ {
+ return __popc(v1 ^ v2);
+ }
+};
+
+
+template<typename T, typename To, af_match_type dist_type, unsigned feat_len, bool use_shmem>
+__global__ void nearest_neighbour_unroll(
unsigned* out_idx,
- unsigned* out_dist,
+ To* out_dist,
CParam<T> query,
CParam<T> train,
const unsigned max_dist)
@@ -36,7 +92,7 @@ __global__ void hamming_matcher_unroll(
unsigned f = blockDim.x * blockIdx.x + threadIdx.x;
unsigned tid = threadIdx.x;
- __shared__ unsigned s_dist[THREADS];
+ __shared__ To s_dist[THREADS];
__shared__ unsigned s_idx[THREADS];
extern __shared__ char smem[];
@@ -59,6 +115,8 @@ __global__ void hamming_matcher_unroll(
}
__syncthreads();
+ dist_op<T, To, dist_type> op;
+
for (unsigned j = 0; j < nquery; j++) {
s_dist[tid] = max_dist;
@@ -69,17 +127,17 @@ __global__ void hamming_matcher_unroll(
}
__syncthreads();
- unsigned dist = 0;
+ To dist = 0;
if (valid_feat) {
#pragma unroll
for (unsigned k = 0; k < feat_len; k++) {
// Calculate Hamming distance for 32-bits of descriptor and
// accumulates to dist
if (use_shmem) {
- dist += __popc(s_train[k * blockDim.x + tid] ^ s_query[k]);
+ dist += op(s_train[k * blockDim.x + tid], s_query[k]);
}
else {
- dist += __popc(train.ptr[k * ntrain + f] ^ s_query[k]);
+ dist += op(train.ptr[k * ntrain + f], s_query[k]);
}
}
@@ -159,13 +217,13 @@ __global__ void hamming_matcher_unroll(
}
}
-template<typename T, bool use_shmem>
-__global__ void hamming_matcher(
+template<typename T, typename To, af_match_type dist_type, bool use_shmem>
+__global__ void nearest_neighbour(
unsigned* out_idx,
- unsigned* out_dist,
+ To* out_dist,
CParam<T> query,
CParam<T> train,
- const unsigned max_dist,
+ const To max_dist,
const unsigned feat_len)
{
unsigned nquery = query.dims[0];
@@ -174,7 +232,7 @@ __global__ void hamming_matcher(
unsigned f = blockDim.x * blockIdx.x + threadIdx.x;
unsigned tid = threadIdx.x;
- __shared__ unsigned s_dist[THREADS];
+ __shared__ To s_dist[THREADS];
__shared__ unsigned s_idx[THREADS];
extern __shared__ char smem[];
@@ -196,6 +254,8 @@ __global__ void hamming_matcher(
}
__syncthreads();
+ dist_op<T, To, dist_type> op;
+
for (unsigned j = 0; j < nquery; j++) {
s_dist[tid] = max_dist;
@@ -206,16 +266,16 @@ __global__ void hamming_matcher(
}
__syncthreads();
- unsigned dist = 0;
+ To dist = 0;
if (valid_feat) {
for (unsigned k = 0; k < feat_len; k++) {
// Calculate Hamming distance for 32-bits of descriptor and
// accumulates to dist
if (use_shmem) {
- dist += __popc(s_train[k * blockDim.x + tid] ^ s_query[k]);
+ dist += op(s_train[k * blockDim.x + tid], s_query[k]);
}
else {
- dist += __popc(train.ptr[k * ntrain + f] ^ s_query[k]);
+ dist += op(train.ptr[k * ntrain + f], s_query[k]);
}
}
@@ -295,19 +355,20 @@ __global__ void hamming_matcher(
}
}
+template<typename To>
__global__ void select_matches(
Param<unsigned> idx,
- Param<unsigned> dist,
+ Param<To> dist,
const unsigned* in_idx,
- const unsigned* in_dist,
+ const To* in_dist,
const unsigned nfeat,
const unsigned nelem,
- const unsigned max_dist)
+ const To max_dist)
{
unsigned f = blockIdx.x * blockDim.x + threadIdx.x;
unsigned sid = threadIdx.x * blockDim.y + threadIdx.y;
- __shared__ unsigned s_dist[THREADS];
+ __shared__ To s_dist[THREADS];
__shared__ unsigned s_idx[THREADS];
if (f < nfeat) {
@@ -315,9 +376,9 @@ __global__ void select_matches(
__syncthreads();
for (unsigned i = threadIdx.y; i < nelem; i += blockDim.y) {
- unsigned dist = in_dist[f * nelem + i];
+ To dist = in_dist[f * nelem + i];
- // Copy all best matches previously found in hamming_matcher() to
+ // Copy all best matches previously found in nearest_neighbour() to
// shared memory
if (dist < s_dist[sid]) {
s_dist[sid] = dist;
@@ -329,7 +390,7 @@ __global__ void select_matches(
// Reduce best matches and find the best of them all
for (unsigned i = blockDim.y / 2; i > 0; i >>= 1) {
if (threadIdx.y < i) {
- unsigned dist = s_dist[sid + i];
+ To dist = s_dist[sid + i];
if (dist < s_dist[sid]) {
s_dist[sid] = dist;
s_idx[sid] = s_idx[sid + i];
@@ -346,16 +407,16 @@ __global__ void select_matches(
}
}
-template<typename T>
-void hamming_matcher(Param<uint> idx,
- Param<uint> dist,
- CParam<T> query,
- CParam<T> train,
- const dim_t dist_dim,
- const unsigned n_dist)
+template<typename T, typename To, af_match_type dist_type>
+void nearest_neighbour(Param<uint> idx,
+ Param<To> dist,
+ CParam<T> query,
+ CParam<T> train,
+ const dim_t dist_dim,
+ const unsigned n_dist)
{
const unsigned feat_len = query.dims[dist_dim];
- const unsigned max_dist = feat_len * 8 * sizeof(T);
+ const To max_dist = limit_max<To>();
if (feat_len > THREADS) {
CUDA_NOT_SUPPORTED();
@@ -381,7 +442,7 @@ void hamming_matcher(Param<uint> idx,
unsigned nblk = blocks.x;
unsigned* d_blk_idx = memAlloc<unsigned>(nblk * nquery);
- unsigned* d_blk_dist = memAlloc<unsigned>(nblk * nquery);
+ To* d_blk_dist = memAlloc<To>(nblk * nquery);
// For each query vector, find training vector with smallest Hamming
// distance per CUDA block
@@ -389,35 +450,35 @@ void hamming_matcher(Param<uint> idx,
switch(feat_len) {
// Optimized lengths (faster due to loop unrolling)
case 1:
- hamming_matcher_unroll<T,1,true><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,1,true><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 2:
- hamming_matcher_unroll<T,2,true><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,2,true><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 4:
- hamming_matcher_unroll<T,4,true><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,4,true><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 8:
- hamming_matcher_unroll<T,8,true><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,8,true><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 16:
- hamming_matcher_unroll<T,16,true><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,16,true><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 32:
- hamming_matcher_unroll<T,32,true><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,32,true><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 64:
- hamming_matcher_unroll<T,64,true><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,64,true><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
default:
- hamming_matcher<T,true><<<blocks, threads, smem_sz>>>
+ nearest_neighbour<T,To,dist_type,true><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist, feat_len);
}
}
@@ -425,35 +486,35 @@ void hamming_matcher(Param<uint> idx,
switch(feat_len) {
// Optimized lengths (faster due to loop unrolling)
case 1:
- hamming_matcher_unroll<T,1,false><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,1,false><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 2:
- hamming_matcher_unroll<T,2,false><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,2,false><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 4:
- hamming_matcher_unroll<T,4,false><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,4,false><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 8:
- hamming_matcher_unroll<T,8,false><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,8,false><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 16:
- hamming_matcher_unroll<T,16,false><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,16,false><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 32:
- hamming_matcher_unroll<T,32,false><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,32,false><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
case 64:
- hamming_matcher_unroll<T,64,false><<<blocks, threads, smem_sz>>>
+ nearest_neighbour_unroll<T,To,dist_type,64,false><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist);
break;
default:
- hamming_matcher<T,false><<<blocks, threads, smem_sz>>>
+ nearest_neighbour<T,To,dist_type,false><<<blocks, threads, smem_sz>>>
(d_blk_idx, d_blk_dist, query, train, max_dist, feat_len);
}
}
diff --git a/src/backend/cuda/math.hpp b/src/backend/cuda/math.hpp
index 3f7dcd4..83e3a0c 100644
--- a/src/backend/cuda/math.hpp
+++ b/src/backend/cuda/math.hpp
@@ -105,6 +105,10 @@ namespace cuda
template<> __device__ float limit_min<float>() { return -CUDART_INF_F; }
template<> __device__ double limit_max<double>() { return CUDART_INF; }
template<> __device__ double limit_min<double>() { return -CUDART_INF; }
+ template<> __device__ intl limit_max<intl>() { return 0x7fffffffffffffff; }
+ template<> __device__ intl limit_min<intl>() { return 0x8000000000000000; }
+ template<> __device__ uintl limit_max<uintl>() { return 0xffffffffffffffff; }
+ template<> __device__ uintl limit_min<uintl>() { return 0; }
#endif
#define upcast cuComplexFloatToDouble
diff --git a/src/backend/cuda/nearest_neighbour.cu b/src/backend/cuda/nearest_neighbour.cu
new file mode 100644
index 0000000..1899c9d
--- /dev/null
+++ b/src/backend/cuda/nearest_neighbour.cu
@@ -0,0 +1,79 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_cuda.hpp>
+#include <handle.hpp>
+#include <kernel/nearest_neighbour.hpp>
+#include <kernel/transpose.hpp>
+
+using af::dim4;
+
+namespace cuda
+{
+
+template<typename T, typename To>
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+ const Array<T>& query, const Array<T>& train,
+ const uint dist_dim, const uint n_dist,
+ const af_match_type dist_type)
+{
+ uint sample_dim = (dist_dim == 0) ? 1 : 0;
+ const dim4 qDims = query.dims();
+ const dim4 tDims = train.dims();
+
+ const dim4 outDims(n_dist, qDims[sample_dim]);
+
+ idx = createEmptyArray<uint>(outDims);
+ dist = createEmptyArray<To>(outDims);
+
+ Array<T> queryT = query;
+ Array<T> trainT = train;
+
+ if (dist_dim == 0) {
+ const dim4 queryTDims = dim4(qDims[1], qDims[0], qDims[2], qDims[3]);
+ const dim4 trainTDims = dim4(tDims[1], tDims[0], tDims[2], tDims[3]);
+ queryT = createEmptyArray<T>(queryTDims);
+ trainT = createEmptyArray<T>(trainTDims);
+
+ kernel::transpose<T, false>(queryT, query, query.ndims());
+ kernel::transpose<T, false>(trainT, train, train.ndims());
+ }
+
+ switch(dist_type) {
+ case AF_SAD: kernel::nearest_neighbour<T, To, AF_SAD>(idx, dist, queryT, trainT, 1, n_dist);
+ break;
+ case AF_SSD: kernel::nearest_neighbour<T, To, AF_SSD>(idx, dist, queryT, trainT, 1, n_dist);
+ break;
+ case AF_SHD: kernel::nearest_neighbour<T, To, AF_SHD>(idx, dist, queryT, trainT, 1, n_dist);
+ break;
+ default: AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED);
+ }
+}
+
+#define INSTANTIATE(T, To) \
+ template void nearest_neighbour<T, To>(Array<uint>& idx, Array<To>& dist, \
+ const Array<T>& query, const Array<T>& train, \
+ const uint dist_dim, const uint n_dist, \
+ const af_match_type dist_type);
+
+INSTANTIATE(float , float)
+INSTANTIATE(double, double)
+INSTANTIATE(int , int)
+INSTANTIATE(uint , uint)
+INSTANTIATE(intl , intl)
+INSTANTIATE(uintl , uintl)
+INSTANTIATE(uchar , uint)
+
+INSTANTIATE(uintl, uint) // For Hamming
+
+}
diff --git a/src/backend/cuda/hamming.hpp b/src/backend/cuda/nearest_neighbour.hpp
similarity index 57%
rename from src/backend/cuda/hamming.hpp
rename to src/backend/cuda/nearest_neighbour.hpp
index 37d3944..443f972 100644
--- a/src/backend/cuda/hamming.hpp
+++ b/src/backend/cuda/nearest_neighbour.hpp
@@ -14,9 +14,10 @@ using af::features;
namespace cuda
{
-template<typename T>
-void hamming_matcher(Array<uint>& idx, Array<uint>& dist,
- const Array<T>& query, const Array<T>& train,
- const uint dist_dim, const uint n_dist);
+template<typename T, typename To>
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+ const Array<T>& query, const Array<T>& train,
+ const uint dist_dim, const uint n_dist,
+ const af_match_type dist_type = AF_SSD);
}
diff --git a/src/backend/opencl/hamming.cpp b/src/backend/opencl/hamming.cpp
deleted file mode 100644
index 5d4bd59..0000000
--- a/src/backend/opencl/hamming.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/*******************************************************
- * Copyright (c) 2014, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#include <af/dim4.hpp>
-#include <af/defines.h>
-#include <ArrayInfo.hpp>
-#include <Array.hpp>
-#include <err_opencl.hpp>
-#include <handle.hpp>
-#include <kernel/hamming.hpp>
-#include <kernel/transpose.hpp>
-
-using af::dim4;
-using cl::Device;
-
-namespace opencl
-{
-
-static const unsigned THREADS = 256;
-
-template<typename T>
-void hamming_matcher(Array<uint>& idx, Array<uint>& dist,
- const Array<T>& query, const Array<T>& train,
- const uint dist_dim, const uint n_dist)
-{
- uint sample_dim = (dist_dim == 0) ? 1 : 0;
- const dim4 qDims = query.dims();
- const dim4 tDims = train.dims();
-
- const dim4 outDims(n_dist, qDims[sample_dim]);
-
- idx = createEmptyArray<uint>(outDims);
- dist = createEmptyArray<uint>(outDims);
-
- const unsigned feat_len = qDims[dist_dim];
-
- // Determine maximum feat_len capable of using shared memory (faster)
- cl_ulong avail_lmem = getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
- size_t lmem_predef = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
- size_t ltrain_sz = THREADS * feat_len * sizeof(T);
- bool use_lmem = (avail_lmem >= (lmem_predef + ltrain_sz)) ? true : false;
- size_t lmem_sz = (use_lmem) ? lmem_predef + ltrain_sz : lmem_predef;
-
- Array<T> queryT = query;
- Array<T> trainT = train;
-
- if (dist_dim == 0) {
- const dim4 queryTDims = dim4(qDims[1], qDims[0], qDims[2], qDims[3]);
- const dim4 trainTDims = dim4(tDims[1], tDims[0], tDims[2], tDims[3]);
- queryT = createEmptyArray<T>(queryTDims);
- trainT = createEmptyArray<T>(trainTDims);
-
- bool queryIs32Multiple = false;
- if (qDims[0] % 32 == 0 && qDims[1] % 32 == 0)
- queryIs32Multiple = true;
-
- bool trainIs32Multiple = false;
- if (tDims[0] % 32 == 0 && tDims[1] % 32 == 0)
- trainIs32Multiple = true;
-
- if (queryIs32Multiple)
- kernel::transpose<T, false, true >(queryT, query);
- else
- kernel::transpose<T, false, false>(queryT, query);
-
- if (trainIs32Multiple)
- kernel::transpose<T, false, true >(trainT, train);
- else
- kernel::transpose<T, false, false>(trainT, train);
- }
-
- if (use_lmem) {
- switch (feat_len) {
- case 1:
- kernel::hamming_matcher<T, true , 1 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- case 2:
- kernel::hamming_matcher<T, true , 2 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- case 4:
- kernel::hamming_matcher<T, true , 4 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- case 8:
- kernel::hamming_matcher<T, true , 8 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- case 16:
- kernel::hamming_matcher<T, true , 16>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- case 32:
- kernel::hamming_matcher<T, true , 32>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- case 64:
- kernel::hamming_matcher<T, true , 64>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- default:
- kernel::hamming_matcher<T, true , 0 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- }
- } else {
- switch (feat_len) {
- case 1:
- kernel::hamming_matcher<T, false, 1 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- case 2:
- kernel::hamming_matcher<T, false, 2 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- case 4:
- kernel::hamming_matcher<T, false, 4 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- case 8:
- kernel::hamming_matcher<T, false, 8 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- case 16:
- kernel::hamming_matcher<T, false, 16>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- case 32:
- kernel::hamming_matcher<T, false, 32>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- case 64:
- kernel::hamming_matcher<T, false, 64>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- default:
- kernel::hamming_matcher<T, false, 0 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- break;
- }
- }
-}
-
-#define INSTANTIATE(T)\
- template void hamming_matcher<T>(Array<uint>& idx, Array<uint>& dist, \
- const Array<T>& query, const Array<T>& train, \
- const uint dist_dim, const uint n_dist);
-
-INSTANTIATE(uchar)
-INSTANTIATE(uint)
-
-}
diff --git a/src/backend/opencl/kernel/hamming.cl b/src/backend/opencl/kernel/nearest_neighbour.cl
similarity index 89%
rename from src/backend/opencl/kernel/hamming.cl
rename to src/backend/opencl/kernel/nearest_neighbour.cl
index 68ce6fd..024fdc6 100644
--- a/src/backend/opencl/kernel/hamming.cl
+++ b/src/backend/opencl/kernel/nearest_neighbour.cl
@@ -20,15 +20,39 @@ __inline unsigned popcount(unsigned x)
}
#endif
+#ifdef USE_DOUBLE
+To _sad_(T v1, T v2)
+{
+ return fabs(v1 - v2);
+}
+#else
+To _sad_(T v1, T v2)
+{
+ return fabs((float)v1 - (float)v2);
+}
+#endif
+
+To _ssd_(T v1, T v2)
+{
+ return (v1 - v2) * (v1 - v2);
+}
+
+#ifdef __SHD__
+unsigned _shd_(T v1, T v2)
+{
+ return popcount(v1 ^ v2);
+}
+#endif
+
__kernel
-void hamming_matcher_unroll(
+void nearest_neighbour_unroll(
__global unsigned* out_idx,
- __global unsigned* out_dist,
+ __global To* out_dist,
__global const T* query,
KParam qInfo,
__global const T* train,
KParam tInfo,
- const unsigned max_dist,
+ const To max_dist,
__local T* lmem)
{
unsigned nquery = qInfo.dims[0];
@@ -37,7 +61,7 @@ void hamming_matcher_unroll(
unsigned f = get_global_id(0);
unsigned tid = get_local_id(0);
- __local unsigned l_dist[THREADS];
+ __local To l_dist[THREADS];
__local unsigned l_idx[THREADS];
__local T* l_query = lmem;
@@ -69,16 +93,16 @@ void hamming_matcher_unroll(
}
barrier(CLK_LOCAL_MEM_FENCE);
- unsigned dist = 0;
+ To dist = 0;
if (valid_feat) {
#pragma unroll
for (int k = 0; k < (int)FEAT_LEN; k++) {
// Calculate Hamming distance for 32-bits of descriptor and
// accumulates to dist
#ifdef USE_LOCAL_MEM
- dist += popcount(l_train[k * get_local_size(0) + tid] ^ l_query[k]);
+ dist += DISTOP(l_train[k * get_local_size(0) + tid], l_query[k]);
#else
- dist += popcount(train[k * ntrain + f] ^ l_query[k]);
+ dist += DISTOP(train[k * ntrain + f], l_query[k]);
#endif
}
}
@@ -161,14 +185,14 @@ void hamming_matcher_unroll(
}
__kernel
-void hamming_matcher(
+void nearest_neighbour(
__global unsigned* out_idx,
- __global unsigned* out_dist,
+ __global To* out_dist,
__global const T* query,
KParam qInfo,
__global const T* train,
KParam tInfo,
- const unsigned max_dist,
+ const To max_dist,
const unsigned feat_len,
__local T* lmem)
{
@@ -178,7 +202,7 @@ void hamming_matcher(
unsigned f = get_global_id(0);
unsigned tid = get_local_id(0);
- __local unsigned l_dist[THREADS];
+ __local To l_dist[THREADS];
__local unsigned l_idx[THREADS];
__local T* l_query = lmem;
@@ -209,15 +233,15 @@ void hamming_matcher(
}
barrier(CLK_LOCAL_MEM_FENCE);
- unsigned dist = 0;
+ To dist = 0;
if (valid_feat) {
for (int k = 0; k < (int)feat_len; k++) {
// Calculate Hamming distance for 32-bits of descriptor and
// accumulates to dist
#ifdef USE_LOCAL_MEM
- dist += popcount(l_train[k * get_local_size(0) + tid] ^ l_query[k]);
+ dist += DISTOP(l_train[k * get_local_size(0) + tid], l_query[k]);
#else
- dist += popcount(train[k * ntrain + f] ^ l_query[k]);
+ dist += DISTOP(train[k * ntrain + f], l_query[k]);
#endif
}
}
@@ -302,18 +326,18 @@ void hamming_matcher(
__kernel
void select_matches(
__global unsigned* idx,
- __global unsigned* dist,
+ __global To* dist,
__global const unsigned* in_idx,
- __global const unsigned* in_dist,
+ __global const To* in_dist,
const unsigned nfeat,
const unsigned nelem,
- const unsigned max_dist)
+ const To max_dist)
{
unsigned f = get_global_id(0);
unsigned lsz1 = get_local_size(1);
unsigned sid = get_local_id(0) * lsz1 + get_local_id(1);
- __local unsigned l_dist[THREADS];
+ __local To l_dist[THREADS];
__local unsigned l_idx[THREADS];
bool valid_feat = (f < nfeat);
@@ -327,9 +351,9 @@ void select_matches(
for (unsigned i = get_local_id(1); i < nelem_max; i += get_local_size(1)) {
if (valid_feat && i < nelem) {
- unsigned dist = in_dist[f * nelem + i];
+ To dist = in_dist[f * nelem + i];
- // Copy all best matches previously found in hamming_matcher() to
+ // Copy all best matches previously found in nearest_neighbour() to
// shared memory
if (dist < l_dist[sid]) {
l_dist[sid] = dist;
@@ -342,7 +366,7 @@ void select_matches(
for (unsigned i = get_local_size(1) / 2; i > 0; i >>= 1) {
if (get_local_id(1) < i) {
if (valid_feat) {
- unsigned dist = l_dist[sid + i];
+ To dist = l_dist[sid + i];
if (dist < l_dist[sid]) {
l_dist[sid] = dist;
l_idx[sid] = l_idx[sid + i];
diff --git a/src/backend/opencl/kernel/hamming.hpp b/src/backend/opencl/kernel/nearest_neighbour.hpp
similarity index 70%
rename from src/backend/opencl/kernel/hamming.hpp
rename to src/backend/opencl/kernel/nearest_neighbour.hpp
index b89ee15..601dcc3 100644
--- a/src/backend/opencl/kernel/hamming.hpp
+++ b/src/backend/opencl/kernel/nearest_neighbour.hpp
@@ -12,8 +12,9 @@
#include <dispatch.hpp>
#include <err_opencl.hpp>
#include <debug_opencl.hpp>
-#include <kernel_headers/hamming.hpp>
+#include <kernel_headers/nearest_neighbour.hpp>
#include <memory.hpp>
+#include <math.hpp>
using cl::LocalSpaceArg;
@@ -25,18 +26,18 @@ namespace kernel
static const unsigned THREADS = 256;
-template<typename T, bool use_lmem, unsigned unroll_len>
-void hamming_matcher(Param idx,
- Param dist,
- Param query,
- Param train,
- const dim_t dist_dim,
- const unsigned n_dist,
- const size_t lmem_sz)
+template<typename T, typename To, af_match_type dist_type, bool use_lmem, unsigned unroll_len>
+void nearest_neighbour(Param idx,
+ Param dist,
+ Param query,
+ Param train,
+ const dim_t dist_dim,
+ const unsigned n_dist,
+ const size_t lmem_sz)
{
try {
static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
- static Program hammingProgs[DeviceManager::MAX_DEVICES];
+ static Program nearest_neighbourProgs[DeviceManager::MAX_DEVICES];
static Kernel huKernel[DeviceManager::MAX_DEVICES];
static Kernel hmKernel[DeviceManager::MAX_DEVICES];
static Kernel smKernel[DeviceManager::MAX_DEVICES];
@@ -44,7 +45,7 @@ void hamming_matcher(Param idx,
int device = getActiveDeviceId();
const unsigned feat_len = query.info.dims[dist_dim];
- const unsigned max_dist = feat_len * 8 * sizeof(T);
+ const To max_dist = limit_max<To>();
if (feat_len > THREADS) {
OPENCL_NOT_SUPPORTED();
@@ -54,20 +55,29 @@ void hamming_matcher(Param idx,
std::ostringstream options;
options << " -D T=" << dtype_traits<T>::getName()
+ << " -D To=" << dtype_traits<To>::getName()
<< " -D THREADS=" << THREADS
<< " -D FEAT_LEN=" << unroll_len;
+ switch(dist_type) {
+ case AF_SAD: options <<" -D DISTOP=_sad_"; break;
+ case AF_SSD: options <<" -D DISTOP=_ssd_"; break;
+ case AF_SHD: options <<" -D DISTOP=_shd_ -D __SHD__";
+ break;
+ default: break;
+ }
+
if (use_lmem)
options << " -D USE_LOCAL_MEM";
- buildProgram(hammingProgs[device],
- hamming_cl,
- hamming_cl_len,
+ buildProgram(nearest_neighbourProgs[device],
+ nearest_neighbour_cl,
+ nearest_neighbour_cl_len,
options.str());
- huKernel[device] = Kernel(hammingProgs[device], "hamming_matcher_unroll");
- hmKernel[device] = Kernel(hammingProgs[device], "hamming_matcher");
- smKernel[device] = Kernel(hammingProgs[device], "select_matches");
+ huKernel[device] = Kernel(nearest_neighbourProgs[device], "nearest_neighbour_unroll");
+ hmKernel[device] = Kernel(nearest_neighbourProgs[device], "nearest_neighbour");
+ smKernel[device] = Kernel(nearest_neighbourProgs[device], "select_matches");
});
const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;
@@ -80,7 +90,7 @@ void hamming_matcher(Param idx,
const NDRange global(nblk * THREADS, 1);
cl::Buffer *d_blk_idx = bufferAlloc(nblk * nquery * sizeof(unsigned));
- cl::Buffer *d_blk_dist = bufferAlloc(nblk * nquery * sizeof(unsigned));
+ cl::Buffer *d_blk_dist = bufferAlloc(nblk * nquery * sizeof(To));
// For each query vector, find training vector with smallest Hamming
// distance per CUDA block
@@ -88,7 +98,7 @@ void hamming_matcher(Param idx,
auto huOp = make_kernel<Buffer, Buffer,
Buffer, KParam,
Buffer, KParam,
- const unsigned,
+ const To,
LocalSpaceArg> (huKernel[device]);
huOp(EnqueueArgs(getQueue(), global, local),
@@ -100,7 +110,7 @@ void hamming_matcher(Param idx,
auto hmOp = make_kernel<Buffer, Buffer,
Buffer, KParam,
Buffer, KParam,
- const unsigned, const unsigned,
+ const To, const unsigned,
LocalSpaceArg> (hmKernel[device]);
hmOp(EnqueueArgs(getQueue(), global, local),
@@ -117,7 +127,7 @@ void hamming_matcher(Param idx,
// best match
auto smOp = make_kernel<Buffer, Buffer, Buffer, Buffer,
const unsigned, const unsigned,
- const unsigned> (smKernel[device]);
+ const To> (smKernel[device]);
smOp(EnqueueArgs(getQueue(), global_sm, local_sm),
*idx.data, *dist.data,
diff --git a/src/backend/opencl/nearest_neighbour.cpp b/src/backend/opencl/nearest_neighbour.cpp
new file mode 100644
index 0000000..cd86b7f
--- /dev/null
+++ b/src/backend/opencl/nearest_neighbour.cpp
@@ -0,0 +1,165 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_opencl.hpp>
+#include <handle.hpp>
+#include <kernel/nearest_neighbour.hpp>
+#include <kernel/transpose.hpp>
+
+using af::dim4;
+using cl::Device;
+
+namespace opencl
+{
+
+static const unsigned THREADS = 256;
+
+template<typename T, typename To, af_match_type dist_type>
+void nearest_neighbour_(Array<uint>& idx, Array<To>& dist,
+ const Array<T>& query, const Array<T>& train,
+ const uint dist_dim, const uint n_dist)
+{
+ uint sample_dim = (dist_dim == 0) ? 1 : 0;
+ const dim4 qDims = query.dims();
+ const dim4 tDims = train.dims();
+
+ const dim4 outDims(n_dist, qDims[sample_dim]);
+
+ idx = createEmptyArray<uint>(outDims);
+ dist = createEmptyArray<To>(outDims);
+
+ const unsigned feat_len = qDims[dist_dim];
+
+ // Determine maximum feat_len capable of using shared memory (faster)
+ cl_ulong avail_lmem = getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+ size_t lmem_predef = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
+ size_t ltrain_sz = THREADS * feat_len * sizeof(T);
+ bool use_lmem = (avail_lmem >= (lmem_predef + ltrain_sz)) ? true : false;
+ size_t lmem_sz = (use_lmem) ? lmem_predef + ltrain_sz : lmem_predef;
+
+ Array<T> queryT = query;
+ Array<T> trainT = train;
+
+ if (dist_dim == 0) {
+ const dim4 queryTDims = dim4(qDims[1], qDims[0], qDims[2], qDims[3]);
+ const dim4 trainTDims = dim4(tDims[1], tDims[0], tDims[2], tDims[3]);
+ queryT = createEmptyArray<T>(queryTDims);
+ trainT = createEmptyArray<T>(trainTDims);
+
+ bool queryIs32Multiple = false;
+ if (qDims[0] % 32 == 0 && qDims[1] % 32 == 0)
+ queryIs32Multiple = true;
+
+ bool trainIs32Multiple = false;
+ if (tDims[0] % 32 == 0 && tDims[1] % 32 == 0)
+ trainIs32Multiple = true;
+
+ if (queryIs32Multiple)
+ kernel::transpose<T, false, true >(queryT, query);
+ else
+ kernel::transpose<T, false, false>(queryT, query);
+
+ if (trainIs32Multiple)
+ kernel::transpose<T, false, true >(trainT, train);
+ else
+ kernel::transpose<T, false, false>(trainT, train);
+ }
+
+ if (use_lmem) {
+ switch (feat_len) {
+ case 1:
+ kernel::nearest_neighbour<T, To, dist_type, true , 1 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ case 2:
+ kernel::nearest_neighbour<T, To, dist_type, true , 2 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ case 4:
+ kernel::nearest_neighbour<T, To, dist_type, true , 4 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ case 8:
+ kernel::nearest_neighbour<T, To, dist_type, true , 8 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ case 16:
+ kernel::nearest_neighbour<T, To, dist_type, true , 16>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ case 32:
+ kernel::nearest_neighbour<T, To, dist_type, true , 32>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ case 64:
+ kernel::nearest_neighbour<T, To, dist_type, true , 64>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ default:
+ kernel::nearest_neighbour<T, To, dist_type, true , 0 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ }
+ } else {
+ switch (feat_len) {
+ case 1:
+ kernel::nearest_neighbour<T, To, dist_type, false, 1 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ case 2:
+ kernel::nearest_neighbour<T, To, dist_type, false, 2 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ case 4:
+ kernel::nearest_neighbour<T, To, dist_type, false, 4 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ case 8:
+ kernel::nearest_neighbour<T, To, dist_type, false, 8 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ case 16:
+ kernel::nearest_neighbour<T, To, dist_type, false, 16>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ case 32:
+ kernel::nearest_neighbour<T, To, dist_type, false, 32>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ case 64:
+ kernel::nearest_neighbour<T, To, dist_type, false, 64>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ default:
+ kernel::nearest_neighbour<T, To, dist_type, false, 0 >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
+ break;
+ }
+ }
+}
+
+template<typename T, typename To>
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+ const Array<T>& query, const Array<T>& train,
+ const uint dist_dim, const uint n_dist,
+ const af_match_type dist_type)
+{
+ switch(dist_type) {
+ case AF_SAD: nearest_neighbour_<T, To, AF_SAD>(idx, dist, query, train, dist_dim, n_dist); break;
+ case AF_SSD: nearest_neighbour_<T, To, AF_SSD>(idx, dist, query, train, dist_dim, n_dist); break;
+ case AF_SHD: nearest_neighbour_<T, To, AF_SHD>(idx, dist, query, train, dist_dim, n_dist); break;
+ default: AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED);
+ }
+}
+
+#define INSTANTIATE(T, To) \
+ template void nearest_neighbour<T, To>(Array<uint>& idx, Array<To>& dist, \
+ const Array<T>& query, const Array<T>& train, \
+ const uint dist_dim, const uint n_dist, \
+ const af_match_type dist_type);
+
+INSTANTIATE(float , float)
+INSTANTIATE(double, double)
+INSTANTIATE(int , int)
+INSTANTIATE(uint , uint)
+INSTANTIATE(intl , intl)
+INSTANTIATE(uintl , uintl)
+INSTANTIATE(uchar , uint)
+
+INSTANTIATE(uintl, uint) // For Hamming
+
+}
diff --git a/src/backend/opencl/hamming.hpp b/src/backend/opencl/nearest_neighbour.hpp
similarity index 58%
rename from src/backend/opencl/hamming.hpp
rename to src/backend/opencl/nearest_neighbour.hpp
index 18689ef..787e4a4 100644
--- a/src/backend/opencl/hamming.hpp
+++ b/src/backend/opencl/nearest_neighbour.hpp
@@ -14,9 +14,10 @@ using af::features;
namespace opencl
{
-template<typename T>
-void hamming_matcher(Array<uint>& idx, Array<uint>& dist,
- const Array<T>& query, const Array<T>& train,
- const uint dist_dim, const uint n_dist);
+template<typename T, typename To>
+void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
+ const Array<T>& query, const Array<T>& train,
+ const uint dist_dim, const uint n_dist,
+ const af_match_type dist_type = AF_SSD);
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list