[arrayfire] 103/248: Added OpenCL implementation of GLOH
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Nov 17 15:54:10 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch dfsg-clean
in repository arrayfire.
commit de6d4016e022a3631cdabc602ecc0d99c7a2c3e4
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date: Wed Oct 7 16:16:48 2015 -0400
Added OpenCL implementation of GLOH
---
src/backend/opencl/kernel/sift_nonfree.cl | 225 ++++++++++++++++++++++++++++-
src/backend/opencl/kernel/sift_nonfree.hpp | 56 +++++--
src/backend/opencl/sift.cpp | 27 ++--
src/backend/opencl/sift.hpp | 3 +-
4 files changed, 280 insertions(+), 31 deletions(-)
diff --git a/src/backend/opencl/kernel/sift_nonfree.cl b/src/backend/opencl/kernel/sift_nonfree.cl
index 7a65ffa..f62ff37 100644
--- a/src/backend/opencl/kernel/sift_nonfree.cl
+++ b/src/backend/opencl/kernel/sift_nonfree.cl
@@ -100,6 +100,8 @@
// factor used to convert floating-point descriptor to unsigned char
#define INT_DESCR_FCTR 512.f
+__constant float GLOHRadii[3] = {6.f, 11.f, 15.f};
+
#define PI_VAL 3.14159265358979323846f
void gaussianElimination(float* A, float* b, float* x, const int n)
@@ -193,6 +195,58 @@ inline void normalizeDesc(
barrier(CLK_LOCAL_MEM_FENCE);
}
+inline void normalizeGLOHDesc(
+ __local float* desc,
+ __local float* accum,
+ const int histlen,
+ int lid_x,
+ int lid_y,
+ int lsz_x)
+{
+ for (int i = lid_x; i < histlen; i += lsz_x)
+ accum[i] = desc[lid_y*histlen+i]*desc[lid_y*histlen+i];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ float sum = 0.0f;
+ for (int i = 0; i < histlen; i++)
+ sum += desc[lid_y*histlen+i]*desc[lid_y*histlen+i];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (lid_x < 128)
+ accum[lid_x] += accum[lid_x+128];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (lid_x < 64)
+ accum[lid_x] += accum[lid_x+64];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (lid_x < 32)
+ accum[lid_x] += accum[lid_x+32];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (lid_x < 16)
+ // GLOH is 272-dimensional, accumulating last 16 descriptors
+ accum[lid_x] += accum[lid_x+16] + accum[lid_x+256];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (lid_x < 8)
+ accum[lid_x] += accum[lid_x+8];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (lid_x < 4)
+ accum[lid_x] += accum[lid_x+4];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (lid_x < 2)
+ accum[lid_x] += accum[lid_x+2];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (lid_x < 1)
+ accum[lid_x] += accum[lid_x+1];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ float len_sq = accum[0];
+ float len_inv = 1.0f / sqrt(len_sq);
+
+ for (int i = lid_x; i < histlen; i += lsz_x) {
+ desc[lid_y*histlen+i] *= len_inv;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+}
+
__kernel void sub(
__global T* out,
__global const T* in,
@@ -689,10 +743,8 @@ __kernel void computeDescriptor(
__local float* desc = l_mem;
__local float* accum = l_mem + desc_len * histsz;
- const int histlen = d*d*n;
-
- for (int i = lid_x; i < histlen*histsz; i += lsz_x)
- desc[lid_y*histlen+i] = 0.f;
+ for (int i = lid_x; i < desc_len*histsz; i += lsz_x)
+ desc[lid_y*desc_len+i] = 0.f;
barrier(CLK_LOCAL_MEM_FENCE);
if (f < total_feat) {
@@ -787,13 +839,13 @@ __kernel void computeDescriptor(
desc[l] += desc[l+desc_len];
barrier(CLK_LOCAL_MEM_FENCE);
- normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
+ normalizeDesc(desc, accum, desc_len, lid_x, lid_y, lsz_x);
for (int i = lid_x; i < d*d*n; i += lsz_x)
desc[lid_y*desc_len+i] = min(desc[lid_y*desc_len+i], DESCR_MAG_THR);
barrier(CLK_LOCAL_MEM_FENCE);
- normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
+ normalizeDesc(desc, accum, desc_len, lid_x, lid_y, lsz_x);
if (f < total_feat) {
// Calculate final descriptor values
@@ -802,4 +854,165 @@ __kernel void computeDescriptor(
}
}
+__kernel void computeGLOHDescriptor(
+ __global float* desc_out,
+ const unsigned desc_len,
+ const unsigned histsz,
+ __global const float* x_in,
+ __global const float* y_in,
+ __global const unsigned* layer_in,
+ __global const float* response_in,
+ __global const float* size_in,
+ __global const float* ori_in,
+ const unsigned total_feat,
+ __global const T* gauss_octave,
+ KParam iGauss,
+ const int d,
+ const unsigned rb,
+ const unsigned ab,
+ const unsigned hb,
+ const float scale,
+ const int n_layers,
+ __local float* l_mem)
+{
+ const int lid_x = get_local_id(0);
+ const int lid_y = get_local_id(1);
+ const int lsz_x = get_local_size(0);
+
+ const int f = get_global_id(1);
+
+ __local float* desc = l_mem;
+ __local float* accum = l_mem + desc_len * histsz;
+
+ for (int i = lid_x; i < desc_len*histsz; i += lsz_x)
+ desc[lid_y*desc_len+i] = 0.f;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (f < total_feat) {
+ const unsigned layer = layer_in[f];
+ float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
+ ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
+ const float size = size_in[f];
+ const int fx = round(x_in[f] * scale);
+ const int fy = round(y_in[f] * scale);
+
+ // Points img to correct Gaussian pyramid layer
+ const int dim0 = iGauss.dims[0];
+ const int dim1 = iGauss.dims[1];
+ __global const T* img = gauss_octave + (layer * dim0 * dim1);
+
+ float cos_t = cos(ori);
+ float sin_t = sin(ori);
+ float hist_bins_per_rad = hb / (PI_VAL * 2.f);
+ float polar_bins_per_rad = ab / (PI_VAL * 2.f);
+ float exp_denom = GLOHRadii[rb-1] * 0.5f;
+
+ float hist_width = DESCR_SCL_FCTR * size * scale * 0.5f;
+
+ // Keep same descriptor radius used for SIFT
+ int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
+
+ // Alternative radius size calculation, changing the radius weight
+ // (rw) in the range of 0.25f-0.75f gives different results,
+ // increasing it tends to show a better recall rate but with a
+ // smaller amount of correct matches
+ //float rw = 0.5f;
+ //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f;
+
+ int len = radius*2+1;
+ const int hist_off = (lid_x % histsz) * desc_len;
+
+ // Calculate orientation histogram
+ for (int l = lid_x; l < len*len; l += lsz_x) {
+ int i = l / len - radius;
+ int j = l % len - radius;
+
+ int y = fy + i;
+ int x = fx + j;
+
+ float x_rot = (j * cos_t - i * sin_t);
+ float y_rot = (j * sin_t + i * cos_t);
+
+ float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1];
+ float theta = atan2(y_rot, x_rot);
+ while (theta < 0.0f)
+ theta += PI_VAL*2;
+ while (theta >= PI_VAL*2)
+ theta -= PI_VAL*2;
+
+ float tbin = theta * polar_bins_per_rad;
+ float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] :
+ ((r < GLOHRadii[1]) ? 1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) :
+ min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON));
+
+ if (r <= GLOHRadii[rb-1] &&
+ y > 0 && y < dim0 - 1 && x > 0 && x < dim1 - 1) {
+ float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+ float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+
+ float grad_mag = sqrt(dx*dx + dy*dy);
+ float grad_ori = atan2(dy, dx) - ori;
+ while (grad_ori < 0.0f)
+ grad_ori += PI_VAL*2;
+ while (grad_ori >= PI_VAL*2)
+ grad_ori -= PI_VAL*2;
+
+ float w = exp(-r / exp_denom);
+ float obin = grad_ori * hist_bins_per_rad;
+ float mag = grad_mag*w;
+
+ int t0 = floor(tbin);
+ int r0 = floor(rbin);
+ int o0 = floor(obin);
+ tbin -= t0;
+ rbin -= r0;
+ obin -= o0;
+
+ for (int rl = 0; rl <= 1; rl++) {
+ int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl);
+ float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin);
+ if (rb >= 0 && rb <= 2) {
+ for (int tl = 0; tl <= 1; tl++) {
+ int tb = (t0 + tl) % ab;
+ float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin);
+ for (int ol = 0; ol <= 1; ol++) {
+ int ob = (o0 + ol) % hb;
+ float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin);
+ unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob;
+ fatomic_add(&desc[hist_off + lid_y*desc_len + idx], v_o);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Combine histograms (reduces previous atomicAdd overhead)
+ for (int l = lid_x; l < desc_len*4; l += lsz_x)
+ desc[l] += desc[l+4*desc_len];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for (int l = lid_x; l < desc_len*2; l += lsz_x)
+ desc[l ] += desc[l+2*desc_len];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for (int l = lid_x; l < desc_len; l += lsz_x)
+ desc[l] += desc[l+desc_len];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ normalizeGLOHDesc(desc, accum, desc_len, lid_x, lid_y, lsz_x);
+
+ for (int i = lid_x; i < desc_len; i += lsz_x)
+ desc[lid_y*desc_len+i] = min(desc[lid_y*desc_len+i], DESCR_MAG_THR);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ normalizeGLOHDesc(desc, accum, desc_len, lid_x, lid_y, lsz_x);
+
+ if (f < total_feat) {
+ // Calculate final descriptor values
+ for (int k = lid_x; k < desc_len; k += lsz_x)
+ desc_out[f*desc_len+k] = round(min(255.f, desc[lid_y*desc_len+k] * INT_DESCR_FCTR));
+ }
+}
+
#undef IPTR
diff --git a/src/backend/opencl/kernel/sift_nonfree.hpp b/src/backend/opencl/kernel/sift_nonfree.hpp
index bc65516..0761d2a 100644
--- a/src/backend/opencl/kernel/sift_nonfree.hpp
+++ b/src/backend/opencl/kernel/sift_nonfree.hpp
@@ -130,6 +130,15 @@ static const int DescrHistBins = 8;
// default number of bins in histogram for orientation assignment
static const int OriHistBins = 36;
+// Number of GLOH bins in radial direction
+static const unsigned GLOHRadialBins = 3;
+
+// Number of GLOH angular bins (excluding the inner-most radial section)
+static const unsigned GLOHAngularBins = 8;
+
+// Number of GLOH bins per histogram in descriptor
+static const unsigned GLOHHistBins = 16;
+
static const float PI_VAL = 3.14159265358979323846f;
template<typename T>
@@ -404,7 +413,8 @@ void sift(unsigned* out_feat,
const float init_sigma,
const bool double_input,
const float img_scale,
- const float feature_ratio)
+ const float feature_ratio,
+ const bool compute_GLOH)
{
try {
static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
@@ -415,6 +425,7 @@ void sift(unsigned* out_feat,
static std::map<int, Kernel*> coKernel;
static std::map<int, Kernel*> rdKernel;
static std::map<int, Kernel*> cdKernel;
+ static std::map<int, Kernel*> cgKernel;
int device = getActiveDeviceId();
@@ -438,6 +449,7 @@ void sift(unsigned* out_feat,
coKernel[device] = new Kernel(*siftProgs[device], "calcOrientation");
rdKernel[device] = new Kernel(*siftProgs[device], "removeDuplicates");
cdKernel[device] = new Kernel(*siftProgs[device], "computeDescriptor");
+ cgKernel[device] = new Kernel(*siftProgs[device], "computeGLOHDescriptor");
});
const unsigned min_dim = (double_input) ? min(img.info.dims[0]*2, img.info.dims[1]*2)
@@ -461,7 +473,10 @@ void sift(unsigned* out_feat,
const unsigned d = DescrWidth;
const unsigned n = DescrHistBins;
- const unsigned desc_len = d*d*n;
+ const unsigned rb = GLOHRadialBins;
+ const unsigned ab = GLOHAngularBins;
+ const unsigned hb = GLOHHistBins;
+ const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n;
cl::Buffer* d_count = bufferAlloc(sizeof(unsigned));
@@ -667,17 +682,32 @@ void sift(unsigned* out_feat,
const unsigned histsz = 8;
- auto cdOp = make_kernel<Buffer, unsigned, unsigned,
- Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
- Buffer, KParam, int, int, float, int,
- LocalSpaceArg> (*cdKernel[device]);
-
- cdOp(EnqueueArgs(getQueue(), global_desc, local_desc),
- *d_desc, desc_len, histsz,
- *d_oriented_x, *d_oriented_y, *d_oriented_layer,
- *d_oriented_response, *d_oriented_size, *d_oriented_ori, oriented_feat,
- *gauss_pyr[o].data, gauss_pyr[o].info, d, n, scale, n_layers,
- cl::Local(desc_len * (histsz+1) * sizeof(float)));
+ if (compute_GLOH) {
+ auto cgOp = make_kernel<Buffer, unsigned, unsigned,
+ Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
+ Buffer, KParam, int, unsigned, unsigned, unsigned, float, int,
+ LocalSpaceArg> (*cgKernel[device]);
+
+ cgOp(EnqueueArgs(getQueue(), global_desc, local_desc),
+ *d_desc, desc_len, histsz,
+ *d_oriented_x, *d_oriented_y, *d_oriented_layer,
+ *d_oriented_response, *d_oriented_size, *d_oriented_ori, oriented_feat,
+ *gauss_pyr[o].data, gauss_pyr[o].info, d, rb, ab, hb, scale, n_layers,
+ cl::Local(desc_len * (histsz+1) * sizeof(float)));
+ }
+ else {
+ auto cdOp = make_kernel<Buffer, unsigned, unsigned,
+ Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, unsigned,
+ Buffer, KParam, int, int, float, int,
+ LocalSpaceArg> (*cdKernel[device]);
+
+ cdOp(EnqueueArgs(getQueue(), global_desc, local_desc),
+ *d_desc, desc_len, histsz,
+ *d_oriented_x, *d_oriented_y, *d_oriented_layer,
+ *d_oriented_response, *d_oriented_size, *d_oriented_ori, oriented_feat,
+ *gauss_pyr[o].data, gauss_pyr[o].info, d, n, scale, n_layers,
+ cl::Local(desc_len * (histsz+1) * sizeof(float)));
+ }
CL_DEBUG_FINISH(getQueue());
total_feat += oriented_feat;
diff --git a/src/backend/opencl/sift.cpp b/src/backend/opencl/sift.cpp
index 7e5aa6d..7f83415 100644
--- a/src/backend/opencl/sift.cpp
+++ b/src/backend/opencl/sift.cpp
@@ -31,7 +31,8 @@ unsigned sift(Array<float>& x_out, Array<float>& y_out, Array<float>& score_out,
const Array<T>& in, const unsigned n_layers,
const float contrast_thr, const float edge_thr,
const float init_sigma, const bool double_input,
- const float img_scale, const float feature_ratio)
+ const float img_scale, const float feature_ratio,
+ const bool compute_GLOH)
{
#ifdef AF_BUILD_SIFT
unsigned nfeat_out;
@@ -46,7 +47,7 @@ unsigned sift(Array<float>& x_out, Array<float>& y_out, Array<float>& score_out,
kernel::sift<T,convAccT>(&nfeat_out, &desc_len, x, y, score, ori, size, desc,
in, n_layers, contrast_thr, edge_thr, init_sigma,
- double_input, img_scale, feature_ratio);
+ double_input, img_scale, feature_ratio, compute_GLOH);
if (nfeat_out > 0) {
const dim4 out_dims(nfeat_out);
@@ -62,19 +63,23 @@ unsigned sift(Array<float>& x_out, Array<float>& y_out, Array<float>& score_out,
return nfeat_out;
#else
- AF_ERROR("ArrayFire was not built with nonfree support, SIFT disabled\n", AFF_ERR_NONFREE);
+ if (compute_GLOH)
+ AF_ERROR("ArrayFire was not built with nonfree support, GLOH disabled\n", AFF_ERR_NONFREE);
+ else
+ AF_ERROR("ArrayFire was not built with nonfree support, SIFT disabled\n", AFF_ERR_NONFREE);
#endif
}
-#define INSTANTIATE(T, convAccT) \
- template unsigned sift<T, convAccT>(Array<float>& x_out, Array<float>& y_out, \
- Array<float>& score_out, Array<float>& ori_out, \
- Array<float>& size_out, Array<float>& desc_out, \
- const Array<T>& in, const unsigned n_layers, \
- const float contrast_thr, const float edge_thr, \
- const float init_sigma, const bool double_input, \
- const float img_scale, const float feature_ratio);
+#define INSTANTIATE(T, convAccT) \
+ template unsigned sift<T, convAccT>(Array<float>& x_out, Array<float>& y_out, \
+ Array<float>& score_out, Array<float>& ori_out, \
+ Array<float>& size_out, Array<float>& desc_out, \
+ const Array<T>& in, const unsigned n_layers, \
+ const float contrast_thr, const float edge_thr, \
+ const float init_sigma, const bool double_input, \
+ const float img_scale, const float feature_ratio, \
+ const bool compute_GLOH);
INSTANTIATE(float , float )
INSTANTIATE(double, double)
diff --git a/src/backend/opencl/sift.hpp b/src/backend/opencl/sift.hpp
index 96b422f..1587fc9 100644
--- a/src/backend/opencl/sift.hpp
+++ b/src/backend/opencl/sift.hpp
@@ -21,6 +21,7 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
const Array<T>& in, const unsigned n_layers,
const float contrast_thr, const float edge_thr,
const float init_sigma, const bool double_input,
- const float img_scale, const float feature_ratio);
+ const float img_scale, const float feature_ratio,
+ const bool compute_GLOH);
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list