[arrayfire] 102/248: Added CUDA implementation of GLOH
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Nov 17 15:54:09 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch dfsg-clean
in repository arrayfire.
commit 337fcec576f5989c35328a6d5dab8791cf9b1964
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date: Wed Oct 7 16:16:10 2015 -0400
Added CUDA implementation of GLOH
---
src/backend/cuda/kernel/sift_nonfree.hpp | 270 +++++++++++++++++++++++++++++--
src/backend/cuda/sift.cu | 14 +-
src/backend/cuda/sift.hpp | 3 +-
3 files changed, 265 insertions(+), 22 deletions(-)
diff --git a/src/backend/cuda/kernel/sift_nonfree.hpp b/src/backend/cuda/kernel/sift_nonfree.hpp
index 1a1c35f..391ad42 100644
--- a/src/backend/cuda/kernel/sift_nonfree.hpp
+++ b/src/backend/cuda/kernel/sift_nonfree.hpp
@@ -142,6 +142,18 @@ static const dim_t SIFT_THREADS_Y = 8;
// factor used to convert floating-podescriptor to unsigned char
#define INT_DESCR_FCTR 512.f
+// Number of GLOH bins in radial direction
+static const unsigned GLOHRadialBins = 3;
+
+// Radii of GLOH descriptors
+__constant__ float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f};
+
+// Number of GLOH angular bins (excluding the inner-most radial section)
+static const unsigned GLOHAngularBins = 8;
+
+// Number of GLOH bins per histogram in descriptor
+static const unsigned GLOHHistBins = 16;
+
template<typename T>
void gaussian1D(T* out, const int dim, double sigma=0.0)
{
@@ -230,7 +242,7 @@ __inline__ __device__ void normalizeDesc(
int bsz_x = blockDim.x;
for (int i = tid_x; i < histlen; i += bsz_x)
- accum[tid_x] = desc[tid_y*histlen+i]*desc[tid_y*histlen+i];
+ accum[i] = desc[tid_y*histlen+i]*desc[tid_y*histlen+i];
__syncthreads();
if (tid_x < 64)
@@ -264,6 +276,54 @@ __inline__ __device__ void normalizeDesc(
__syncthreads();
}
+__inline__ __device__ void normalizeGLOHDesc(
+ float* desc,
+ float* accum,
+ const int histlen)
+{
+ int tid_x = threadIdx.x;
+ int tid_y = threadIdx.y;
+ int bsz_x = blockDim.x;
+
+ for (int i = tid_x; i < histlen; i += bsz_x)
+ accum[i] = desc[tid_y*histlen+i]*desc[tid_y*histlen+i];
+ __syncthreads();
+
+ if (tid_x < 128)
+ accum[tid_x] += accum[tid_x+128];
+ __syncthreads();
+ if (tid_x < 64)
+ accum[tid_x] += accum[tid_x+64];
+ __syncthreads();
+ if (tid_x < 32)
+ accum[tid_x] += accum[tid_x+32];
+ __syncthreads();
+ if (tid_x < 16)
+ // GLOH is 272-dimensional, accumulating last 16 descriptors
+ accum[tid_x] += accum[tid_x+16] + accum[tid_x+256];
+ __syncthreads();
+ if (tid_x < 8)
+ accum[tid_x] += accum[tid_x+8];
+ __syncthreads();
+ if (tid_x < 4)
+ accum[tid_x] += accum[tid_x+4];
+ __syncthreads();
+ if (tid_x < 2)
+ accum[tid_x] += accum[tid_x+2];
+ __syncthreads();
+ if (tid_x < 1)
+ accum[tid_x] += accum[tid_x+1];
+ __syncthreads();
+
+ float len_sq = accum[0];
+ float len_inv = 1.0f / sqrtf(len_sq);
+
+ for (int i = tid_x; i < histlen; i += bsz_x) {
+ desc[tid_y*histlen+i] *= len_inv;
+ }
+ __syncthreads();
+}
+
template<typename T>
__global__ void sub(
Param<T> out,
@@ -759,10 +819,8 @@ __global__ void computeDescriptor(
float* desc = shrdMem;
float* accum = shrdMem + desc_len * histsz;
- const int histlen = (d)*(d)*(n);
-
- for (int i = tid_x; i < histlen*histsz; i += bsz_x)
- desc[tid_y*histlen+i] = 0.f;
+ for (int i = tid_x; i < desc_len*histsz; i += bsz_x)
+ desc[tid_y*desc_len+i] = 0.f;
__syncthreads();
if (f < total_feat) {
@@ -859,17 +917,184 @@ __global__ void computeDescriptor(
desc[l] += desc[l+desc_len];
__syncthreads();
- normalizeDesc(desc, accum, histlen);
+ normalizeDesc(desc, accum, desc_len);
- for (int i = tid_x; i < d*d*n; i += bsz_x)
+ for (int i = tid_x; i < desc_len; i += bsz_x)
desc[tid_y*desc_len+i] = min(desc[tid_y*desc_len+i], DESC_MAG_THR);
__syncthreads();
- normalizeDesc(desc, accum, histlen);
+ normalizeDesc(desc, accum, desc_len);
if (f < total_feat) {
// Calculate final descriptor values
- for (int k = tid_x; k < d*d*n; k += bsz_x)
+ for (int k = tid_x; k < desc_len; k += bsz_x)
+ desc_out[f*desc_len+k] = round(min(255.f, desc[tid_y*desc_len+k] * INT_DESCR_FCTR));
+ }
+}
+
+// Computes GLOH feature descriptors for features in an array. Based on Section III-B
+// of Mikolajczyk and Schmid paper.
+template<typename T>
+__global__ void computeGLOHDescriptor(
+ float* desc_out,
+ const unsigned desc_len,
+ const unsigned histsz,
+ const float* x_in,
+ const float* y_in,
+ const unsigned* layer_in,
+ const float* response_in,
+ const float* size_in,
+ const float* ori_in,
+ const unsigned total_feat,
+ const CParam<T> gauss_octave,
+ const int d,
+ const unsigned rb,
+ const unsigned ab,
+ const unsigned hb,
+ const float scale,
+ const int n_layers)
+{
+ const int tid_x = threadIdx.x;
+ const int tid_y = threadIdx.y;
+ const int bsz_x = blockDim.x;
+ const int bsz_y = blockDim.y;
+
+ const int f = blockIdx.y * bsz_y + tid_y;
+
+ SharedMemory<float> shared;
+ float* shrdMem = shared.getPointer();
+ float* desc = shrdMem;
+ float* accum = shrdMem + desc_len * histsz;
+
+ for (int i = tid_x; i < desc_len*histsz; i += bsz_x)
+ desc[tid_y*desc_len+i] = 0.f;
+ __syncthreads();
+
+ if (f < total_feat) {
+ const unsigned layer = layer_in[f];
+ float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
+ ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
+ const float size = size_in[f];
+ const int fx = round(x_in[f] * scale);
+ const int fy = round(y_in[f] * scale);
+
+ const int dim0 = gauss_octave.dims[0];
+ const int dim1 = gauss_octave.dims[1];
+ const int imel = dim0 * dim1;
+
+ // Points img to correct Gaussian pyramid layer
+ const T* img_ptr = gauss_octave.ptr + layer * imel;
+
+ float cos_t = cosf(ori);
+ float sin_t = sinf(ori);
+ float hist_bins_per_rad = hb / (PI_VAL * 2.f);
+ float polar_bins_per_rad = ab / (PI_VAL * 2.f);
+ float exp_denom = GLOHRadii[rb-1] * 0.5f;
+
+ float hist_width = DESCR_SCL_FCTR * size * scale * 0.5f;
+
+ // Keep same descriptor radius used for SIFT
+ int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
+
+ // Alternative radius size calculation, changing the radius weight
+ // (rw) in the range of 0.25f-0.75f gives different results,
+ // increasing it tends to show a better recall rate but with a
+ // smaller amount of correct matches
+ //float rw = 0.5f;
+ //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f;
+
+ int len = radius*2+1;
+ const int hist_off = (tid_x % histsz) * desc_len;
+
+ // Calculate orientation histogram
+ for (int l = tid_x; l < len*len; l += bsz_x) {
+ int i = l / len - radius;
+ int j = l % len - radius;
+
+ int y = fy + i;
+ int x = fx + j;
+
+ float x_rot = (j * cos_t - i * sin_t);
+ float y_rot = (j * sin_t + i * cos_t);
+
+ float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1];
+ float theta = atan2(y_rot, x_rot);
+ while (theta < 0.0f)
+ theta += PI_VAL*2;
+ while (theta >= PI_VAL*2)
+ theta -= PI_VAL*2;
+
+ float tbin = theta * polar_bins_per_rad;
+ float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] :
+ ((r < GLOHRadii[1]) ? 1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) :
+ min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON));
+
+ if (r <= GLOHRadii[rb-1] &&
+ y > 0 && y < dim0 - 1 && x > 0 && x < dim1 - 1) {
+ float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+ float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+
+ float grad_mag = sqrtf(dx*dx + dy*dy);
+ float grad_ori = atan2f(dy, dx) - ori;
+ while (grad_ori < 0.0f)
+ grad_ori += PI_VAL*2;
+ while (grad_ori >= PI_VAL*2)
+ grad_ori -= PI_VAL*2;
+
+ float w = exp(-r / exp_denom);
+ float obin = grad_ori * hist_bins_per_rad;
+ float mag = grad_mag*w;
+
+ int t0 = floor(tbin);
+ int r0 = floor(rbin);
+ int o0 = floor(obin);
+ tbin -= t0;
+ rbin -= r0;
+ obin -= o0;
+
+ for (int rl = 0; rl <= 1; rl++) {
+ int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl);
+ float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin);
+ if (rb >= 0 && rb <= 2) {
+ for (int tl = 0; tl <= 1; tl++) {
+ int tb = (t0 + tl) % ab;
+ float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin);
+ for (int ol = 0; ol <= 1; ol++) {
+ int ob = (o0 + ol) % hb;
+ float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin);
+ unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob;
+ atomicAdd(&desc[hist_off + tid_y*desc_len + idx], v_o);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ __syncthreads();
+
+ // Combine histograms (reduces previous atomicAdd overhead)
+ for (int l = tid_x; l < desc_len*4; l += bsz_x)
+ desc[l] += desc[l+4*desc_len];
+ __syncthreads();
+ for (int l = tid_x; l < desc_len*2; l += bsz_x)
+ desc[l ] += desc[l+2*desc_len];
+ __syncthreads();
+ for (int l = tid_x; l < desc_len; l += bsz_x)
+ desc[l] += desc[l+desc_len];
+ __syncthreads();
+
+ normalizeGLOHDesc(desc, accum, desc_len);
+
+ for (int i = tid_x; i < desc_len; i += bsz_x)
+ desc[tid_y*desc_len+i] = min(desc[tid_y*desc_len+i], DESC_MAG_THR);
+ __syncthreads();
+
+ normalizeGLOHDesc(desc, accum, desc_len);
+
+ if (f < total_feat) {
+ // Calculate final descriptor values
+ for (int k = tid_x; k < desc_len; k += bsz_x)
desc_out[f*desc_len+k] = round(min(255.f, desc[tid_y*desc_len+k] * INT_DESCR_FCTR));
}
}
@@ -1010,7 +1235,6 @@ std::vector< Param<T> > buildGaussPyr(
const unsigned imel = tmp_pyr[idx].dims[3] * tmp_pyr[idx].strides[3];
const unsigned offset = imel * l;
- //getQueue().enqueueCopyBuffer(*tmp_pyr[idx].data, *gauss_pyr[o].data, 0, offset*sizeof(T), imel * sizeof(T));
CUDA_CHECK(cudaMemcpy(gauss_pyr[o].ptr + offset, tmp_pyr[idx].ptr, imel * sizeof(T), cudaMemcpyDeviceToDevice));
}
}
@@ -1093,7 +1317,8 @@ void sift(unsigned* out_feat,
const float init_sigma,
const bool double_input,
const float img_scale,
- const float feature_ratio)
+ const float feature_ratio,
+ const bool compute_GLOH)
{
const unsigned min_dim = (double_input) ? min(img.dims[0]*2, img.dims[1]*2)
: min(img.dims[0], img.dims[1]);
@@ -1116,7 +1341,10 @@ void sift(unsigned* out_feat,
const unsigned d = DESCR_WIDTH;
const unsigned n = DESCR_HIST_BINS;
- const unsigned desc_len = d*d*n;
+ const unsigned rb = GLOHRadialBins;
+ const unsigned ab = GLOHAngularBins;
+ const unsigned hb = GLOHHistBins;
+ const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n;
unsigned* d_count = memAlloc<unsigned>(1);
for (unsigned i = 0; i < n_octaves; i++) {
@@ -1302,11 +1530,19 @@ void sift(unsigned* out_feat,
const unsigned histsz = 8;
const size_t shared_size = desc_len * (histsz+1) * sizeof(float);
- CUDA_LAUNCH_SMEM((computeDescriptor<T>), blocks, threads, shared_size,
- d_desc, desc_len, histsz,
- d_oriented_x, d_oriented_y, d_oriented_layer,
- d_oriented_response, d_oriented_size, d_oriented_ori,
- oriented_feat, gauss_pyr[i], d, n, scale, n_layers);
+ if (compute_GLOH)
+ CUDA_LAUNCH_SMEM((computeGLOHDescriptor<T>), blocks, threads, shared_size,
+ d_desc, desc_len, histsz,
+ d_oriented_x, d_oriented_y, d_oriented_layer,
+ d_oriented_response, d_oriented_size, d_oriented_ori,
+ oriented_feat, gauss_pyr[i], d, rb, ab, hb,
+ scale, n_layers);
+ else
+ CUDA_LAUNCH_SMEM((computeDescriptor<T>), blocks, threads, shared_size,
+ d_desc, desc_len, histsz,
+ d_oriented_x, d_oriented_y, d_oriented_layer,
+ d_oriented_response, d_oriented_size, d_oriented_ori,
+ oriented_feat, gauss_pyr[i], d, n, scale, n_layers);
POST_LAUNCH_CHECK();
total_feat += oriented_feat;
diff --git a/src/backend/cuda/sift.cu b/src/backend/cuda/sift.cu
index 3f1e99b..0b45fa2 100644
--- a/src/backend/cuda/sift.cu
+++ b/src/backend/cuda/sift.cu
@@ -31,7 +31,8 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
const Array<T>& in, const unsigned n_layers,
const float contrast_thr, const float edge_thr,
const float init_sigma, const bool double_input,
- const float img_scale, const float feature_ratio)
+ const float img_scale, const float feature_ratio,
+ const bool compute_GLOH)
{
#ifdef AF_BUILD_SIFT
const dim4 dims = in.dims();
@@ -48,7 +49,8 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
kernel::sift<T, convAccT>(&nfeat_out, &desc_len, &x_out, &y_out, &score_out,
&orientation_out, &size_out, &desc_out,
in, n_layers, contrast_thr, edge_thr,
- init_sigma, double_input, img_scale, feature_ratio);
+ init_sigma, double_input, img_scale, feature_ratio,
+ compute_GLOH);
if (nfeat_out > 0) {
if (x_out == NULL || y_out == NULL || score_out == NULL ||
@@ -70,7 +72,10 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
return nfeat_out;
#else
- AF_ERROR("ArrayFire was not built with nonfree support, SIFT disabled\n", AFF_ERR_NONFREE);
+ if (compute_GLOH)
+ AF_ERROR("ArrayFire was not built with nonfree support, GLOH disabled\n", AFF_ERR_NONFREE);
+ else
+ AF_ERROR("ArrayFire was not built with nonfree support, SIFT disabled\n", AFF_ERR_NONFREE);
#endif
}
@@ -81,7 +86,8 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
const Array<T>& in, const unsigned n_layers, \
const float contrast_thr, const float edge_thr, \
const float init_sigma, const bool double_input, \
- const float img_scale, const float feature_ratio);
+ const float img_scale, const float feature_ratio, \
+ const bool compute_GLOH);
INSTANTIATE(float , float )
INSTANTIATE(double, double)
diff --git a/src/backend/cuda/sift.hpp b/src/backend/cuda/sift.hpp
index c3eda20..28b8879 100644
--- a/src/backend/cuda/sift.hpp
+++ b/src/backend/cuda/sift.hpp
@@ -21,6 +21,7 @@ unsigned sift(Array<float>& x, Array<float>& y, Array<float>& score,
const Array<T>& in, const unsigned n_layers,
const float contrast_thr, const float edge_thr,
const float init_sigma, const bool double_input,
- const float img_scale, const float feature_ratio);
+ const float img_scale, const float feature_ratio,
+ const bool compute_GLOH);
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list