[arrayfire] 279/408: Moved syncthreads/barriers out of thread conditionals
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:13 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid in repository arrayfire.
commit faefa30c3a0b9b386919e40151dc966b42110ef5
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date: Tue Aug 18 14:54:32 2015 -0400
Moved syncthreads/barriers out of thread conditionals
---
src/backend/cuda/kernel/orb.hpp | 40 ++++++++++++++++----------------
src/backend/opencl/kernel/sift.cl | 48 ++++++++++++++++++++-------------------
2 files changed, 46 insertions(+), 42 deletions(-)
diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp
index 38c6eaa..4935405 100644
--- a/src/backend/cuda/kernel/orb.hpp
+++ b/src/backend/cuda/kernel/orb.hpp
@@ -115,6 +115,9 @@ __global__ void harris_response(
{
unsigned f = blockDim.x * blockIdx.x + threadIdx.x;
+ float ixx = 0.f, iyy = 0.f, ixy = 0.f;
+ float size = 0.f;
+
if (f < total_feat) {
unsigned x, y;
float scl = 1.f;
@@ -130,7 +133,7 @@ __global__ void harris_response(
}
// Round feature size to nearest odd integer
- float size = 2.f * floor((patch_size * scl) / 2.f) + 1.f;
+ size = 2.f * floor((patch_size * scl) / 2.f) + 1.f;
// Avoid keeping features that might be too wide and might not fit on
// the image, sqrt(2.f) is the radius when angle is 45 degrees and
@@ -141,7 +144,6 @@ __global__ void harris_response(
unsigned r = block_size / 2;
- float ixx = 0.f, iyy = 0.f, ixy = 0.f;
unsigned block_size_sq = block_size * block_size;
for (unsigned k = threadIdx.y; k < block_size_sq; k += blockDim.y) {
int i = k / block_size - r;
@@ -156,28 +158,28 @@ __global__ void harris_response(
iyy += iy*iy;
ixy += ix*iy;
}
- __syncthreads();
+ }
+ __syncthreads();
- ixx = block_reduce_sum(ixx);
- iyy = block_reduce_sum(iyy);
- ixy = block_reduce_sum(ixy);
+ ixx = block_reduce_sum(ixx);
+ iyy = block_reduce_sum(iyy);
+ ixy = block_reduce_sum(ixy);
- if (threadIdx.y == 0) {
- float tr = ixx + iyy;
- float det = ixx*iyy - ixy*ixy;
+ if (f < total_feat && threadIdx.y == 0) {
+ float tr = ixx + iyy;
+ float det = ixx*iyy - ixy*ixy;
- // Calculate Harris responses
- float resp = det - k_thr * (tr*tr);
+ // Calculate Harris responses
+ float resp = det - k_thr * (tr*tr);
- // Scale factor
- // TODO: improve response scaling
- float rscale = 0.001f;
- rscale = rscale * rscale * rscale * rscale;
+ // Scale factor
+ // TODO: improve response scaling
+ float rscale = 0.001f;
+ rscale = rscale * rscale * rscale * rscale;
- score_out[f] = resp * rscale;
- if (use_scl)
- size_out[f] = size;
- }
+ score_out[f] = resp * rscale;
+ if (use_scl)
+ size_out[f] = size;
}
}
diff --git a/src/backend/opencl/kernel/sift.cl b/src/backend/opencl/kernel/sift.cl
index 57610df..ef63cab 100644
--- a/src/backend/opencl/kernel/sift.cl
+++ b/src/backend/opencl/kernel/sift.cl
@@ -690,6 +690,12 @@ __kernel void computeDescriptor(
__local float* desc = l_mem;
__local float* accum = l_mem + desc_len * histsz;
+ const int histlen = d*d*n;
+
+ for (int i = lid_x; i < histlen*histsz; i += lsz_x)
+ desc[lid_y*histlen+i] = 0.f;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
if (f < total_feat) {
const unsigned layer = layer_in[f];
float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
@@ -711,13 +717,8 @@ __kernel void computeDescriptor(
int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
int len = radius*2+1;
- const int histlen = d*d*n;
const int hist_off = (lid_x % histsz) * desc_len;
- for (int i = lid_x; i < histlen*histsz; i += lsz_x)
- desc[lid_y*histlen+i] = 0.f;
- barrier(CLK_LOCAL_MEM_FENCE);
-
// Calculate orientation histogram
for (int l = lid_x; l < len*len; l += lsz_x) {
int i = l / len - radius;
@@ -773,31 +774,32 @@ __kernel void computeDescriptor(
}
}
}
- barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
- // Combine histograms (reduces previous atomicAdd overhead)
- for (int l = lid_x; l < desc_len*4; l += lsz_x)
- desc[l] += desc[l+4*desc_len];
- barrier(CLK_LOCAL_MEM_FENCE);
- for (int l = lid_x; l < desc_len*2; l += lsz_x)
- desc[l ] += desc[l+2*desc_len];
- barrier(CLK_LOCAL_MEM_FENCE);
- for (int l = lid_x; l < desc_len; l += lsz_x)
- desc[l] += desc[l+desc_len];
- barrier(CLK_LOCAL_MEM_FENCE);
+ // Combine histograms (reduces previous atomicAdd overhead)
+ for (int l = lid_x; l < desc_len*4; l += lsz_x)
+ desc[l] += desc[l+4*desc_len];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for (int l = lid_x; l < desc_len*2; l += lsz_x)
+ desc[l ] += desc[l+2*desc_len];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for (int l = lid_x; l < desc_len; l += lsz_x)
+ desc[l] += desc[l+desc_len];
+ barrier(CLK_LOCAL_MEM_FENCE);
- normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
+ normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
- for (int i = lid_x; i < d*d*n; i += lsz_x)
- desc[lid_y*desc_len+i] = min(desc[lid_y*desc_len+i], DESCR_MAG_THR);
- barrier(CLK_LOCAL_MEM_FENCE);
+ for (int i = lid_x; i < d*d*n; i += lsz_x)
+ desc[lid_y*desc_len+i] = min(desc[lid_y*desc_len+i], DESCR_MAG_THR);
+ barrier(CLK_LOCAL_MEM_FENCE);
- normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
+ normalizeDesc(desc, accum, histlen, lid_x, lid_y, lsz_x);
+ if (f < total_feat) {
// Calculate final descriptor values
- for (int k = lid_x; k < d*d*n; k += lsz_x) {
+ for (int k = lid_x; k < d*d*n; k += lsz_x)
desc_out[f*desc_len+k] = round(min(255.f, desc[lid_y*desc_len+k] * INT_DESCR_FCTR));
- }
}
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits mailing list