[arrayfire] 25/29: Adding missing offsets for various OpenCL kernels
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Oct 19 17:48:55 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository arrayfire.
commit 2dfad04a4797dc52e21756fbc540d7e315e507ea
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date: Sat Oct 17 20:53:31 2015 -0400
Adding missing offsets for various OpenCL kernels
---
src/backend/opencl/kernel/assign.cl | 5 +++--
src/backend/opencl/kernel/fast.cl | 2 +-
src/backend/opencl/kernel/hsv_rgb.cl | 2 +-
src/backend/opencl/kernel/iir.cl | 4 ++--
src/backend/opencl/kernel/index.cl | 5 +++--
src/backend/opencl/kernel/lookup.cl | 5 +++--
src/backend/opencl/kernel/nearest_neighbour.cl | 12 ++++++------
src/backend/opencl/kernel/nearest_neighbour.hpp | 5 +++--
src/backend/opencl/kernel/regions.cl | 6 ++++--
src/backend/opencl/kernel/scan_dim.cl | 2 ++
src/backend/opencl/kernel/select.cl | 17 ++++++++++++-----
src/backend/opencl/kernel/susan.cl | 5 ++++-
src/backend/opencl/kernel/susan.hpp | 4 +++-
src/backend/opencl/kernel/where.cl | 2 +-
src/backend/opencl/kernel/wrap.cl | 2 +-
src/backend/opencl/nearest_neighbour.cpp | 11 ++++-------
src/backend/opencl/susan.cpp | 18 +++++++++---------
17 files changed, 62 insertions(+), 45 deletions(-)
diff --git a/src/backend/opencl/kernel/assign.cl b/src/backend/opencl/kernel/assign.cl
index 167031b..927ccdd 100644
--- a/src/backend/opencl/kernel/assign.cl
+++ b/src/backend/opencl/kernel/assign.cl
@@ -52,8 +52,9 @@ void assignKernel(global T * optr, KParam oInfo, global const T * iptr, KParam i
global const T *src = iptr + (gx*iInfo.strides[0]+
gy*iInfo.strides[1]+
gz*iInfo.strides[2]+
- gw*iInfo.strides[3]);
- global T *dst = optr + (i+j+k+l);
+ gw*iInfo.strides[3]+
+ iInfo.offset);
+ global T *dst = optr + (i+j+k+l) + oInfo.offset;
// set the output
dst[0] = src[0];
}
diff --git a/src/backend/opencl/kernel/fast.cl b/src/backend/opencl/kernel/fast.cl
index 1c9fdf3..695e167 100644
--- a/src/backend/opencl/kernel/fast.cl
+++ b/src/backend/opencl/kernel/fast.cl
@@ -165,7 +165,7 @@ void locate_features(
unsigned lx = bx / 2 + 3;
unsigned ly = by / 2 + 3;
- load_shared_image(in, iInfo, local_image, ix, iy, bx, by, x, y, lx, ly);
+ load_shared_image(in + iInfo.offset, iInfo, local_image, ix, iy, bx, by, x, y, lx, ly);
barrier(CLK_LOCAL_MEM_FENCE);
locate_features_core(local_image, score,
iInfo, thr, x, y, edge);
diff --git a/src/backend/opencl/kernel/hsv_rgb.cl b/src/backend/opencl/kernel/hsv_rgb.cl
index a62ab6a..d309525 100644
--- a/src/backend/opencl/kernel/hsv_rgb.cl
+++ b/src/backend/opencl/kernel/hsv_rgb.cl
@@ -24,7 +24,7 @@ void convert(global T * out, KParam oInfo, global const T * in, KParam iInfo, in
int oIdx1 = oIdx0 + oInfo.strides[2];
int oIdx2 = oIdx1 + oInfo.strides[2];
- int iIdx0 = gx * iInfo.strides[0] + gy * iInfo.strides[1];
+ int iIdx0 = gx * iInfo.strides[0] + gy * iInfo.strides[1] + iInfo.offset;
int iIdx1 = iIdx0 + iInfo.strides[2];
int iIdx2 = iIdx1 + iInfo.strides[2];
diff --git a/src/backend/opencl/kernel/iir.cl b/src/backend/opencl/kernel/iir.cl
index ce970d7..b189065 100644
--- a/src/backend/opencl/kernel/iir.cl
+++ b/src/backend/opencl/kernel/iir.cl
@@ -72,8 +72,8 @@ void iir_kernel( __global T *yptr, const KParam yinfo,
#endif
__global T *d_y = yptr + y_off;
- const __global T *d_c = cptr + c_off;
- const __global T *d_a = aptr + a_off;
+ const __global T *d_c = cptr + c_off + cinfo.offset;
+ const __global T *d_a = aptr + a_off + ainfo.offset;
const int repeat = (num_a + get_local_size(0) - 1) / get_local_size(0);
for (int ii = 0; ii < MAX_A_SIZE / get_local_size(0); ii++) {
diff --git a/src/backend/opencl/kernel/index.cl b/src/backend/opencl/kernel/index.cl
index 2fe9287..0d2839d 100644
--- a/src/backend/opencl/kernel/index.cl
+++ b/src/backend/opencl/kernel/index.cl
@@ -49,11 +49,12 @@ void indexKernel(global T * optr, KParam oInfo, global const T * iptr, KParam iI
int k = p.strds[2] * trimIndex(s2 ? gz+p.offs[2] : ptr2[gz], iInfo.dims[2]);
int l = p.strds[3] * trimIndex(s3 ? gw+p.offs[3] : ptr3[gw], iInfo.dims[3]);
// offset input and output pointers
- global const T *src = iptr + (i+j+k+l);
+ global const T *src = iptr + (i+j+k+l) + iInfo.offset;
global T *dst = optr + (gx*oInfo.strides[0]+
gy*oInfo.strides[1]+
gz*oInfo.strides[2]+
- gw*oInfo.strides[3]);
+ gw*oInfo.strides[3]+
+ oInfo.offset);
// set the output
dst[0] = src[0];
}
diff --git a/src/backend/opencl/kernel/lookup.cl b/src/backend/opencl/kernel/lookup.cl
index b573429..d24572f 100644
--- a/src/backend/opencl/kernel/lookup.cl
+++ b/src/backend/opencl/kernel/lookup.cl
@@ -45,9 +45,10 @@ void lookupND(global in_t * out,
int k = iInfo.strides[2]*(DIM==2 ? trimIndex((int)idxPtr[gz], iInfo.dims[2]): gz);
int l = iInfo.strides[3]*(DIM==3 ? trimIndex((int)idxPtr[gw], iInfo.dims[3]): gw);
- global const in_t *inPtr = in + (i+j+k+l);
+ global const in_t *inPtr = in + (i+j+k+l) + iInfo.offset;
global in_t *outPtr = out + (gx*oInfo.strides[0]+gy*oInfo.strides[1]+
- gz*oInfo.strides[2]+gw*oInfo.strides[3]);
+ gz*oInfo.strides[2]+gw*oInfo.strides[3]+
+ oInfo.offset);
if (gx<oInfo.dims[0] && gy<oInfo.dims[1] && gz<oInfo.dims[2] && gw<oInfo.dims[3]) {
outPtr[0] = inPtr[0];
diff --git a/src/backend/opencl/kernel/nearest_neighbour.cl b/src/backend/opencl/kernel/nearest_neighbour.cl
index 024fdc6..6247d1d 100644
--- a/src/backend/opencl/kernel/nearest_neighbour.cl
+++ b/src/backend/opencl/kernel/nearest_neighbour.cl
@@ -77,7 +77,7 @@ void nearest_neighbour_unroll(
// Copy local_size(0) training features to shared memory
#pragma unroll
for (unsigned i = 0; i < FEAT_LEN; i++) {
- l_train[i * get_local_size(0) + tid] = train[i * ntrain + f];
+ l_train[i * get_local_size(0) + tid] = train[i * ntrain + f + tInfo.offset];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -89,7 +89,7 @@ void nearest_neighbour_unroll(
// Load one query feature that will be tested against all training
// features in current block
if (tid < FEAT_LEN && valid_feat) {
- l_query[tid] = query[tid * nquery + j];
+ l_query[tid] = query[tid * nquery + j + qInfo.offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -102,7 +102,7 @@ void nearest_neighbour_unroll(
#ifdef USE_LOCAL_MEM
dist += DISTOP(l_train[k * get_local_size(0) + tid], l_query[k]);
#else
- dist += DISTOP(train[k * ntrain + f], l_query[k]);
+ dist += DISTOP(train[k * ntrain + f + tInfo.offset], l_query[k]);
#endif
}
}
@@ -217,7 +217,7 @@ void nearest_neighbour(
if (valid_feat) {
// Copy local_size(0) training features to shared memory
for (unsigned i = 0; i < feat_len; i++) {
- l_train[i * get_local_size(0) + tid] = train[i * ntrain + f];
+ l_train[i * get_local_size(0) + tid] = train[i * ntrain + f + tInfo.offset];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -229,7 +229,7 @@ void nearest_neighbour(
// Load one query feature that will be tested against all training
// features in current block
if (tid < feat_len && valid_feat) {
- l_query[tid] = query[tid * nquery + j];
+ l_query[tid] = query[tid * nquery + j + qInfo.offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -241,7 +241,7 @@ void nearest_neighbour(
#ifdef USE_LOCAL_MEM
dist += DISTOP(l_train[k * get_local_size(0) + tid], l_query[k]);
#else
- dist += DISTOP(train[k * ntrain + f], l_query[k]);
+ dist += DISTOP(train[k * ntrain + f + tInfo.offset], l_query[k]);
#endif
}
}
diff --git a/src/backend/opencl/kernel/nearest_neighbour.hpp b/src/backend/opencl/kernel/nearest_neighbour.hpp
index d7c800d..34688b2 100644
--- a/src/backend/opencl/kernel/nearest_neighbour.hpp
+++ b/src/backend/opencl/kernel/nearest_neighbour.hpp
@@ -28,14 +28,15 @@ namespace kernel
static const unsigned THREADS = 256;
-template<typename T, typename To, af_match_type dist_type, bool use_lmem>
+template<typename T, typename To, af_match_type dist_type>
void nearest_neighbour(Param idx,
Param dist,
Param query,
Param train,
const dim_t dist_dim,
const unsigned n_dist,
- const size_t lmem_sz)
+ const size_t lmem_sz,
+ bool use_lmem)
{
try {
const unsigned feat_len = query.info.dims[dist_dim];
diff --git a/src/backend/opencl/kernel/regions.cl b/src/backend/opencl/kernel/regions.cl
index 38bc159..51e4d9a 100644
--- a/src/backend/opencl/kernel/regions.cl
+++ b/src/backend/opencl/kernel/regions.cl
@@ -12,9 +12,10 @@
__kernel
void initial_label(global T * equiv_map,
KParam eInfo,
- global char * bin,
+ global char * bin_,
KParam bInfo)
{
+ global char *bin = bin_ + bInfo.offset;
const int base_x = (get_group_id(0) * get_local_size(0) * N_PER_THREAD) + get_local_id(0);
const int base_y = (get_group_id(1) * get_local_size(1) * N_PER_THREAD) + get_local_id(1);
@@ -36,10 +37,11 @@ void initial_label(global T * equiv_map,
__kernel
void final_relabel(global T * equiv_map,
KParam eInfo,
- global char * bin,
+ global char * bin_,
KParam bInfo,
global const T * d_tmp)
{
+ global char *bin = bin_ + bInfo.offset;
const int base_x = (get_group_id(0) * get_local_size(0) * N_PER_THREAD) + get_local_id(0);
const int base_y = (get_group_id(1) * get_local_size(1) * N_PER_THREAD) + get_local_id(1);
diff --git a/src/backend/opencl/kernel/scan_dim.cl b/src/backend/opencl/kernel/scan_dim.cl
index 5434189..b15d8ed 100644
--- a/src/backend/opencl/kernel/scan_dim.cl
+++ b/src/backend/opencl/kernel/scan_dim.cl
@@ -38,6 +38,8 @@ void scan_dim_kernel(__global To *oData, KParam oInfo,
ids[dim] = ids[dim] * DIMY * lim + lidy;
oData += ids[3] * oInfo.strides[3] + ids[2] * oInfo.strides[2] + ids[1] * oInfo.strides[1] + ids[0];
iData += ids[3] * iInfo.strides[3] + ids[2] * iInfo.strides[2] + ids[1] * iInfo.strides[1] + ids[0];
+ iData += iInfo.offset;
+
int id_dim = ids[dim];
const int out_dim = oInfo.dims[dim];
diff --git a/src/backend/opencl/kernel/select.cl b/src/backend/opencl/kernel/select.cl
index 2cfebb8..94a3603 100644
--- a/src/backend/opencl/kernel/select.cl
+++ b/src/backend/opencl/kernel/select.cl
@@ -27,12 +27,16 @@ int getOffset(dim_t *dims, dim_t *strides, dim_t *refdims)
__kernel
void select_kernel(__global T *optr, KParam oinfo,
- __global char *cptr, KParam cinfo,
- __global T *aptr, KParam ainfo,
- __global T *bptr, KParam binfo,
+ __global char *cptr_, KParam cinfo,
+ __global T *aptr_, KParam ainfo,
+ __global T *bptr_, KParam binfo,
int groups_0,
int groups_1)
{
+ __global char *cptr = cptr_ + cinfo.offset;
+ __global T *aptr = aptr_ + ainfo.offset;
+ __global T *bptr = bptr_ + binfo.offset;
+
const int idz = get_group_id(0) / groups_0;
const int idw = get_group_id(1) / groups_1;
@@ -63,12 +67,15 @@ void select_kernel(__global T *optr, KParam oinfo,
__kernel
void select_scalar_kernel(__global T *optr, KParam oinfo,
- __global char *cptr, KParam cinfo,
- __global T *aptr, KParam ainfo,
+ __global char *cptr_, KParam cinfo,
+ __global T *aptr_, KParam ainfo,
T b,
int groups_0,
int groups_1)
{
+ __global char *cptr = cptr_ + cinfo.offset;
+ __global T *aptr = aptr_ + ainfo.offset;
+
const int idz = get_group_id(0) / groups_0;
const int idw = get_group_id(1) / groups_1;
diff --git a/src/backend/opencl/kernel/susan.cl b/src/backend/opencl/kernel/susan.cl
index 5bfd094..97dc8d6 100644
--- a/src/backend/opencl/kernel/susan.cl
+++ b/src/backend/opencl/kernel/susan.cl
@@ -11,11 +11,14 @@
#ifdef RESPONSE
kernel
-void susan_responses(global T* out, global const T* in,
+void susan_responses(global T* out, global const T* in_,
+ const unsigned in_off,
const unsigned idim0, const unsigned idim1,
const float t, const float g,
const unsigned edge)
{
+ global const T* in = in_ + in_off;
+
const int rSqrd = RADIUS*RADIUS;
const int windLen = 2*RADIUS+1;
const int shrdLen = BLOCK_X + windLen-1;
diff --git a/src/backend/opencl/kernel/susan.hpp b/src/backend/opencl/kernel/susan.hpp
index 814616e..a3f669d 100644
--- a/src/backend/opencl/kernel/susan.hpp
+++ b/src/backend/opencl/kernel/susan.hpp
@@ -36,6 +36,7 @@ static const unsigned SUSAN_THREADS_Y = 16;
template<typename T, unsigned radius>
void susan(cl::Buffer* out, const cl::Buffer* in,
+ const unsigned in_off,
const unsigned idim0, const unsigned idim1,
const float t, const float g, const unsigned edge)
{
@@ -69,6 +70,7 @@ void susan(cl::Buffer* out, const cl::Buffer* in,
});
auto susanOp = make_kernel<Buffer, Buffer,
+ unsigned,
unsigned, unsigned,
float, float, unsigned>(*suKernel[device]);
@@ -76,7 +78,7 @@ void susan(cl::Buffer* out, const cl::Buffer* in,
NDRange global(divup(idim0-2*edge, local[0])*local[0],
divup(idim1-2*edge, local[1])*local[1]);
- susanOp(EnqueueArgs(getQueue(), global, local), *out, *in, idim0, idim1, t, g, edge);
+ susanOp(EnqueueArgs(getQueue(), global, local), *out, *in, in_off, idim0, idim1, t, g, edge);
} catch (cl::Error err) {
CL_TO_AF_ERROR(err);
diff --git a/src/backend/opencl/kernel/where.cl b/src/backend/opencl/kernel/where.cl
index ab4b31e..08c4731 100644
--- a/src/backend/opencl/kernel/where.cl
+++ b/src/backend/opencl/kernel/where.cl
@@ -42,7 +42,7 @@ void get_out_idx_kernel(__global uint *oData,
const uint gid = wid * rtInfo.strides[3] + zid * rtInfo.strides[2] + yid * rtInfo.strides[1] + groupId_x;
otData += wid * otInfo.strides[3] + zid * otInfo.strides[2] + yid * otInfo.strides[1];
- iData += wid * iInfo.strides[3] + zid * iInfo.strides[2] + yid * iInfo.strides[1];
+ iData += wid * iInfo.strides[3] + zid * iInfo.strides[2] + yid * iInfo.strides[1] + iInfo.offset;
bool cond = (yid < otInfo.dims[1]) && (zid < otInfo.dims[2]) && (wid < otInfo.dims[3]);
if (!cond) return;
diff --git a/src/backend/opencl/kernel/wrap.cl b/src/backend/opencl/kernel/wrap.cl
index 8171f9e..c8242a7 100644
--- a/src/backend/opencl/kernel/wrap.cl
+++ b/src/backend/opencl/kernel/wrap.cl
@@ -27,7 +27,7 @@ void wrap_kernel(__global T *optr, KParam out,
dim_t oidx1 = get_local_id(1) + get_local_size(1) * groupId_y;
optr += idx2 * out.strides[2] + idx3 * out.strides[3];
- iptr += idx2 * in.strides[2] + idx3 * in.strides[3];
+ iptr += idx2 * in.strides[2] + idx3 * in.strides[3] + in.offset;
if (oidx0 >= out.dims[0] || oidx1 >= out.dims[1]) return;
diff --git a/src/backend/opencl/nearest_neighbour.cpp b/src/backend/opencl/nearest_neighbour.cpp
index b2cb142..11fc832 100644
--- a/src/backend/opencl/nearest_neighbour.cpp
+++ b/src/backend/opencl/nearest_neighbour.cpp
@@ -26,8 +26,8 @@ static const unsigned THREADS = 256;
template<typename T, typename To, af_match_type dist_type>
void nearest_neighbour_(Array<uint>& idx, Array<To>& dist,
- const Array<T>& query, const Array<T>& train,
- const uint dist_dim, const uint n_dist)
+ const Array<T>& query, const Array<T>& train,
+ const uint dist_dim, const uint n_dist)
{
uint sample_dim = (dist_dim == 0) ? 1 : 0;
const dim4 qDims = query.dims();
@@ -75,11 +75,8 @@ void nearest_neighbour_(Array<uint>& idx, Array<To>& dist,
kernel::transpose<T, false, false>(trainT, train);
}
- if (use_lmem) {
- kernel::nearest_neighbour<T, To, dist_type, true >(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- } else {
- kernel::nearest_neighbour<T, To, dist_type, false>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz);
- }
+ kernel::nearest_neighbour<T, To, dist_type>(idx, dist, queryT, trainT, 1, n_dist, lmem_sz, use_lmem);
+
}
template<typename T, typename To>
diff --git a/src/backend/opencl/susan.cpp b/src/backend/opencl/susan.cpp
index 71aca1b..5e9b1a6 100644
--- a/src/backend/opencl/susan.cpp
+++ b/src/backend/opencl/susan.cpp
@@ -35,15 +35,15 @@ unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
cl::Buffer* resp = bufferAlloc(in.elements()*sizeof(float));
switch(radius) {
- case 1: kernel::susan<T, 1>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
- case 2: kernel::susan<T, 2>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
- case 3: kernel::susan<T, 3>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
- case 4: kernel::susan<T, 4>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
- case 5: kernel::susan<T, 5>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
- case 6: kernel::susan<T, 6>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
- case 7: kernel::susan<T, 7>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
- case 8: kernel::susan<T, 8>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
- case 9: kernel::susan<T, 9>(resp, in.get(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 1: kernel::susan<T, 1>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 2: kernel::susan<T, 2>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 3: kernel::susan<T, 3>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 4: kernel::susan<T, 4>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 5: kernel::susan<T, 5>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 6: kernel::susan<T, 6>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 7: kernel::susan<T, 7>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 8: kernel::susan<T, 8>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
+ case 9: kernel::susan<T, 9>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
}
unsigned corners_found = kernel::nonMaximal<T>(x_corners, y_corners, resp_corners,
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list