[arrayfire] 195/408: shared/local memory loading fix
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:55 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 499dc09a005de7c865429e1ce937bfef00d14337
Author: pradeep <pradeep at arrayfire.com>
Date: Wed Aug 5 16:13:55 2015 -0400
shared/local memory loading fix
---
src/backend/cuda/kernel/bilateral.hpp | 26 ++-----
src/backend/cuda/kernel/meanshift.hpp | 34 +++------
src/backend/cuda/kernel/medfilt.hpp | 34 ++-------
src/backend/cuda/kernel/morph.hpp | 129 ++++++++-------------------------
src/backend/cuda/kernel/sobel.hpp | 32 ++------
src/backend/opencl/kernel/bilateral.cl | 32 +++-----
src/backend/opencl/kernel/meanshift.cl | 36 ++++-----
src/backend/opencl/kernel/medfilt.cl | 37 +++-------
src/backend/opencl/kernel/morph.cl | 128 +++++++++-----------------------
src/backend/opencl/kernel/sobel.cl | 30 ++------
10 files changed, 139 insertions(+), 379 deletions(-)
diff --git a/src/backend/cuda/kernel/bilateral.hpp b/src/backend/cuda/kernel/bilateral.hpp
index d4e384b..566f1cc 100644
--- a/src/backend/cuda/kernel/bilateral.hpp
+++ b/src/backend/cuda/kernel/bilateral.hpp
@@ -90,26 +90,14 @@ void bilateralKernel(Param<outType> out, CParam<inType> in,
}
// pull image to local memory
- load2ShrdMem<inType, outType>(localMem, iptr, lx, ly, shrdLen,
- in.dims[0], in.dims[1], gx-radius, gy-radius, in.strides[1], in.strides[0]);
-
- int lx2 = lx + THREADS_X;
- int ly2 = ly + THREADS_Y;
- int gx2 = gx + THREADS_X;
- int gy2 = gy + THREADS_Y;
-
- if (lx<padding) {
- load2ShrdMem<inType, outType>(localMem, iptr, lx2, ly, shrdLen,
- in.dims[0], in.dims[1], gx2-radius, gy-radius, in.strides[1], in.strides[0]);
- }
- if (ly<padding) {
- load2ShrdMem<inType, outType>(localMem, iptr, lx, ly2, shrdLen,
- in.dims[0], in.dims[1], gx-radius, gy2-radius, in.strides[1], in.strides[0]);
- }
- if (lx<padding && ly<padding) {
- load2ShrdMem<inType, outType>(localMem, iptr, lx2, ly2, shrdLen,
- in.dims[0], in.dims[1], gx2-radius, gy2-radius, in.strides[1], in.strides[0]);
+ for (int b=ly, gy2=gy; b<shrdLen; b+=blockDim.y, gy2+=blockDim.y) {
+ // move row_set get_local_size(1) along coloumns
+ for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
+ load2ShrdMem<inType, outType>(localMem, iptr, a, b, shrdLen, in.dims[0], in.dims[1],
+ gx2-radius, gy2-radius, in.strides[1], in.strides[0]);
+ }
}
+
__syncthreads();
if (gx<in.dims[0] && gy<in.dims[1]) {
diff --git a/src/backend/cuda/kernel/meanshift.hpp b/src/backend/cuda/kernel/meanshift.hpp
index dc8c096..fef6128 100644
--- a/src/backend/cuda/kernel/meanshift.hpp
+++ b/src/backend/cuda/kernel/meanshift.hpp
@@ -64,7 +64,6 @@ void meanshiftKernel(Param<T> out, CParam<T> in,
// calculate necessary offset and window parameters
const int padding = 2*radius + 1;
- const int wind_len = padding - 1;
const int shrdLen = blockDim.x + padding;
const int schStride = shrdLen*(blockDim.y + padding);
// the variable ichStride will only effect when we have >1
@@ -84,32 +83,19 @@ void meanshiftKernel(Param<T> out, CParam<T> in,
const int gx = blockDim.x * (blockIdx.x-b2*nBBS0) + lx;
const int gy = blockDim.y * (blockIdx.y-b3*nBBS1) + ly;
- int gx2 = gx + blockDim.x;
- int gy2 = gy + blockDim.y;
- int lx2 = lx + blockDim.x;
- int ly2 = ly + blockDim.y;
+ // pull image to local memory
+ for (int b=ly, gy2=gy; b<shrdLen; b+=blockDim.y, gy2+=blockDim.y) {
+ // move row_set get_local_size(1) along coloumns
+ for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
+ load2ShrdMem<T, channels>(shrdMem, iptr, a, b, shrdLen, schStride,
+ in.dims[0], in.dims[1], gx2-radius, gy2-radius, ichStride,
+ in.strides[1], in.strides[0]);
+ }
+ }
+
int i = lx + radius;
int j = ly + radius;
- // pull image to local memory
- load2ShrdMem<T, channels>(shrdMem, iptr, lx, ly, shrdLen, schStride,
- in.dims[0], in.dims[1], gx-radius,
- gy-radius, ichStride, in.strides[1], in.strides[0]);
- if (lx<wind_len) {
- load2ShrdMem<T, channels>(shrdMem, iptr, lx2, ly, shrdLen, schStride,
- in.dims[0], in.dims[1], gx2-radius,
- gy-radius, ichStride, in.strides[1], in.strides[0]);
- }
- if (ly<wind_len) {
- load2ShrdMem<T, channels>(shrdMem, iptr, lx, ly2, shrdLen, schStride,
- in.dims[0], in.dims[1], gx-radius,
- gy2-radius, ichStride, in.strides[1], in.strides[0]);
- }
- if (lx<wind_len && ly<wind_len) {
- load2ShrdMem<T, channels>(shrdMem, iptr, lx2, ly2, shrdLen, schStride,
- in.dims[0], in.dims[1], gx2-radius,
- gy2-radius, ichStride, in.strides[1], in.strides[0]);
- }
__syncthreads();
if (gx>=in.dims[0] || gy>=in.dims[1])
diff --git a/src/backend/cuda/kernel/medfilt.hpp b/src/backend/cuda/kernel/medfilt.hpp
index 37aa967..ac614a3 100644
--- a/src/backend/cuda/kernel/medfilt.hpp
+++ b/src/backend/cuda/kernel/medfilt.hpp
@@ -90,35 +90,15 @@ void medfilt(Param<T> out, CParam<T> in, int nBBS0, int nBBS1)
int gx = blockDim.x * (blockIdx.x-b2*nBBS0) + lx;
int gy = blockDim.y * (blockIdx.y-b3*nBBS1) + ly;
- // offset values for pulling image to local memory
- int lx2 = lx + blockDim.x;
- int ly2 = ly + blockDim.y;
- int gx2 = gx + blockDim.x;
- int gy2 = gy + blockDim.y;
-
// pull image to local memory
- load2ShrdMem<T, pad>(shrdMem, iptr, lx, ly, shrdLen,
- in.dims[0], in.dims[1],
- gx-halo, gy-halo,
- in.strides[1], in.strides[0]);
- if (lx<padding) {
- load2ShrdMem<T, pad>(shrdMem, iptr, lx2, ly, shrdLen,
- in.dims[0], in.dims[1],
- gx2-halo, gy-halo,
- in.strides[1], in.strides[0]);
- }
- if (ly<padding) {
- load2ShrdMem<T, pad>(shrdMem, iptr, lx, ly2, shrdLen,
- in.dims[0], in.dims[1],
- gx-halo, gy2-halo,
- in.strides[1], in.strides[0]);
- }
- if (lx<padding && ly<padding) {
- load2ShrdMem<T, pad>(shrdMem, iptr, lx2, ly2, shrdLen,
- in.dims[0], in.dims[1],
- gx2-halo, gy2-halo,
- in.strides[1], in.strides[0]);
+ for (int b=ly, gy2=gy; b<shrdLen; b+=blockDim.y, gy2+=blockDim.y) {
+ // move row_set get_local_size(1) along coloumns
+ for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
+ load2ShrdMem<T, pad>(shrdMem, iptr, a, b, shrdLen, in.dims[0], in.dims[1],
+ gx2-halo, gy2-halo, in.strides[1], in.strides[0]);
+ }
}
+
__syncthreads();
// Only continue if we're at a valid location
diff --git a/src/backend/cuda/kernel/morph.hpp b/src/backend/cuda/kernel/morph.hpp
index 30bee48..be99a68 100644
--- a/src/backend/cuda/kernel/morph.hpp
+++ b/src/backend/cuda/kernel/morph.hpp
@@ -72,7 +72,8 @@ static __global__ void morphKernel(Param<T> out, CParam<T> in,
// calculate necessary offset and window parameters
const int halo = windLen/2;
const int padding= 2*halo;
- const int shrdLen= blockDim.x + padding + 1;
+ const int shrdLen = blockDim.x + padding + 1;
+ const int shrdLen1 = blockDim.y + padding;
// gfor batch offsets
unsigned b2 = blockIdx.x / nBBS0;
@@ -80,48 +81,25 @@ static __global__ void morphKernel(Param<T> out, CParam<T> in,
const T* iptr = (const T *) in.ptr + (b2 * in.strides[2] + b3 * in.strides[3]);
T* optr = (T * )out.ptr + (b2 * out.strides[2] + b3 * out.strides[3]);
- int gx, gy, i, j;
- { //scoping out unnecessary variables
- // local neighborhood indices
const int lx = threadIdx.x;
const int ly = threadIdx.y;
// global indices
- gx = blockDim.x * (blockIdx.x-b2*nBBS0) + lx;
- gy = blockDim.y * (blockIdx.y-b3*nBBS1) + ly;
-
- // offset values for pulling image to local memory
- int lx2 = lx + blockDim.x;
- int ly2 = ly + blockDim.y;
- int gx2 = gx + blockDim.x;
- int gy2 = gy + blockDim.y;
+ const int gx = blockDim.x * (blockIdx.x-b2*nBBS0) + lx;
+ const int gy = blockDim.y * (blockIdx.y-b3*nBBS1) + ly;
// pull image to local memory
- load2ShrdMem(shrdMem, iptr, lx, ly, shrdLen,
- in.dims[0], in.dims[1],
- gx-halo, gy-halo,
- in.strides[1], in.strides[0]);
- if (lx<padding) {
- load2ShrdMem(shrdMem, iptr, lx2, ly, shrdLen,
- in.dims[0], in.dims[1],
- gx2-halo, gy-halo,
- in.strides[1], in.strides[0]);
- }
- if (ly<padding) {
- load2ShrdMem(shrdMem, iptr, lx, ly2, shrdLen,
- in.dims[0], in.dims[1],
- gx-halo, gy2-halo,
- in.strides[1], in.strides[0]);
- }
- if (lx<padding && ly<padding) {
- load2ShrdMem(shrdMem, iptr, lx2, ly2, shrdLen,
- in.dims[0], in.dims[1],
- gx2-halo, gy2-halo,
- in.strides[1], in.strides[0]);
- }
- i = lx + halo;
- j = ly + halo;
+ for (int b=ly, gy2=gy; b<shrdLen1; b+=blockDim.y, gy2+=blockDim.y) {
+ // move row_set get_local_size(1) along coloumns
+ for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
+ load2ShrdMem(shrdMem, iptr, a, b, shrdLen, in.dims[0], in.dims[1],
+ gx2-halo, gy2-halo, in.strides[1], in.strides[0]);
+ }
}
+
+ int i = lx + halo;
+ int j = ly + halo;
+
__syncthreads();
const T * d_filt = (const T *)cFilter;
@@ -184,6 +162,8 @@ static __global__ void morph3DKernel(Param<T> out, CParam<T> in, int nBBS)
const int se_area = windLen*windLen;
const int shrdLen = blockDim.x + padding + 1;
+ const int shrdLen1 = blockDim.y + padding;
+ const int shrdLen2 = blockDim.z + padding;
const int shrdArea = shrdLen * (blockDim.y+padding);
// gfor batch offsets
@@ -192,76 +172,29 @@ static __global__ void morph3DKernel(Param<T> out, CParam<T> in, int nBBS)
const T* iptr = (const T *) in.ptr + (batchId * in.strides[3]);
T* optr = (T * )out.ptr + (batchId * out.strides[3]);
- int gx, gy, gz, i, j, k;
- { // scoping out unnecessary variables
const int lx = threadIdx.x;
const int ly = threadIdx.y;
const int lz = threadIdx.z;
- gx = blockDim.x * (blockIdx.x-batchId*nBBS) + lx;
- gy = blockDim.y * blockIdx.y + ly;
- gz = blockDim.z * blockIdx.z + lz;
-
- const int gx2 = gx + blockDim.x;
- const int gy2 = gy + blockDim.y;
- const int gz2 = gz + blockDim.z;
- const int lx2 = lx + blockDim.x;
- const int ly2 = ly + blockDim.y;
- const int lz2 = lz + blockDim.z;
-
- // pull volume to shared memory
- load2ShrdVolume(shrdMem, iptr, lx, ly, lz, shrdLen, shrdArea,
- in.dims[0], in.dims[1], in.dims[2],
- gx-halo, gy-halo, gz-halo,
- in.strides[2], in.strides[1], in.strides[0]);
- if (lx<padding) {
- load2ShrdVolume(shrdMem, iptr, lx2, ly, lz, shrdLen, shrdArea,
- in.dims[0], in.dims[1], in.dims[2],
- gx2-halo, gy-halo, gz-halo,
- in.strides[2], in.strides[1], in.strides[0]);
- }
- if (ly<padding) {
- load2ShrdVolume(shrdMem, iptr, lx, ly2, lz, shrdLen, shrdArea,
- in.dims[0], in.dims[1], in.dims[2],
- gx-halo, gy2-halo, gz-halo,
- in.strides[2], in.strides[1], in.strides[0]);
- }
- if (lz<padding) {
- load2ShrdVolume(shrdMem, iptr, lx, ly, lz2, shrdLen, shrdArea,
- in.dims[0], in.dims[1], in.dims[2],
- gx-halo, gy-halo, gz2-halo,
- in.strides[2], in.strides[1], in.strides[0]);
- }
- if (lx<padding && ly<padding) {
- load2ShrdVolume(shrdMem, iptr, lx2, ly2, lz, shrdLen, shrdArea,
- in.dims[0], in.dims[1], in.dims[2],
- gx2-halo, gy2-halo, gz-halo,
- in.strides[2], in.strides[1], in.strides[0]);
- }
- if (ly<padding && lz<padding) {
- load2ShrdVolume(shrdMem, iptr, lx, ly2, lz2, shrdLen, shrdArea,
- in.dims[0], in.dims[1], in.dims[2],
- gx-halo, gy2-halo, gz2-halo,
- in.strides[2], in.strides[1], in.strides[0]);
- }
- if (lz<padding && lx<padding) {
- load2ShrdVolume(shrdMem, iptr, lx2, ly, lz2, shrdLen, shrdArea,
- in.dims[0], in.dims[1], in.dims[2],
- gx2-halo, gy-halo, gz2-halo,
- in.strides[2], in.strides[1], in.strides[0]);
- }
- if (lx<padding && ly<padding && lz<padding) {
- load2ShrdVolume(shrdMem, iptr, lx2, ly2, lz2, shrdLen, shrdArea,
- in.dims[0], in.dims[1], in.dims[2],
- gx2-halo, gy2-halo, gz2-halo,
+ const int gx = blockDim.x * (blockIdx.x-batchId*nBBS) + lx;
+ const int gy = blockDim.y * blockIdx.y + ly;
+ const int gz = blockDim.z * blockIdx.z + lz;
+
+ for (int c=lz, gz2=gz; c<shrdLen2; c+=blockDim.z, gz2+=blockDim.z) {
+ for (int b=ly, gy2=gy; b<shrdLen1; b+=blockDim.y, gy2+=blockDim.y) {
+ for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
+ load2ShrdVolume(shrdMem, iptr, a, b, c, shrdLen, shrdArea,
+ in.dims[0], in.dims[1], in.dims[2], gx2-halo, gy2-halo, gz2-halo,
in.strides[2], in.strides[1], in.strides[0]);
+ }
+ }
}
+
__syncthreads();
// indices of voxel owned by current thread
- i = lx + halo;
- j = ly + halo;
- k = lz + halo;
- }
+ int i = lx + halo;
+ int j = ly + halo;
+ int k = lz + halo;
const T * d_filt = (const T *)cFilter;
T acc = shrdMem[ lIdx3D(i, j, k, shrdArea, shrdLen, 1) ];
diff --git a/src/backend/cuda/kernel/sobel.hpp b/src/backend/cuda/kernel/sobel.hpp
index 550339e..f8c9e98 100644
--- a/src/backend/cuda/kernel/sobel.hpp
+++ b/src/backend/cuda/kernel/sobel.hpp
@@ -44,6 +44,7 @@ void sobel3x3(Param<To> dx, Param<To> dy, CParam<Ti> in, int nBBS0, int nBBS1)
// calculate necessary offset and window parameters
const int radius = 1;
const int padding = 2*radius;
+ const int shrdLen = blockDim.x + padding;
// batch offsets
unsigned b2 = blockIdx.x / nBBS0;
@@ -60,31 +61,14 @@ void sobel3x3(Param<To> dx, Param<To> dy, CParam<Ti> in, int nBBS0, int nBBS1)
int gx = THREADS_X * (blockIdx.x-b2*nBBS0) + lx;
int gy = THREADS_Y * (blockIdx.y-b3*nBBS1) + ly;
- // offset values for pulling image to local memory
- int lx2 = lx + THREADS_X;
- int ly2 = ly + THREADS_Y;
- int gx2 = gx + THREADS_X;
- int gy2 = gy + THREADS_Y;
-
- // pull image to local memory
- shrdMem[lx][ly] = load2ShrdMem<Ti>(iptr, in.dims[0], in.dims[1],
- gx-radius, gy-radius,
- in.strides[1], in.strides[0]);
- if (lx<padding) {
- shrdMem[lx2][ly] = load2ShrdMem<Ti>(iptr, in.dims[0], in.dims[1],
- gx2-radius, gy-radius,
- in.strides[1], in.strides[0]);
- }
- if (ly<padding) {
- shrdMem[lx][ly2] = load2ShrdMem<Ti>(iptr, in.dims[0], in.dims[1],
- gx-radius, gy2-radius,
- in.strides[1], in.strides[0]);
- }
- if (lx<padding && ly<padding) {
- shrdMem[lx2][ly2] = load2ShrdMem<Ti>(iptr, in.dims[0], in.dims[1],
- gx2-radius, gy2-radius,
- in.strides[1], in.strides[0]);
+ for (int b=ly, gy2=gy; b<shrdLen; b+=blockDim.y, gy2+=blockDim.y) {
+ for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
+ shrdMem[a][b] = load2ShrdMem<Ti>(iptr, in.dims[0], in.dims[1],
+ gx2-radius, gy2-radius,
+ in.strides[1], in.strides[0]);
+ }
}
+
__syncthreads();
// Only continue if we're at a valid location
diff --git a/src/backend/opencl/kernel/bilateral.cl b/src/backend/opencl/kernel/bilateral.cl
index b54f809..05c5ac6 100644
--- a/src/backend/opencl/kernel/bilateral.cl
+++ b/src/backend/opencl/kernel/bilateral.cl
@@ -66,30 +66,18 @@ void bilateral(__global outType * d_dst,
gauss2d[ly*window_size+lx] = exp( ((x*x) + (y*y)) / (-2.f * variance_space));
}
- int lx2 = lx + get_local_size(0);
- int ly2 = ly + get_local_size(1);
- int gx2 = gx + get_local_size(0);
- int gy2 = gy + get_local_size(1);
-
+ int s0 = iInfo.strides[0];
+ int s1 = iInfo.strides[1];
+ int d0 = iInfo.dims[0];
+ int d1 = iInfo.dims[1];
// pull image to local memory
- load2LocalMem(localMem, in, lx, ly, shrdLen,
- iInfo.dims[0], iInfo.dims[1], gx-radius,
- gy-radius, iInfo.strides[1], iInfo.strides[0]);
- if (lx<padding) {
- load2LocalMem(localMem, in, lx2, ly, shrdLen,
- iInfo.dims[0], iInfo.dims[1], gx2-radius,
- gy-radius, iInfo.strides[1], iInfo.strides[0]);
- }
- if (ly<padding) {
- load2LocalMem(localMem, in, lx, ly2, shrdLen,
- iInfo.dims[0], iInfo.dims[1], gx-radius,
- gy2-radius, iInfo.strides[1], iInfo.strides[0]);
- }
- if (lx<padding && ly<padding) {
- load2LocalMem(localMem, in, lx2, ly2, shrdLen,
- iInfo.dims[0], iInfo.dims[1], gx2-radius,
- gy2-radius, iInfo.strides[1], iInfo.strides[0]);
+ for (int b=ly, gy2=gy; b<shrdLen; b+=get_local_size(1), gy2+=get_local_size(1)) {
+ // move row_set get_local_size(1) along coloumns
+ for (int a=lx, gx2=gx; a<shrdLen; a+=get_local_size(0), gx2+=get_local_size(0)) {
+ load2LocalMem(localMem, in, a, b, shrdLen, d0, d1, gx2-radius, gy2-radius, s1, s0);
+ }
}
+
barrier(CLK_LOCAL_MEM_FENCE);
if (gx<iInfo.dims[0] && gy<iInfo.dims[1]) {
diff --git a/src/backend/opencl/kernel/meanshift.cl b/src/backend/opencl/kernel/meanshift.cl
index ada4570..2b24de6 100644
--- a/src/backend/opencl/kernel/meanshift.cl
+++ b/src/backend/opencl/kernel/meanshift.cl
@@ -57,32 +57,22 @@ void meanshift(__global T * d_dst,
const int gx = get_local_size(0) * (get_group_id(0)-b2*nBBS0) + lx;
const int gy = get_local_size(1) * (get_group_id(1)-b3*nBBS1) + ly;
- int gx2 = gx + get_local_size(0);
- int gy2 = gy + get_local_size(1);
- int lx2 = lx + get_local_size(0);
- int ly2 = ly + get_local_size(1);
+ int s0 = iInfo.strides[0];
+ int s1 = iInfo.strides[1];
+ int d0 = iInfo.dims[0];
+ int d1 = iInfo.dims[1];
+ // pull image to local memory
+ for (int b=ly, gy2=gy; b<shrdLen; b+=get_local_size(1), gy2+=get_local_size(1)) {
+ // move row_set get_local_size(1) along coloumns
+ for (int a=lx, gx2=gx; a<shrdLen; a+=get_local_size(0), gx2+=get_local_size(0)) {
+ load2LocalMem(localMem, iptr, a, b, shrdLen, schStride, channels,
+ d0, d1, gx2-radius, gy2-radius, ichStride, s1, s0);
+ }
+ }
+
int i = lx + radius;
int j = ly + radius;
- // pull image to local memory
- load2LocalMem(localMem, iptr, lx, ly, shrdLen, schStride, channels,
- iInfo.dims[0], iInfo.dims[1], gx-radius,
- gy-radius, ichStride, iInfo.strides[1], iInfo.strides[0]);
- if (lx<wind_len) {
- load2LocalMem(localMem, iptr, lx2, ly, shrdLen, schStride, channels,
- iInfo.dims[0], iInfo.dims[1], gx2-radius,
- gy-radius, ichStride, iInfo.strides[1], iInfo.strides[0]);
- }
- if (ly<wind_len) {
- load2LocalMem(localMem, iptr, lx, ly2, shrdLen, schStride, channels,
- iInfo.dims[0], iInfo.dims[1], gx-radius,
- gy2-radius, ichStride, iInfo.strides[1], iInfo.strides[0]);
- }
- if (lx<wind_len && ly<wind_len) {
- load2LocalMem(localMem, iptr, lx2, ly2, shrdLen, schStride, channels,
- iInfo.dims[0], iInfo.dims[1], gx2-radius,
- gy2-radius, ichStride, iInfo.strides[1], iInfo.strides[0]);
- }
barrier(CLK_LOCAL_MEM_FENCE);
if (gx<iInfo.dims[0] && gy<iInfo.dims[1])
diff --git a/src/backend/opencl/kernel/medfilt.cl b/src/backend/opencl/kernel/medfilt.cl
index de541c4..78a62b2 100644
--- a/src/backend/opencl/kernel/medfilt.cl
+++ b/src/backend/opencl/kernel/medfilt.cl
@@ -64,35 +64,18 @@ void medfilt(__global T * out,
int gx = get_local_size(0) * (get_group_id(0)-b2*nBBS0) + lx;
int gy = get_local_size(1) * (get_group_id(1)-b3*nBBS1) + ly;
- // offset values for pulling image to local memory
- int lx2 = lx + get_local_size(0);
- int ly2 = ly + get_local_size(1);
- int gx2 = gx + get_local_size(0);
- int gy2 = gy + get_local_size(1);
-
+ int s0 = iInfo.strides[0];
+ int s1 = iInfo.strides[1];
+ int d0 = iInfo.dims[0];
+ int d1 = iInfo.dims[1];
// pull image to local memory
- load2ShrdMem(localMem, iptr, lx, ly, shrdLen,
- iInfo.dims[0], iInfo.dims[1],
- gx-halo, gy-halo,
- iInfo.strides[1], iInfo.strides[0]);
- if (lx<padding) {
- load2ShrdMem(localMem, iptr, lx2, ly, shrdLen,
- iInfo.dims[0], iInfo.dims[1],
- gx2-halo, gy-halo,
- iInfo.strides[1], iInfo.strides[0]);
- }
- if (ly<padding) {
- load2ShrdMem(localMem, iptr, lx, ly2, shrdLen,
- iInfo.dims[0], iInfo.dims[1],
- gx-halo, gy2-halo,
- iInfo.strides[1], iInfo.strides[0]);
- }
- if (lx<padding && ly<padding) {
- load2ShrdMem(localMem, iptr, lx2, ly2, shrdLen,
- iInfo.dims[0], iInfo.dims[1],
- gx2-halo, gy2-halo,
- iInfo.strides[1], iInfo.strides[0]);
+ for (int b=ly, gy2=gy; b<shrdLen; b+=get_local_size(1), gy2+=get_local_size(1)) {
+ // move row_set get_local_size(1) along coloumns
+ for (int a=lx, gx2=gx; a<shrdLen; a+=get_local_size(0), gx2+=get_local_size(0)) {
+ load2ShrdMem(localMem, iptr, a, b, shrdLen, d0, d1, gx2-halo, gy2-halo, s1, s0);
+ }
}
+
barrier(CLK_LOCAL_MEM_FENCE);
// Only continue if we're at a valid location
diff --git a/src/backend/opencl/kernel/morph.cl b/src/backend/opencl/kernel/morph.cl
index c92d06f..ac86fc1 100644
--- a/src/backend/opencl/kernel/morph.cl
+++ b/src/backend/opencl/kernel/morph.cl
@@ -38,7 +38,8 @@ void morph(__global T * out,
{
const int halo = windLen/2;
const int padding= 2*halo;
- const int shrdLen= get_local_size(0) + padding + 1;
+ const int shrdLen = get_local_size(0) + padding + 1;
+ const int shrdLen1= get_local_size(1) + padding;
// gfor batch offsets
int b2 = get_group_id(0) / nBBS0;
@@ -54,34 +55,14 @@ void morph(__global T * out,
int gx = get_local_size(0) * (get_group_id(0)-b2*nBBS0) + lx;
int gy = get_local_size(1) * (get_group_id(1)-b3*nBBS1) + ly;
- // offset values for pulling image to local memory
- int lx2 = lx + get_local_size(0);
- int ly2 = ly + get_local_size(1);
- int gx2 = gx + get_local_size(0);
- int gy2 = gy + get_local_size(1);
-
- // pull image to local memory
- load2LocalMem(localMem, in, lx, ly, shrdLen,
- iInfo.dims[0], iInfo.dims[1],
- gx-halo, gy-halo,
- iInfo.strides[1], iInfo.strides[0]);
- if (lx<padding) {
- load2LocalMem(localMem, in, lx2, ly, shrdLen,
- iInfo.dims[0], iInfo.dims[1],
- gx2-halo, gy-halo,
- iInfo.strides[1], iInfo.strides[0]);
- }
- if (ly<padding) {
- load2LocalMem(localMem, in, lx, ly2, shrdLen,
- iInfo.dims[0], iInfo.dims[1],
- gx-halo, gy2-halo,
- iInfo.strides[1], iInfo.strides[0]);
- }
- if (lx<padding && ly<padding) {
- load2LocalMem(localMem, in, lx2, ly2, shrdLen,
- iInfo.dims[0], iInfo.dims[1],
- gx2-halo, gy2-halo,
- iInfo.strides[1], iInfo.strides[0]);
+ int s0 = iInfo.strides[0];
+ int s1 = iInfo.strides[1];
+ int d0 = iInfo.dims[0];
+ int d1 = iInfo.dims[1];
+ for (int b=ly, gy2=gy; b<shrdLen1; b+=get_local_size(1), gy2+=get_local_size(1)) {
+ for (int a=lx, gx2=gx; a<shrdLen; a+=get_local_size(0), gx2+=get_local_size(0)) {
+ load2LocalMem(localMem, in, a, b, shrdLen, d0, d1, gx-halo, gy-halo, s1, s0);
+ }
}
int i = lx + halo;
@@ -149,6 +130,8 @@ void morph3d(__global T * out,
const int se_area = windLen*windLen;
const int shrdLen = get_local_size(0) + padding + 1;
+ const int shrdLen1 = get_local_size(1) + padding;
+ const int shrdLen2 = get_local_size(2) + padding;
const int shrdArea = shrdLen * (get_local_size(1)+padding);
// gfor batch offsets
@@ -156,76 +139,35 @@ void morph3d(__global T * out,
in += (batchId * iInfo.strides[3] + iInfo.offset);
out += (batchId * oInfo.strides[3]);
- int gx, gy, gz, i, j, k;
- { // scoping out unnecessary variables
const int lx = get_local_id(0);
const int ly = get_local_id(1);
const int lz = get_local_id(2);
- gx = get_local_size(0) * (get_group_id(0)-batchId*nBBS) + lx;
- gy = get_local_size(1) * get_group_id(1) + ly;
- gz = get_local_size(2) * get_group_id(2) + lz;
-
- const int gx2 = gx + get_local_size(0);
- const int gy2 = gy + get_local_size(1);
- const int gz2 = gz + get_local_size(2);
- const int lx2 = lx + get_local_size(0);
- const int ly2 = ly + get_local_size(1);
- const int lz2 = lz + get_local_size(2);
-
- // pull volume to shared memory
- load2LocVolume(localMem, in, lx, ly, lz, shrdLen, shrdArea,
- iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
- gx-halo, gy-halo, gz-halo,
- iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
- if (lx<padding) {
- load2LocVolume(localMem, in, lx2, ly, lz, shrdLen, shrdArea,
- iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
- gx2-halo, gy-halo, gz-halo,
- iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
- }
- if (ly<padding) {
- load2LocVolume(localMem, in, lx, ly2, lz, shrdLen, shrdArea,
- iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
- gx-halo, gy2-halo, gz-halo,
- iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
- }
- if (lz<padding) {
- load2LocVolume(localMem, in, lx, ly, lz2, shrdLen, shrdArea,
- iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
- gx-halo, gy-halo, gz2-halo,
- iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
- }
- if (lx<padding && ly<padding) {
- load2LocVolume(localMem, in, lx2, ly2, lz, shrdLen, shrdArea,
- iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
- gx2-halo, gy2-halo, gz-halo,
- iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
- }
- if (ly<padding && lz<padding) {
- load2LocVolume(localMem, in, lx, ly2, lz2, shrdLen, shrdArea,
- iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
- gx-halo, gy2-halo, gz2-halo,
- iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
- }
- if (lz<padding && lx<padding) {
- load2LocVolume(localMem, in, lx2, ly, lz2, shrdLen, shrdArea,
- iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
- gx2-halo, gy-halo, gz2-halo,
- iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
- }
- if (lx<padding && ly<padding && lz<padding) {
- load2LocVolume(localMem, in, lx2, ly2, lz2, shrdLen, shrdArea,
- iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
- gx2-halo, gy2-halo, gz2-halo,
- iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
+ const int gx = get_local_size(0) * (get_group_id(0)-batchId*nBBS) + lx;
+ const int gy = get_local_size(1) * get_group_id(1) + ly;
+ const int gz = get_local_size(2) * get_group_id(2) + lz;
+
+ int s0 = iInfo.strides[0];
+ int s1 = iInfo.strides[1];
+ int s2 = iInfo.strides[2];
+ int d0 = iInfo.dims[0];
+ int d1 = iInfo.dims[1];
+ int d2 = iInfo.dims[2];
+
+ for (int c=lz, gz2=gz; c<shrdLen2; c+=get_local_size(2), gz2+=get_local_size(2)) {
+ for (int b=ly, gy2=gy; b<shrdLen1; b+=get_local_size(1), gy2+=get_local_size(1)) {
+ for (int a=lx, gx2=gx; a<shrdLen; a+=get_local_size(0), gx2+=get_local_size(0)) {
+ load2LocVolume(localMem, in, a, b, c, shrdLen, shrdArea, d0, d1, d2,
+ gx2-halo, gy2-halo, gz2-halo, s2, s1, s0);
+ }
+ }
}
+
barrier(CLK_LOCAL_MEM_FENCE);
- // indices of voxel owned by current thread
- i = lx + halo;
- j = ly + halo;
- k = lz + halo;
- }
+
+ int i = lx + halo;
+ int j = ly + halo;
+ int k = lz + halo;
T acc = localMem[ lIdx3D(i, j, k, shrdArea, shrdLen, 1) ];
#pragma unroll
diff --git a/src/backend/opencl/kernel/sobel.cl b/src/backend/opencl/kernel/sobel.cl
index fbb5671..0e83035 100644
--- a/src/backend/opencl/kernel/sobel.cl
+++ b/src/backend/opencl/kernel/sobel.cl
@@ -41,28 +41,14 @@ void sobel3x3(global To * dx, KParam dxInfo,
int gx = get_local_size(0) * (get_group_id(0)-b2*nBBS0) + lx;
int gy = get_local_size(1) * (get_group_id(1)-b3*nBBS1) + ly;
- int lx2 = lx + get_local_size(0);
- int ly2 = ly + get_local_size(1);
- int gx2 = gx + get_local_size(0);
- int gy2 = gy + get_local_size(1);
-
- localMem[lx+shrdLen*ly] = load2LocalMem(iptr, iInfo.dims[0], iInfo.dims[1],
- gx-radius, gy-radius,
- iInfo.strides[1], iInfo.strides[0]);
- if (lx<padding) {
- localMem[lx2+shrdLen*ly] = load2LocalMem(iptr, iInfo.dims[0], iInfo.dims[1],
- gx2-radius, gy-radius,
- iInfo.strides[1], iInfo.strides[0]);
- }
- if (ly<padding) {
- localMem[lx+shrdLen*ly2] = load2LocalMem(iptr, iInfo.dims[0], iInfo.dims[1],
- gx-radius, gy2-radius,
- iInfo.strides[1], iInfo.strides[0]);
- }
- if (lx<padding && ly<padding) {
- localMem[lx2+shrdLen*ly2] = load2LocalMem(iptr, iInfo.dims[0], iInfo.dims[1],
- gx2-radius, gy2-radius,
- iInfo.strides[1], iInfo.strides[0]);
+ int s0 = iInfo.strides[0];
+ int s1 = iInfo.strides[1];
+ int d0 = iInfo.dims[0];
+ int d1 = iInfo.dims[1];
+ for (int b=ly, gy2=gy; b<shrdLen; b+=get_local_size(1), gy2+=get_local_size(1)) {
+ for (int a=lx, gx2=gx; a<shrdLen; a+=get_local_size(0), gx2+=get_local_size(0)) {
+ localMem[a+shrdLen*b] = load2LocalMem(iptr, d0, d1, gx2-radius, gy2-radius, s1, s0);
+ }
}
barrier(CLK_LOCAL_MEM_FENCE);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list