[arrayfire] 195/408: shared/local memory loading fix

Mon Sep 21 19:11:55 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit 499dc09a005de7c865429e1ce937bfef00d14337
Author: pradeep <pradeep at arrayfire.com>
Date:   Wed Aug 5 16:13:55 2015 -0400

    shared/local memory loading fix
---
 src/backend/cuda/kernel/bilateral.hpp  |  26 ++-----
 src/backend/cuda/kernel/meanshift.hpp  |  34 +++------
 src/backend/cuda/kernel/medfilt.hpp    |  34 ++-------
 src/backend/cuda/kernel/morph.hpp      | 129 ++++++++-------------------------
 src/backend/cuda/kernel/sobel.hpp      |  32 ++------
 src/backend/opencl/kernel/bilateral.cl |  32 +++-----
 src/backend/opencl/kernel/meanshift.cl |  36 ++++-----
 src/backend/opencl/kernel/medfilt.cl   |  37 +++-------
 src/backend/opencl/kernel/morph.cl     | 128 +++++++++-----------------------
 src/backend/opencl/kernel/sobel.cl     |  30 ++------
 10 files changed, 139 insertions(+), 379 deletions(-)

diff --git a/src/backend/cuda/kernel/bilateral.hpp b/src/backend/cuda/kernel/bilateral.hpp
index d4e384b..566f1cc 100644
--- a/src/backend/cuda/kernel/bilateral.hpp
+++ b/src/backend/cuda/kernel/bilateral.hpp
@@ -90,26 +90,14 @@ void bilateralKernel(Param<outType> out, CParam<inType> in,
     }
 
     // pull image to local memory
-    load2ShrdMem<inType, outType>(localMem, iptr, lx, ly, shrdLen,
-            in.dims[0], in.dims[1], gx-radius, gy-radius, in.strides[1], in.strides[0]);
-
-    int lx2 = lx + THREADS_X;
-    int ly2 = ly + THREADS_Y;
-    int gx2 = gx + THREADS_X;
-    int gy2 = gy + THREADS_Y;
-
-    if (lx<padding) {
-        load2ShrdMem<inType, outType>(localMem, iptr, lx2, ly, shrdLen,
-                in.dims[0], in.dims[1], gx2-radius, gy-radius, in.strides[1], in.strides[0]);
-    }
-    if (ly<padding) {
-        load2ShrdMem<inType, outType>(localMem, iptr, lx, ly2, shrdLen,
-                in.dims[0], in.dims[1], gx-radius, gy2-radius, in.strides[1], in.strides[0]);
-    }
-    if (lx<padding && ly<padding) {
-        load2ShrdMem<inType, outType>(localMem, iptr, lx2, ly2, shrdLen,
-                in.dims[0], in.dims[1], gx2-radius, gy2-radius, in.strides[1], in.strides[0]);
+    for (int b=ly, gy2=gy; b<shrdLen; b+=blockDim.y, gy2+=blockDim.y) {
+        // move row_set get_local_size(1) along coloumns
+        for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
+            load2ShrdMem<inType, outType>(localMem, iptr, a, b, shrdLen, in.dims[0], in.dims[1],
+                                          gx2-radius, gy2-radius, in.strides[1], in.strides[0]);
+        }
     }
+
     __syncthreads();
 
     if (gx<in.dims[0] && gy<in.dims[1]) {
diff --git a/src/backend/cuda/kernel/meanshift.hpp b/src/backend/cuda/kernel/meanshift.hpp
index dc8c096..fef6128 100644
--- a/src/backend/cuda/kernel/meanshift.hpp
+++ b/src/backend/cuda/kernel/meanshift.hpp
@@ -64,7 +64,6 @@ void meanshiftKernel(Param<T> out, CParam<T> in,
 
     // calculate necessary offset and window parameters
     const int padding     = 2*radius + 1;
-    const int wind_len    = padding - 1;
     const int shrdLen     = blockDim.x + padding;
     const int schStride   = shrdLen*(blockDim.y + padding);
     // the variable ichStride will only effect when we have >1
@@ -84,32 +83,19 @@ void meanshiftKernel(Param<T> out, CParam<T> in,
     const int gx = blockDim.x * (blockIdx.x-b2*nBBS0) + lx;
     const int gy = blockDim.y * (blockIdx.y-b3*nBBS1) + ly;
 
-    int gx2 = gx + blockDim.x;
-    int gy2 = gy + blockDim.y;
-    int lx2 = lx + blockDim.x;
-    int ly2 = ly + blockDim.y;
+    // pull image to local memory
+    for (int b=ly, gy2=gy; b<shrdLen; b+=blockDim.y, gy2+=blockDim.y) {
+        // move row_set get_local_size(1) along coloumns
+        for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
+            load2ShrdMem<T, channels>(shrdMem, iptr, a, b, shrdLen, schStride,
+                    in.dims[0], in.dims[1], gx2-radius, gy2-radius, ichStride,
+                    in.strides[1], in.strides[0]);
+        }
+    }
+
     int i   = lx + radius;
     int j   = ly + radius;
 
-    // pull image to local memory
-    load2ShrdMem<T, channels>(shrdMem, iptr, lx, ly, shrdLen, schStride,
-                              in.dims[0], in.dims[1], gx-radius,
-                              gy-radius, ichStride, in.strides[1], in.strides[0]);
-    if (lx<wind_len) {
-        load2ShrdMem<T, channels>(shrdMem, iptr, lx2, ly, shrdLen, schStride,
-                                  in.dims[0], in.dims[1], gx2-radius,
-                                  gy-radius, ichStride, in.strides[1], in.strides[0]);
-    }
-    if (ly<wind_len) {
-        load2ShrdMem<T, channels>(shrdMem, iptr, lx, ly2, shrdLen, schStride,
-                                  in.dims[0], in.dims[1], gx-radius,
-                                  gy2-radius, ichStride, in.strides[1], in.strides[0]);
-    }
-    if (lx<wind_len && ly<wind_len) {
-        load2ShrdMem<T, channels>(shrdMem, iptr, lx2, ly2, shrdLen, schStride,
-                                  in.dims[0], in.dims[1], gx2-radius,
-                                  gy2-radius, ichStride, in.strides[1], in.strides[0]);
-    }
     __syncthreads();
 
     if (gx>=in.dims[0] || gy>=in.dims[1])
diff --git a/src/backend/cuda/kernel/medfilt.hpp b/src/backend/cuda/kernel/medfilt.hpp
index 37aa967..ac614a3 100644
--- a/src/backend/cuda/kernel/medfilt.hpp
+++ b/src/backend/cuda/kernel/medfilt.hpp
@@ -90,35 +90,15 @@ void medfilt(Param<T> out, CParam<T> in, int nBBS0, int nBBS1)
     int gx = blockDim.x * (blockIdx.x-b2*nBBS0) + lx;
     int gy = blockDim.y * (blockIdx.y-b3*nBBS1) + ly;
 
-    // offset values for pulling image to local memory
-    int lx2 = lx + blockDim.x;
-    int ly2 = ly + blockDim.y;
-    int gx2 = gx + blockDim.x;
-    int gy2 = gy + blockDim.y;
-
     // pull image to local memory
-    load2ShrdMem<T, pad>(shrdMem, iptr, lx, ly, shrdLen,
-                         in.dims[0], in.dims[1],
-                         gx-halo, gy-halo,
-                         in.strides[1], in.strides[0]);
-    if (lx<padding) {
-        load2ShrdMem<T, pad>(shrdMem, iptr, lx2, ly, shrdLen,
-                             in.dims[0], in.dims[1],
-                             gx2-halo, gy-halo,
-                             in.strides[1], in.strides[0]);
-    }
-    if (ly<padding) {
-        load2ShrdMem<T, pad>(shrdMem, iptr, lx, ly2, shrdLen,
-                             in.dims[0], in.dims[1],
-                             gx-halo, gy2-halo,
-                             in.strides[1], in.strides[0]);
-    }
-    if (lx<padding && ly<padding) {
-        load2ShrdMem<T, pad>(shrdMem, iptr, lx2, ly2, shrdLen,
-                             in.dims[0], in.dims[1],
-                             gx2-halo, gy2-halo,
-                             in.strides[1], in.strides[0]);
+    for (int b=ly, gy2=gy; b<shrdLen; b+=blockDim.y, gy2+=blockDim.y) {
+        // move row_set get_local_size(1) along coloumns
+        for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
+            load2ShrdMem<T, pad>(shrdMem, iptr, a, b, shrdLen, in.dims[0], in.dims[1],
+                    gx2-halo, gy2-halo, in.strides[1], in.strides[0]);
+        }
     }
+
     __syncthreads();
 
     // Only continue if we're at a valid location
diff --git a/src/backend/cuda/kernel/morph.hpp b/src/backend/cuda/kernel/morph.hpp
index 30bee48..be99a68 100644
--- a/src/backend/cuda/kernel/morph.hpp
+++ b/src/backend/cuda/kernel/morph.hpp
@@ -72,7 +72,8 @@ static __global__ void morphKernel(Param<T> out, CParam<T> in,
     // calculate necessary offset and window parameters
     const int halo   = windLen/2;
     const int padding= 2*halo;
-    const int shrdLen= blockDim.x + padding + 1;
+    const int shrdLen  = blockDim.x + padding + 1;
+    const int shrdLen1 = blockDim.y + padding;
 
     // gfor batch offsets
     unsigned b2 = blockIdx.x / nBBS0;
@@ -80,48 +81,25 @@ static __global__ void morphKernel(Param<T> out, CParam<T> in,
     const T* iptr    = (const T *) in.ptr + (b2 *  in.strides[2] + b3 *  in.strides[3]);
     T*       optr    = (T *      )out.ptr + (b2 * out.strides[2] + b3 * out.strides[3]);
 
-    int gx, gy, i, j;
-    { //scoping out unnecessary variables
-    // local neighborhood indices
     const int lx = threadIdx.x;
     const int ly = threadIdx.y;
 
     // global indices
-    gx = blockDim.x * (blockIdx.x-b2*nBBS0) + lx;
-    gy = blockDim.y * (blockIdx.y-b3*nBBS1) + ly;
-
-    // offset values for pulling image to local memory
-    int lx2      = lx + blockDim.x;
-    int ly2      = ly + blockDim.y;
-    int gx2      = gx + blockDim.x;
-    int gy2      = gy + blockDim.y;
+    const int gx = blockDim.x * (blockIdx.x-b2*nBBS0) + lx;
+    const int gy = blockDim.y * (blockIdx.y-b3*nBBS1) + ly;
 
     // pull image to local memory
-    load2ShrdMem(shrdMem, iptr, lx, ly, shrdLen,
-                 in.dims[0], in.dims[1],
-                 gx-halo, gy-halo,
-                 in.strides[1], in.strides[0]);
-    if (lx<padding) {
-        load2ShrdMem(shrdMem, iptr, lx2, ly, shrdLen,
-                     in.dims[0], in.dims[1],
-                     gx2-halo, gy-halo,
-                     in.strides[1], in.strides[0]);
-    }
-    if (ly<padding) {
-        load2ShrdMem(shrdMem, iptr, lx, ly2, shrdLen,
-                     in.dims[0], in.dims[1],
-                     gx-halo, gy2-halo,
-                     in.strides[1], in.strides[0]);
-    }
-    if (lx<padding && ly<padding) {
-        load2ShrdMem(shrdMem, iptr, lx2, ly2, shrdLen,
-                     in.dims[0], in.dims[1],
-                     gx2-halo, gy2-halo,
-                     in.strides[1], in.strides[0]);
-    }
-    i = lx + halo;
-    j = ly + halo;
+    for (int b=ly, gy2=gy; b<shrdLen1; b+=blockDim.y, gy2+=blockDim.y) {
+        // move row_set get_local_size(1) along coloumns
+        for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
+            load2ShrdMem(shrdMem, iptr, a, b, shrdLen, in.dims[0], in.dims[1],
+                         gx2-halo, gy2-halo, in.strides[1], in.strides[0]);
+        }
     }
+
+    int i = lx + halo;
+    int j = ly + halo;
+
     __syncthreads();
 
     const T * d_filt = (const T *)cFilter;
@@ -184,6 +162,8 @@ static __global__ void morph3DKernel(Param<T> out, CParam<T> in, int nBBS)
 
     const int se_area   = windLen*windLen;
     const int shrdLen   = blockDim.x + padding + 1;
+    const int shrdLen1  = blockDim.y + padding;
+    const int shrdLen2  = blockDim.z + padding;
     const int shrdArea  = shrdLen * (blockDim.y+padding);
 
     // gfor batch offsets
@@ -192,76 +172,29 @@ static __global__ void morph3DKernel(Param<T> out, CParam<T> in, int nBBS)
     const T* iptr    = (const T *) in.ptr + (batchId *  in.strides[3]);
     T*       optr    = (T *      )out.ptr + (batchId * out.strides[3]);
 
-    int gx, gy, gz, i, j, k;
-    { // scoping out unnecessary variables
     const int lx = threadIdx.x;
     const int ly = threadIdx.y;
     const int lz = threadIdx.z;
 
-    gx = blockDim.x * (blockIdx.x-batchId*nBBS) + lx;
-    gy = blockDim.y * blockIdx.y + ly;
-    gz = blockDim.z * blockIdx.z + lz;
-
-    const int gx2 = gx + blockDim.x;
-    const int gy2 = gy + blockDim.y;
-    const int gz2 = gz + blockDim.z;
-    const int lx2 = lx + blockDim.x;
-    const int ly2 = ly + blockDim.y;
-    const int lz2 = lz + blockDim.z;
-
-    // pull volume to shared memory
-    load2ShrdVolume(shrdMem, iptr, lx, ly, lz, shrdLen, shrdArea,
-                    in.dims[0], in.dims[1], in.dims[2],
-                    gx-halo, gy-halo, gz-halo,
-                    in.strides[2], in.strides[1], in.strides[0]);
-    if (lx<padding) {
-        load2ShrdVolume(shrdMem, iptr, lx2, ly, lz, shrdLen, shrdArea,
-                        in.dims[0], in.dims[1], in.dims[2],
-                        gx2-halo, gy-halo, gz-halo,
-                        in.strides[2], in.strides[1], in.strides[0]);
-    }
-    if (ly<padding) {
-        load2ShrdVolume(shrdMem, iptr, lx, ly2, lz, shrdLen, shrdArea,
-                        in.dims[0], in.dims[1], in.dims[2],
-                        gx-halo, gy2-halo, gz-halo,
-                        in.strides[2], in.strides[1], in.strides[0]);
-    }
-    if (lz<padding) {
-        load2ShrdVolume(shrdMem, iptr, lx, ly, lz2, shrdLen, shrdArea,
-                        in.dims[0], in.dims[1], in.dims[2],
-                        gx-halo, gy-halo, gz2-halo,
-                        in.strides[2], in.strides[1], in.strides[0]);
-    }
-    if (lx<padding && ly<padding) {
-        load2ShrdVolume(shrdMem, iptr, lx2, ly2, lz, shrdLen, shrdArea,
-                        in.dims[0], in.dims[1], in.dims[2],
-                        gx2-halo, gy2-halo, gz-halo,
-                        in.strides[2], in.strides[1], in.strides[0]);
-    }
-    if (ly<padding && lz<padding) {
-        load2ShrdVolume(shrdMem, iptr, lx, ly2, lz2, shrdLen, shrdArea,
-                        in.dims[0], in.dims[1], in.dims[2],
-                        gx-halo, gy2-halo, gz2-halo,
-                        in.strides[2], in.strides[1], in.strides[0]);
-    }
-    if (lz<padding && lx<padding) {
-        load2ShrdVolume(shrdMem, iptr, lx2, ly, lz2, shrdLen, shrdArea,
-                        in.dims[0], in.dims[1], in.dims[2],
-                        gx2-halo, gy-halo, gz2-halo,
-                        in.strides[2], in.strides[1], in.strides[0]);
-    }
-    if (lx<padding && ly<padding && lz<padding) {
-        load2ShrdVolume(shrdMem, iptr, lx2, ly2, lz2, shrdLen, shrdArea,
-                        in.dims[0], in.dims[1], in.dims[2],
-                        gx2-halo, gy2-halo, gz2-halo,
+    const int gx = blockDim.x * (blockIdx.x-batchId*nBBS) + lx;
+    const int gy = blockDim.y * blockIdx.y + ly;
+    const int gz = blockDim.z * blockIdx.z + lz;
+
+    for (int c=lz, gz2=gz; c<shrdLen2; c+=blockDim.z, gz2+=blockDim.z) {
+        for (int b=ly, gy2=gy; b<shrdLen1; b+=blockDim.y, gy2+=blockDim.y) {
+            for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
+                load2ShrdVolume(shrdMem, iptr, a, b, c, shrdLen, shrdArea,
+                        in.dims[0], in.dims[1], in.dims[2], gx2-halo, gy2-halo, gz2-halo,
                         in.strides[2], in.strides[1], in.strides[0]);
+            }
+        }
     }
+
     __syncthreads();
     // indices of voxel owned by current thread
-    i  = lx + halo;
-    j  = ly + halo;
-    k  = lz + halo;
-    }
+    int i  = lx + halo;
+    int j  = ly + halo;
+    int k  = lz + halo;
 
     const T * d_filt = (const T *)cFilter;
     T acc = shrdMem[ lIdx3D(i, j, k, shrdArea, shrdLen, 1) ];
diff --git a/src/backend/cuda/kernel/sobel.hpp b/src/backend/cuda/kernel/sobel.hpp
index 550339e..f8c9e98 100644
--- a/src/backend/cuda/kernel/sobel.hpp
+++ b/src/backend/cuda/kernel/sobel.hpp
@@ -44,6 +44,7 @@ void sobel3x3(Param<To> dx, Param<To> dy, CParam<Ti> in, int nBBS0, int nBBS1)
     // calculate necessary offset and window parameters
     const int radius  = 1;
     const int padding = 2*radius;
+    const int shrdLen = blockDim.x + padding;
 
     // batch offsets
     unsigned b2 = blockIdx.x / nBBS0;
@@ -60,31 +61,14 @@ void sobel3x3(Param<To> dx, Param<To> dy, CParam<Ti> in, int nBBS0, int nBBS1)
     int gx = THREADS_X * (blockIdx.x-b2*nBBS0) + lx;
     int gy = THREADS_Y * (blockIdx.y-b3*nBBS1) + ly;
 
-    // offset values for pulling image to local memory
-    int lx2 = lx + THREADS_X;
-    int ly2 = ly + THREADS_Y;
-    int gx2 = gx + THREADS_X;
-    int gy2 = gy + THREADS_Y;
-
-    // pull image to local memory
-    shrdMem[lx][ly] = load2ShrdMem<Ti>(iptr, in.dims[0], in.dims[1],
-                                      gx-radius, gy-radius,
-                                      in.strides[1], in.strides[0]);
-    if (lx<padding) {
-        shrdMem[lx2][ly] = load2ShrdMem<Ti>(iptr, in.dims[0], in.dims[1],
-                                           gx2-radius, gy-radius,
-                                           in.strides[1], in.strides[0]);
-    }
-    if (ly<padding) {
-        shrdMem[lx][ly2] = load2ShrdMem<Ti>(iptr, in.dims[0], in.dims[1],
-                                           gx-radius, gy2-radius,
-                                           in.strides[1], in.strides[0]);
-    }
-    if (lx<padding && ly<padding) {
-        shrdMem[lx2][ly2] = load2ShrdMem<Ti>(iptr, in.dims[0], in.dims[1],
-                                            gx2-radius, gy2-radius,
-                                            in.strides[1], in.strides[0]);
+    for (int b=ly, gy2=gy; b<shrdLen; b+=blockDim.y, gy2+=blockDim.y) {
+        for (int a=lx, gx2=gx; a<shrdLen; a+=blockDim.x, gx2+=blockDim.x) {
+            shrdMem[a][b] = load2ShrdMem<Ti>(iptr, in.dims[0], in.dims[1],
+                                             gx2-radius, gy2-radius,
+                                             in.strides[1], in.strides[0]);
+        }
     }
+
     __syncthreads();
 
     // Only continue if we're at a valid location
diff --git a/src/backend/opencl/kernel/bilateral.cl b/src/backend/opencl/kernel/bilateral.cl
index b54f809..05c5ac6 100644
--- a/src/backend/opencl/kernel/bilateral.cl
+++ b/src/backend/opencl/kernel/bilateral.cl
@@ -66,30 +66,18 @@ void bilateral(__global outType *        d_dst,
         gauss2d[ly*window_size+lx] = exp( ((x*x) + (y*y)) / (-2.f * variance_space));
     }
 
-    int lx2 = lx + get_local_size(0);
-    int ly2 = ly + get_local_size(1);
-    int gx2 = gx + get_local_size(0);
-    int gy2 = gy + get_local_size(1);
-
+    int s0 = iInfo.strides[0];
+    int s1 = iInfo.strides[1];
+    int d0 = iInfo.dims[0];
+    int d1 = iInfo.dims[1];
     // pull image to local memory
-    load2LocalMem(localMem, in, lx, ly, shrdLen,
-                 iInfo.dims[0], iInfo.dims[1], gx-radius,
-                 gy-radius, iInfo.strides[1], iInfo.strides[0]);
-    if (lx<padding) {
-        load2LocalMem(localMem, in, lx2, ly, shrdLen,
-                     iInfo.dims[0], iInfo.dims[1], gx2-radius,
-                     gy-radius, iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (ly<padding) {
-        load2LocalMem(localMem, in, lx, ly2, shrdLen,
-                     iInfo.dims[0], iInfo.dims[1], gx-radius,
-                     gy2-radius, iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (lx<padding && ly<padding) {
-        load2LocalMem(localMem, in, lx2, ly2, shrdLen,
-                     iInfo.dims[0], iInfo.dims[1], gx2-radius,
-                     gy2-radius, iInfo.strides[1], iInfo.strides[0]);
+    for (int b=ly, gy2=gy; b<shrdLen; b+=get_local_size(1), gy2+=get_local_size(1)) {
+        // move row_set get_local_size(1) along coloumns
+        for (int a=lx, gx2=gx; a<shrdLen; a+=get_local_size(0), gx2+=get_local_size(0)) {
+            load2LocalMem(localMem, in, a, b, shrdLen, d0, d1, gx2-radius, gy2-radius, s1, s0);
+        }
     }
+
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (gx<iInfo.dims[0] && gy<iInfo.dims[1]) {
diff --git a/src/backend/opencl/kernel/meanshift.cl b/src/backend/opencl/kernel/meanshift.cl
index ada4570..2b24de6 100644
--- a/src/backend/opencl/kernel/meanshift.cl
+++ b/src/backend/opencl/kernel/meanshift.cl
@@ -57,32 +57,22 @@ void meanshift(__global T *       d_dst,
     const int gx = get_local_size(0) * (get_group_id(0)-b2*nBBS0) + lx;
     const int gy = get_local_size(1) * (get_group_id(1)-b3*nBBS1) + ly;
 
-    int gx2 = gx + get_local_size(0);
-    int gy2 = gy + get_local_size(1);
-    int lx2 = lx + get_local_size(0);
-    int ly2 = ly + get_local_size(1);
+    int s0 = iInfo.strides[0];
+    int s1 = iInfo.strides[1];
+    int d0 = iInfo.dims[0];
+    int d1 = iInfo.dims[1];
+    // pull image to local memory
+    for (int b=ly, gy2=gy; b<shrdLen; b+=get_local_size(1), gy2+=get_local_size(1)) {
+        // move row_set get_local_size(1) along coloumns
+        for (int a=lx, gx2=gx; a<shrdLen; a+=get_local_size(0), gx2+=get_local_size(0)) {
+            load2LocalMem(localMem, iptr, a, b, shrdLen, schStride, channels,
+                    d0, d1, gx2-radius, gy2-radius, ichStride, s1, s0);
+        }
+    }
+
     int i   = lx + radius;
     int j   = ly + radius;
 
-    // pull image to local memory
-    load2LocalMem(localMem, iptr, lx, ly, shrdLen, schStride, channels,
-            iInfo.dims[0], iInfo.dims[1], gx-radius,
-            gy-radius, ichStride, iInfo.strides[1], iInfo.strides[0]);
-    if (lx<wind_len) {
-        load2LocalMem(localMem, iptr, lx2, ly, shrdLen, schStride, channels,
-                iInfo.dims[0], iInfo.dims[1], gx2-radius,
-                gy-radius, ichStride, iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (ly<wind_len) {
-        load2LocalMem(localMem, iptr, lx, ly2, shrdLen, schStride, channels,
-                iInfo.dims[0], iInfo.dims[1], gx-radius,
-                gy2-radius, ichStride, iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (lx<wind_len && ly<wind_len) {
-        load2LocalMem(localMem, iptr, lx2, ly2, shrdLen, schStride, channels,
-                iInfo.dims[0], iInfo.dims[1], gx2-radius,
-                gy2-radius, ichStride, iInfo.strides[1], iInfo.strides[0]);
-    }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (gx<iInfo.dims[0] && gy<iInfo.dims[1])
diff --git a/src/backend/opencl/kernel/medfilt.cl b/src/backend/opencl/kernel/medfilt.cl
index de541c4..78a62b2 100644
--- a/src/backend/opencl/kernel/medfilt.cl
+++ b/src/backend/opencl/kernel/medfilt.cl
@@ -64,35 +64,18 @@ void medfilt(__global T *       out,
     int gx = get_local_size(0) * (get_group_id(0)-b2*nBBS0) + lx;
     int gy = get_local_size(1) * (get_group_id(1)-b3*nBBS1) + ly;
 
-    // offset values for pulling image to local memory
-    int lx2 = lx + get_local_size(0);
-    int ly2 = ly + get_local_size(1);
-    int gx2 = gx + get_local_size(0);
-    int gy2 = gy + get_local_size(1);
-
+    int s0 = iInfo.strides[0];
+    int s1 = iInfo.strides[1];
+    int d0 = iInfo.dims[0];
+    int d1 = iInfo.dims[1];
     // pull image to local memory
-    load2ShrdMem(localMem, iptr, lx, ly, shrdLen,
-                 iInfo.dims[0], iInfo.dims[1],
-                 gx-halo, gy-halo,
-                 iInfo.strides[1], iInfo.strides[0]);
-    if (lx<padding) {
-        load2ShrdMem(localMem, iptr, lx2, ly, shrdLen,
-                     iInfo.dims[0], iInfo.dims[1],
-                     gx2-halo, gy-halo,
-                     iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (ly<padding) {
-        load2ShrdMem(localMem, iptr, lx, ly2, shrdLen,
-                     iInfo.dims[0], iInfo.dims[1],
-                     gx-halo, gy2-halo,
-                     iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (lx<padding && ly<padding) {
-        load2ShrdMem(localMem, iptr, lx2, ly2, shrdLen,
-                     iInfo.dims[0], iInfo.dims[1],
-                     gx2-halo, gy2-halo,
-                     iInfo.strides[1], iInfo.strides[0]);
+    for (int b=ly, gy2=gy; b<shrdLen; b+=get_local_size(1), gy2+=get_local_size(1)) {
+        // move row_set get_local_size(1) along coloumns
+        for (int a=lx, gx2=gx; a<shrdLen; a+=get_local_size(0), gx2+=get_local_size(0)) {
+            load2ShrdMem(localMem, iptr, a, b, shrdLen, d0, d1, gx2-halo, gy2-halo, s1, s0);
+        }
     }
+
     barrier(CLK_LOCAL_MEM_FENCE);
 
     // Only continue if we're at a valid location
diff --git a/src/backend/opencl/kernel/morph.cl b/src/backend/opencl/kernel/morph.cl
index c92d06f..ac86fc1 100644
--- a/src/backend/opencl/kernel/morph.cl
+++ b/src/backend/opencl/kernel/morph.cl
@@ -38,7 +38,8 @@ void morph(__global T *              out,
 {
     const int halo   = windLen/2;
     const int padding= 2*halo;
-    const int shrdLen= get_local_size(0) + padding + 1;
+    const int shrdLen = get_local_size(0) + padding + 1;
+    const int shrdLen1= get_local_size(1) + padding;
 
     // gfor batch offsets
     int b2 = get_group_id(0) / nBBS0;
@@ -54,34 +55,14 @@ void morph(__global T *              out,
     int gx = get_local_size(0) * (get_group_id(0)-b2*nBBS0) + lx;
     int gy = get_local_size(1) * (get_group_id(1)-b3*nBBS1) + ly;
 
-    // offset values for pulling image to local memory
-    int lx2      = lx + get_local_size(0);
-    int ly2      = ly + get_local_size(1);
-    int gx2      = gx + get_local_size(0);
-    int gy2      = gy + get_local_size(1);
-
-    // pull image to local memory
-    load2LocalMem(localMem, in, lx, ly, shrdLen,
-                  iInfo.dims[0], iInfo.dims[1],
-                  gx-halo, gy-halo,
-                  iInfo.strides[1], iInfo.strides[0]);
-    if (lx<padding) {
-        load2LocalMem(localMem, in, lx2, ly, shrdLen,
-                      iInfo.dims[0], iInfo.dims[1],
-                      gx2-halo, gy-halo,
-                      iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (ly<padding) {
-        load2LocalMem(localMem, in, lx, ly2, shrdLen,
-                      iInfo.dims[0], iInfo.dims[1],
-                      gx-halo, gy2-halo,
-                      iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (lx<padding && ly<padding) {
-        load2LocalMem(localMem, in, lx2, ly2, shrdLen,
-                      iInfo.dims[0], iInfo.dims[1],
-                      gx2-halo, gy2-halo,
-                      iInfo.strides[1], iInfo.strides[0]);
+    int s0 = iInfo.strides[0];
+    int s1 = iInfo.strides[1];
+    int d0 = iInfo.dims[0];
+    int d1 = iInfo.dims[1];
+    for (int b=ly, gy2=gy; b<shrdLen1; b+=get_local_size(1), gy2+=get_local_size(1)) {
+        for (int a=lx, gx2=gx; a<shrdLen; a+=get_local_size(0), gx2+=get_local_size(0)) {
+            load2LocalMem(localMem, in, a, b, shrdLen, d0, d1, gx-halo, gy-halo, s1, s0);
+        }
     }
 
     int i = lx + halo;
@@ -149,6 +130,8 @@ void morph3d(__global T *         out,
 
     const int se_area   = windLen*windLen;
     const int shrdLen   = get_local_size(0) + padding + 1;
+    const int shrdLen1  = get_local_size(1) + padding;
+    const int shrdLen2  = get_local_size(2) + padding;
     const int shrdArea  = shrdLen * (get_local_size(1)+padding);
 
     // gfor batch offsets
@@ -156,76 +139,35 @@ void morph3d(__global T *         out,
     in  += (batchId * iInfo.strides[3] + iInfo.offset);
     out += (batchId * oInfo.strides[3]);
 
-    int gx, gy, gz, i, j, k;
-    { // scoping out unnecessary variables
     const int lx = get_local_id(0);
     const int ly = get_local_id(1);
     const int lz = get_local_id(2);
 
-    gx = get_local_size(0) * (get_group_id(0)-batchId*nBBS) + lx;
-    gy = get_local_size(1) * get_group_id(1) + ly;
-    gz = get_local_size(2) * get_group_id(2) + lz;
-
-    const int gx2 = gx + get_local_size(0);
-    const int gy2 = gy + get_local_size(1);
-    const int gz2 = gz + get_local_size(2);
-    const int lx2 = lx + get_local_size(0);
-    const int ly2 = ly + get_local_size(1);
-    const int lz2 = lz + get_local_size(2);
-
-    // pull volume to shared memory
-    load2LocVolume(localMem, in, lx, ly, lz, shrdLen, shrdArea,
-                    iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
-                    gx-halo, gy-halo, gz-halo,
-                    iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
-    if (lx<padding) {
-        load2LocVolume(localMem, in, lx2, ly, lz, shrdLen, shrdArea,
-                       iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
-                       gx2-halo, gy-halo, gz-halo,
-                       iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (ly<padding) {
-        load2LocVolume(localMem, in, lx, ly2, lz, shrdLen, shrdArea,
-                       iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
-                       gx-halo, gy2-halo, gz-halo,
-                       iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (lz<padding) {
-        load2LocVolume(localMem, in, lx, ly, lz2, shrdLen, shrdArea,
-                       iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
-                       gx-halo, gy-halo, gz2-halo,
-                       iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (lx<padding && ly<padding) {
-        load2LocVolume(localMem, in, lx2, ly2, lz, shrdLen, shrdArea,
-                       iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
-                       gx2-halo, gy2-halo, gz-halo,
-                       iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (ly<padding && lz<padding) {
-        load2LocVolume(localMem, in, lx, ly2, lz2, shrdLen, shrdArea,
-                       iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
-                       gx-halo, gy2-halo, gz2-halo,
-                       iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (lz<padding && lx<padding) {
-        load2LocVolume(localMem, in, lx2, ly, lz2, shrdLen, shrdArea,
-                       iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
-                       gx2-halo, gy-halo, gz2-halo,
-                       iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (lx<padding && ly<padding && lz<padding) {
-        load2LocVolume(localMem, in, lx2, ly2, lz2, shrdLen, shrdArea,
-                       iInfo.dims[0], iInfo.dims[1], iInfo.dims[2],
-                       gx2-halo, gy2-halo, gz2-halo,
-                       iInfo.strides[2], iInfo.strides[1], iInfo.strides[0]);
+    const int gx = get_local_size(0) * (get_group_id(0)-batchId*nBBS) + lx;
+    const int gy = get_local_size(1) * get_group_id(1) + ly;
+    const int gz = get_local_size(2) * get_group_id(2) + lz;
+
+    int s0 = iInfo.strides[0];
+    int s1 = iInfo.strides[1];
+    int s2 = iInfo.strides[2];
+    int d0 = iInfo.dims[0];
+    int d1 = iInfo.dims[1];
+    int d2 = iInfo.dims[2];
+
+    for (int c=lz, gz2=gz; c<shrdLen2; c+=get_local_size(2), gz2+=get_local_size(2)) {
+        for (int b=ly, gy2=gy; b<shrdLen1; b+=get_local_size(1), gy2+=get_local_size(1)) {
+            for (int a=lx, gx2=gx; a<shrdLen; a+=get_local_size(0), gx2+=get_local_size(0)) {
+                load2LocVolume(localMem, in, a, b, c, shrdLen, shrdArea, d0, d1, d2,
+                               gx2-halo, gy2-halo, gz2-halo, s2, s1, s0);
+            }
+        }
     }
+
     barrier(CLK_LOCAL_MEM_FENCE);
-    // indices of voxel owned by current thread
-    i  = lx + halo;
-    j  = ly + halo;
-    k  = lz + halo;
-    }
+
+    int i  = lx + halo;
+    int j  = ly + halo;
+    int k  = lz + halo;
 
     T acc = localMem[ lIdx3D(i, j, k, shrdArea, shrdLen, 1) ];
 #pragma unroll
diff --git a/src/backend/opencl/kernel/sobel.cl b/src/backend/opencl/kernel/sobel.cl
index fbb5671..0e83035 100644
--- a/src/backend/opencl/kernel/sobel.cl
+++ b/src/backend/opencl/kernel/sobel.cl
@@ -41,28 +41,14 @@ void sobel3x3(global To * dx, KParam dxInfo,
     int gx = get_local_size(0) * (get_group_id(0)-b2*nBBS0) + lx;
     int gy = get_local_size(1) * (get_group_id(1)-b3*nBBS1) + ly;
 
-    int lx2 = lx + get_local_size(0);
-    int ly2 = ly + get_local_size(1);
-    int gx2 = gx + get_local_size(0);
-    int gy2 = gy + get_local_size(1);
-
-    localMem[lx+shrdLen*ly] = load2LocalMem(iptr, iInfo.dims[0], iInfo.dims[1],
-                                   gx-radius, gy-radius,
-                                   iInfo.strides[1], iInfo.strides[0]);
-    if (lx<padding) {
-        localMem[lx2+shrdLen*ly] = load2LocalMem(iptr, iInfo.dims[0], iInfo.dims[1],
-                                        gx2-radius, gy-radius,
-                                        iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (ly<padding) {
-        localMem[lx+shrdLen*ly2] = load2LocalMem(iptr, iInfo.dims[0], iInfo.dims[1],
-                                        gx-radius, gy2-radius,
-                                        iInfo.strides[1], iInfo.strides[0]);
-    }
-    if (lx<padding && ly<padding) {
-        localMem[lx2+shrdLen*ly2] = load2LocalMem(iptr, iInfo.dims[0], iInfo.dims[1],
-                                         gx2-radius, gy2-radius,
-                                         iInfo.strides[1], iInfo.strides[0]);
+    int s0 = iInfo.strides[0];
+    int s1 = iInfo.strides[1];
+    int d0 = iInfo.dims[0];
+    int d1 = iInfo.dims[1];
+    for (int b=ly, gy2=gy; b<shrdLen; b+=get_local_size(1), gy2+=get_local_size(1)) {
+        for (int a=lx, gx2=gx; a<shrdLen; a+=get_local_size(0), gx2+=get_local_size(0)) {
+            localMem[a+shrdLen*b] = load2LocalMem(iptr, d0, d1, gx2-radius, gy2-radius, s1, s0);
+        }
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git