[arrayfire] 13/34: Removed unnecessary memory overhead in histogram cuda/opencl kernels
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Sun Sep 27 14:46:02 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch dfsg-clean
in repository arrayfire.
commit 2f56b39d9ded33c871110067a56d0c374bb66a11
Author: pradeep <pradeep at arrayfire.com>
Date: Mon Sep 21 17:07:09 2015 -0400
    Removed unnecessary memory overhead in histogram cuda/opencl kernels
---
src/backend/cuda/histogram.cu | 14 +-------------
src/backend/cuda/kernel/histogram.hpp | 26 +++++++-------------------
src/backend/opencl/histogram.cpp | 14 +-------------
src/backend/opencl/kernel/histogram.cl | 15 ++-------------
src/backend/opencl/kernel/histogram.hpp | 10 +++++-----
5 files changed, 16 insertions(+), 63 deletions(-)
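In essence, the change replaces a per-batch device buffer of (min, max) pairs with two scalar kernel arguments; each kernel derives the bin width from those scalars directly. A minimal sketch of the resulting binning math, using a hypothetical helper name bin_of (the real kernels inline this logic), is:

    /* Hypothetical helper illustrating the binning math after this change.
     * The bin width comes straight from the scalar minval/maxval arguments,
     * so no per-batch minmax buffer has to be allocated and copied to the
     * device any more. */
    __device__ __forceinline__ int bin_of(float value, float minval,
                                          float maxval, int nbins)
    {
        float step = (maxval - minval) / (float)nbins;  // width of one bin
        int   bin  = (int)((value - minval) / step);    // candidate bin index
        bin = (bin < 0)      ? 0           : bin;       // clamp values below minval
        bin = (bin >= nbins) ? (nbins - 1) : bin;       // clamp values at/above maxval
        return bin;
    }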
diff --git a/src/backend/cuda/histogram.cu b/src/backend/cuda/histogram.cu
index b5e470d..0f99982 100644
--- a/src/backend/cuda/histogram.cu
+++ b/src/backend/cuda/histogram.cu
@@ -32,19 +32,7 @@ Array<outType> histogram(const Array<inType> &in, const unsigned &nbins, const d
dim4 outDims = dim4(nbins, 1, dims[2], dims[3]);
Array<outType> out = createValueArray<outType>(outDims, outType(0));
- // create an array to hold min and max values for
- // batch operation handling, this will reduce
- // number of concurrent reads to one single memory location
- dim_t mmNElems= dims[2] * dims[3];
- cfloat init;
- init.x = minval;
- init.y = maxval;
- vector<cfloat> h_minmax(mmNElems, init);
-
- dim4 minmax_dims(mmNElems*2);
- Array<cfloat> minmax = createHostDataArray<cfloat>(minmax_dims, &h_minmax.front());
-
- kernel::histogram<inType, outType, isLinear>(out, in, minmax.get(), nbins);
+ kernel::histogram<inType, outType, isLinear>(out, in, nbins, minval, maxval);
return out;
}
diff --git a/src/backend/cuda/kernel/histogram.hpp b/src/backend/cuda/kernel/histogram.hpp
index 10c8b3f..5d0b1e3 100644
--- a/src/backend/cuda/kernel/histogram.hpp
+++ b/src/backend/cuda/kernel/histogram.hpp
@@ -31,8 +31,7 @@ __forceinline__ __device__ int minimum(int a, int b)
template<typename inType, typename outType, bool isLinear>
static __global__
void histogramKernel(Param<outType> out, CParam<inType> in,
- const cfloat *d_minmax, int len,
- int nbins, int nBBS)
+ int len, int nbins, float minval, float maxval, int nBBS)
{
SharedMemory<outType> shared;
outType * shrdMem = shared.getPointer();
@@ -42,20 +41,9 @@ void histogramKernel(Param<outType> out, CParam<inType> in,
const inType *iptr = in.ptr + b2 * in.strides[2] + blockIdx.y * in.strides[3];
outType *optr = out.ptr + b2 * out.strides[2] + blockIdx.y * out.strides[3];
- int start = (blockIdx.x-b2*nBBS) * THRD_LOAD * blockDim.x + threadIdx.x;
- int end = minimum((start + THRD_LOAD * blockDim.x), len);
-
- __shared__ float min;
- __shared__ float step;
-
- // offset minmax array to account for batch ops
- d_minmax += (b2 * blockIdx.x + blockIdx.y);
-
- if (threadIdx.x == 0) {
- float2 minmax = *d_minmax;
- min = minmax.x;
- step = (minmax.y-minmax.x) / (float)nbins;
- }
+ int start = (blockIdx.x-b2*nBBS) * THRD_LOAD * blockDim.x + threadIdx.x;
+ int end = minimum((start + THRD_LOAD * blockDim.x), len);
+ float step = (maxval-minval) / (float)nbins;
for (int i = threadIdx.x; i < nbins; i += blockDim.x)
shrdMem[i] = 0;
@@ -63,7 +51,7 @@ void histogramKernel(Param<outType> out, CParam<inType> in,
for (int row = start; row < end; row += blockDim.x) {
int idx = isLinear ? row : ((row % in.dims[0]) + (row / in.dims[0])*in.strides[1]);
- int bin = (int)((iptr[idx] - min) / step);
+ int bin = (int)((iptr[idx] - minval) / step);
bin = (bin < 0) ? 0 : bin;
bin = (bin >= nbins) ? (nbins-1) : bin;
atomicAdd((shrdMem + bin), 1);
@@ -76,7 +64,7 @@ void histogramKernel(Param<outType> out, CParam<inType> in,
}
template<typename inType, typename outType, bool isLinear>
-void histogram(Param<outType> out, CParam<inType> in, cfloat *d_minmax, int nbins)
+void histogram(Param<outType> out, CParam<inType> in, int nbins, float minval, float maxval)
{
dim3 threads(kernel::THREADS_X, 1);
@@ -88,7 +76,7 @@ void histogram(Param<outType> out, CParam<inType> in, cfloat *d_minmax, int nbin
int smem_size = nbins * sizeof(outType);
CUDA_LAUNCH_SMEM((histogramKernel<inType, outType, isLinear>), blocks, threads, smem_size,
- out, in, d_minmax, nElems, nbins, blk_x);
+ out, in, nElems, nbins, minval, maxval, blk_x);
POST_LAUNCH_CHECK();
}
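For context, the rewritten CUDA kernel keeps the usual shared-memory histogram structure: each block zeroes a block-local bin array, accumulates its slice of the input with atomic adds, and finally merges the local counts into the global output. A stripped-down sketch of that pattern, with hypothetical names, unsigned int counts, and none of the batching/stride handling of the real kernel, might look like:

    /* Simplified shared-memory histogram sketch (hypothetical, single linear
     * batch). Launch with nbins * sizeof(unsigned int) bytes of dynamic
     * shared memory, analogous to the smem_size computed above. */
    __global__ void simpleHistogram(unsigned int *out, const float *in, int len,
                                    int nbins, float minval, float maxval)
    {
        extern __shared__ unsigned int shrdMem[];

        // zero the block-local histogram
        for (int i = threadIdx.x; i < nbins; i += blockDim.x)
            shrdMem[i] = 0;
        __syncthreads();

        // accumulate this block's slice of the input into shared memory
        float step = (maxval - minval) / (float)nbins;
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < len;
             i += gridDim.x * blockDim.x) {
            int bin = (int)((in[i] - minval) / step);
            bin = (bin < 0) ? 0 : ((bin >= nbins) ? nbins - 1 : bin);
            atomicAdd(shrdMem + bin, 1u);
        }
        __syncthreads();

        // merge the block-local counts into the global histogram
        for (int i = threadIdx.x; i < nbins; i += blockDim.x)
            atomicAdd(out + i, shrdMem[i]);
    }

Because every block can recompute the same step from the scalar arguments, the only data the old code was moving through global memory was the (min, max) pair itself, which is why the staging buffer could be dropped.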
diff --git a/src/backend/opencl/histogram.cpp b/src/backend/opencl/histogram.cpp
index 03f095a..3a7bd72 100644
--- a/src/backend/opencl/histogram.cpp
+++ b/src/backend/opencl/histogram.cpp
@@ -31,19 +31,7 @@ Array<outType> histogram(const Array<inType> &in, const unsigned &nbins, const d
dim4 outDims = dim4(nbins, 1, dims[2], dims[3]);
Array<outType> out = createValueArray<outType>(outDims, outType(0));
- // create an array to hold min and max values for
- // batch operation handling, this will reduce
- // number of concurrent reads to one single memory location
- dim_t mmNElems= dims[2] * dims[3];
- cfloat init;
- init.s[0] = minval;
- init.s[1] = maxval;
- vector<cfloat> h_minmax(mmNElems, init);
-
- dim4 minmax_dims(mmNElems*2);
- Array<cfloat> minmax = createHostDataArray<cfloat>(minmax_dims, h_minmax.data());
-
- kernel::histogram<inType, outType, isLinear>(out, in, minmax, nbins);
+ kernel::histogram<inType, outType, isLinear>(out, in, nbins, minval, maxval);
return out;
}
diff --git a/src/backend/opencl/kernel/histogram.cl b/src/backend/opencl/kernel/histogram.cl
index 756cae6..9e1468d 100644
--- a/src/backend/opencl/kernel/histogram.cl
+++ b/src/backend/opencl/kernel/histogram.cl
@@ -12,9 +12,8 @@ void histogram(__global outType * d_dst,
KParam oInfo,
__global const inType * d_src,
KParam iInfo,
- __global const float2 * d_minmax,
__local outType * localMem,
- int len, int nbins, int nBBS)
+ int len, int nbins, float minval, float maxval, int nBBS)
{
unsigned b2 = get_group_id(0)/nBBS;
int start = (get_group_id(0)-b2*nBBS) * THRD_LOAD * get_local_size(0) + get_local_id(0);
@@ -24,17 +23,7 @@ void histogram(__global outType * d_dst,
__global const inType *in = d_src + b2 * iInfo.strides[2] + get_group_id(1) * iInfo.strides[3] + iInfo.offset;
__global outType * out = d_dst + b2 * oInfo.strides[2] + get_group_id(1) * oInfo.strides[3];
- __local float minval;
- __local float dx;
-
- // offset minmax array to account for batch ops
- __global const float2 * d_mnmx = d_minmax + (b2 * get_group_id(0) + get_group_id(1));
-
- if (get_local_id(0) == 0) {
- float2 minmax = *d_mnmx;
- minval = minmax.s0;
- dx = (minmax.s1-minmax.s0) / (float)nbins;
- }
+ float dx = (maxval-minval)/(float)nbins;
for (int i = get_local_id(0); i < nbins; i += get_local_size(0))
localMem[i] = 0;
diff --git a/src/backend/opencl/kernel/histogram.hpp b/src/backend/opencl/kernel/histogram.hpp
index 88b53b6..fb48023 100644
--- a/src/backend/opencl/kernel/histogram.hpp
+++ b/src/backend/opencl/kernel/histogram.hpp
@@ -31,7 +31,7 @@ static const int THREADS_X = 256;
static const int THRD_LOAD = 16;
template<typename inType, typename outType, bool isLinear>
-void histogram(Param out, const Param in, const Param minmax, int nbins)
+void histogram(Param out, const Param in, int nbins, float minval, float maxval)
{
try {
static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
@@ -59,8 +59,8 @@ void histogram(Param out, const Param in, const Param minmax, int nbins)
});
auto histogramOp = make_kernel<Buffer, KParam, Buffer, KParam,
- Buffer, cl::LocalSpaceArg,
- int, int, int
+ cl::LocalSpaceArg,
+ int, int, float, float, int
>(*histKernels[device]);
int nElems = in.info.dims[0]*in.info.dims[1];
@@ -71,8 +71,8 @@ void histogram(Param out, const Param in, const Param minmax, int nbins)
NDRange global(blk_x*in.info.dims[2]*THREADS_X, in.info.dims[3]);
histogramOp(EnqueueArgs(getQueue(), global, local),
- *out.data, out.info, *in.data, in.info, *minmax.data,
- cl::Local(locSize), nElems, nbins, blk_x);
+ *out.data, out.info, *in.data, in.info,
+ cl::Local(locSize), nElems, nbins, minval, maxval, blk_x);
CL_DEBUG_FINISH(getQueue());
} catch (cl::Error err) {
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits mailing list