[arrayfire] 13/34: Removed unnecessary memory overhead in histogram cuda/opencl kernels
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Sun Sep 27 14:46:02 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch dfsg-clean
in repository arrayfire.
commit 2f56b39d9ded33c871110067a56d0c374bb66a11
Author: pradeep <pradeep at arrayfire.com>
Date: Mon Sep 21 17:07:09 2015 -0400
    Removed unnecessary memory overhead in histogram cuda/opencl kernels
---
src/backend/cuda/histogram.cu | 14 +-------------
src/backend/cuda/kernel/histogram.hpp | 26 +++++++-------------------
src/backend/opencl/histogram.cpp | 14 +-------------
src/backend/opencl/kernel/histogram.cl | 15 ++-------------
src/backend/opencl/kernel/histogram.hpp | 10 +++++-----
5 files changed, 16 insertions(+), 63 deletions(-)
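In essence, the change replaces a per-batch device buffer of (min, max) pairs with two scalar kernel arguments; each kernel derives the bin width from those scalars directly. A minimal sketch of the resulting binning math, using a hypothetical helper name bin_of (the real kernels inline this logic), is:

    /* Hypothetical helper illustrating the binning math after this change.
     * The bin width comes straight from the scalar minval/maxval arguments,
     * so no per-batch minmax buffer has to be allocated and copied to the
     * device any more. */
    __device__ __forceinline__ int bin_of(float value, float minval,
                                          float maxval, int nbins)
    {
        float step = (maxval - minval) / (float)nbins;  // width of one bin
        int   bin  = (int)((value - minval) / step);    // candidate bin index
        bin = (bin < 0)      ? 0           : bin;       // clamp values below minval
        bin = (bin >= nbins) ? (nbins - 1) : bin;       // clamp values at/above maxval
        return bin;
    }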
diff --git a/src/backend/cuda/histogram.cu b/src/backend/cuda/histogram.cu
index b5e470d..0f99982 100644
--- a/src/backend/cuda/histogram.cu
+++ b/src/backend/cuda/histogram.cu
@@ -32,19 +32,7 @@ Array<outType> histogram(const Array<inType> &in, const unsigned &nbins, const d
dim4 outDims = dim4(nbins, 1, dims[2], dims[3]);
Array<outType> out = createValueArray<outType>(outDims, outType(0));
- // create an array to hold min and max values for
- // batch operation handling, this will reduce
- // number of concurrent reads to one single memory location
- dim_t mmNElems= dims[2] * dims[3];
- cfloat init;
- init.x = minval;
- init.y = maxval;
- vector<cfloat> h_minmax(mmNElems, init);
-
- dim4 minmax_dims(mmNElems*2);
- Array<cfloat> minmax = createHostDataArray<cfloat>(minmax_dims, &h_minmax.front());
-
- kernel::histogram<inType, outType, isLinear>(out, in, minmax.get(), nbins);
+ kernel::histogram<inType, outType, isLinear>(out, in, nbins, minval, maxval);
return out;
}
diff --git a/src/backend/cuda/kernel/histogram.hpp b/src/backend/cuda/kernel/histogram.hpp
index 10c8b3f..5d0b1e3 100644
--- a/src/backend/cuda/kernel/histogram.hpp
+++ b/src/backend/cuda/kernel/histogram.hpp
@@ -31,8 +31,7 @@ __forceinline__ __device__ int minimum(int a, int b)
template<typename inType, typename outType, bool isLinear>
static __global__
void histogramKernel(Param<outType> out, CParam<inType> in,
- const cfloat *d_minmax, int len,
- int nbins, int nBBS)
+ int len, int nbins, float minval, float maxval, int nBBS)
{
SharedMemory<outType> shared;
outType * shrdMem = shared.getPointer();
@@ -42,20 +41,9 @@ void histogramKernel(Param<outType> out, CParam<inType> in,
const inType *iptr = in.ptr + b2 * in.strides[2] + blockIdx.y * in.strides[3];
outType *optr = out.ptr + b2 * out.strides[2] + blockIdx.y * out.strides[3];
- int start = (blockIdx.x-b2*nBBS) * THRD_LOAD * blockDim.x + threadIdx.x;
- int end = minimum((start + THRD_LOAD * blockDim.x), len);
-
- __shared__ float min;
- __shared__ float step;
-
- // offset minmax array to account for batch ops
- d_minmax += (b2 * blockIdx.x + blockIdx.y);
-
- if (threadIdx.x == 0) {
- float2 minmax = *d_minmax;
- min = minmax.x;
- step = (minmax.y-minmax.x) / (float)nbins;
- }
+ int start = (blockIdx.x-b2*nBBS) * THRD_LOAD * blockDim.x + threadIdx.x;
+ int end = minimum((start + THRD_LOAD * blockDim.x), len);
+ float step = (maxval-minval) / (float)nbins;
for (int i = threadIdx.x; i < nbins; i += blockDim.x)
shrdMem[i] = 0;
@@ -63,7 +51,7 @@ void histogramKernel(Param<outType> out, CParam<inType> in,
for (int row = start; row < end; row += blockDim.x) {
int idx = isLinear ? row : ((row % in.dims[0]) + (row / in.dims[0])*in.strides[1]);
- int bin = (int)((iptr[idx] - min) / step);
+ int bin = (int)((iptr[idx] - minval) / step);
bin = (bin < 0) ? 0 : bin;
bin = (bin >= nbins) ? (nbins-1) : bin;
atomicAdd((shrdMem + bin), 1);
@@ -76,7 +64,7 @@ void histogramKernel(Param<outType> out, CParam<inType> in,
}
template<typename inType, typename outType, bool isLinear>
-void histogram(Param<outType> out, CParam<inType> in, cfloat *d_minmax, int nbins)
+void histogram(Param<outType> out, CParam<inType> in, int nbins, float minval, float maxval)
{
dim3 threads(kernel::THREADS_X, 1);
@@ -88,7 +76,7 @@ void histogram(Param<outType> out, CParam<inType> in, cfloat *d_minmax, int nbin
int smem_size = nbins * sizeof(outType);
CUDA_LAUNCH_SMEM((histogramKernel<inType, outType, isLinear>), blocks, threads, smem_size,
- out, in, d_minmax, nElems, nbins, blk_x);
+ out, in, nElems, nbins, minval, maxval, blk_x);
POST_LAUNCH_CHECK();
}
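For context, the rewritten CUDA kernel keeps the usual shared-memory histogram structure: each block zeroes a block-local bin array, accumulates its slice of the input with atomic adds, and finally merges the local counts into the global output. A stripped-down sketch of that pattern, with hypothetical names, unsigned int counts, and none of the batching/stride handling of the real kernel, might look like:

    /* Simplified shared-memory histogram sketch (hypothetical, single linear
     * batch). Launch with nbins * sizeof(unsigned int) bytes of dynamic
     * shared memory, analogous to the smem_size computed above. */
    __global__ void simpleHistogram(unsigned int *out, const float *in, int len,
                                    int nbins, float minval, float maxval)
    {
        extern __shared__ unsigned int shrdMem[];

        // zero the block-local histogram
        for (int i = threadIdx.x; i < nbins; i += blockDim.x)
            shrdMem[i] = 0;
        __syncthreads();

        // accumulate this block's slice of the input into shared memory
        float step = (maxval - minval) / (float)nbins;
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < len;
             i += gridDim.x * blockDim.x) {
            int bin = (int)((in[i] - minval) / step);
            bin = (bin < 0) ? 0 : ((bin >= nbins) ? nbins - 1 : bin);
            atomicAdd(shrdMem + bin, 1u);
        }
        __syncthreads();

        // merge the block-local counts into the global histogram
        for (int i = threadIdx.x; i < nbins; i += blockDim.x)
            atomicAdd(out + i, shrdMem[i]);
    }

Because every block can recompute the same step from the scalar arguments, the only data the old code was moving through global memory was the (min, max) pair itself, which is why the staging buffer could be dropped.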
diff --git a/src/backend/opencl/histogram.cpp b/src/backend/opencl/histogram.cpp
index 03f095a..3a7bd72 100644
--- a/src/backend/opencl/histogram.cpp
+++ b/src/backend/opencl/histogram.cpp
@@ -31,19 +31,7 @@ Array<outType> histogram(const Array<inType> &in, const unsigned &nbins, const d
dim4 outDims = dim4(nbins, 1, dims[2], dims[3]);
Array<outType> out = createValueArray<outType>(outDims, outType(0));
- // create an array to hold min and max values for
- // batch operation handling, this will reduce
- // number of concurrent reads to one single memory location
- dim_t mmNElems= dims[2] * dims[3];
- cfloat init;
- init.s[0] = minval;
- init.s[1] = maxval;
- vector<cfloat> h_minmax(mmNElems, init);
-
- dim4 minmax_dims(mmNElems*2);
- Array<cfloat> minmax = createHostDataArray<cfloat>(minmax_dims, h_minmax.data());
-
- kernel::histogram<inType, outType, isLinear>(out, in, minmax, nbins);
+ kernel::histogram<inType, outType, isLinear>(out, in, nbins, minval, maxval);
return out;
}
diff --git a/src/backend/opencl/kernel/histogram.cl b/src/backend/opencl/kernel/histogram.cl
index 756cae6..9e1468d 100644
--- a/src/backend/opencl/kernel/histogram.cl
+++ b/src/backend/opencl/kernel/histogram.cl
@@ -12,9 +12,8 @@ void histogram(__global outType * d_dst,
KParam oInfo,
__global const inType * d_src,
KParam iInfo,
- __global const float2 * d_minmax,
__local outType * localMem,
- int len, int nbins, int nBBS)
+ int len, int nbins, float minval, float maxval, int nBBS)
{
unsigned b2 = get_group_id(0)/nBBS;
int start = (get_group_id(0)-b2*nBBS) * THRD_LOAD * get_local_size(0) + get_local_id(0);
@@ -24,17 +23,7 @@ void histogram(__global outType * d_dst,
__global const inType *in = d_src + b2 * iInfo.strides[2] + get_group_id(1) * iInfo.strides[3] + iInfo.offset;
__global outType * out = d_dst + b2 * oInfo.strides[2] + get_group_id(1) * oInfo.strides[3];
- __local float minval;
- __local float dx;
-
- // offset minmax array to account for batch ops
- __global const float2 * d_mnmx = d_minmax + (b2 * get_group_id(0) + get_group_id(1));
-
- if (get_local_id(0) == 0) {
- float2 minmax = *d_mnmx;
- minval = minmax.s0;
- dx = (minmax.s1-minmax.s0) / (float)nbins;
- }
+ float dx = (maxval-minval)/(float)nbins;
for (int i = get_local_id(0); i < nbins; i += get_local_size(0))
localMem[i] = 0;
diff --git a/src/backend/opencl/kernel/histogram.hpp b/src/backend/opencl/kernel/histogram.hpp
index 88b53b6..fb48023 100644
--- a/src/backend/opencl/kernel/histogram.hpp
+++ b/src/backend/opencl/kernel/histogram.hpp
@@ -31,7 +31,7 @@ static const int THREADS_X = 256;
static const int THRD_LOAD = 16;
template<typename inType, typename outType, bool isLinear>
-void histogram(Param out, const Param in, const Param minmax, int nbins)
+void histogram(Param out, const Param in, int nbins, float minval, float maxval)
{
try {
static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
@@ -59,8 +59,8 @@ void histogram(Param out, const Param in, const Param minmax, int nbins)
});
auto histogramOp = make_kernel<Buffer, KParam, Buffer, KParam,
- Buffer, cl::LocalSpaceArg,
- int, int, int
+ cl::LocalSpaceArg,
+ int, int, float, float, int
>(*histKernels[device]);
int nElems = in.info.dims[0]*in.info.dims[1];
@@ -71,8 +71,8 @@ void histogram(Param out, const Param in, const Param minmax, int nbins)
NDRange global(blk_x*in.info.dims[2]*THREADS_X, in.info.dims[3]);
histogramOp(EnqueueArgs(getQueue(), global, local),
- *out.data, out.info, *in.data, in.info, *minmax.data,
- cl::Local(locSize), nElems, nbins, blk_x);
+ *out.data, out.info, *in.data, in.info,
+ cl::Local(locSize), nElems, nbins, minval, maxval, blk_x);
CL_DEBUG_FINISH(getQueue());
} catch (cl::Error err) {
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits mailing list