[pyfr] 57/88: Improve the performance of axnpby.
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Wed Nov 16 12:05:29 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository pyfr.
commit a0f00ccabf69f718969a167e0159cace5280ae49
Author: Freddie Witherden <freddie at witherden.org>
Date: Sun Jul 3 14:06:34 2016 -0700
Improve the performance of axnpby.
---
pyfr/backends/cuda/blasext.py | 2 +-
pyfr/backends/cuda/kernels/axnpby.mako | 45 +++++++++++++++----------------
pyfr/backends/opencl/blasext.py | 2 +-
pyfr/backends/opencl/kernels/axnpby.mako | 46 +++++++++++++++-----------------
4 files changed, 46 insertions(+), 49 deletions(-)
diff --git a/pyfr/backends/cuda/blasext.py b/pyfr/backends/cuda/blasext.py
index 96b94fe..d2a406e 100644
--- a/pyfr/backends/cuda/blasext.py
+++ b/pyfr/backends/cuda/blasext.py
@@ -30,7 +30,7 @@ class CUDABlasExtKernels(CUDAKernelProvider):
# Determine the grid/block
block = (128, 1, 1)
- grid = get_grid_for_block(block, ncolb)
+ grid = get_grid_for_block(block, ncolb, nrow)
class AxnpbyKernel(ComputeKernel):
def run(self, queue, *consts):
diff --git a/pyfr/backends/cuda/kernels/axnpby.mako b/pyfr/backends/cuda/kernels/axnpby.mako
index c19f273..2515bea 100644
--- a/pyfr/backends/cuda/kernels/axnpby.mako
+++ b/pyfr/backends/cuda/kernels/axnpby.mako
@@ -3,35 +3,34 @@
<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
__global__ void
-axnpby(int nrow, int ncolb, int ldim,
- ${', '.join('fpdtype_t* __restrict__ x' + str(i) for i in range(nv))},
+axnpby(int nrow, int ncolb, int ldim, fpdtype_t* __restrict__ x0,
+ ${', '.join('const fpdtype_t* __restrict__ x' + str(i)
+ for i in range(1, nv))},
${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
{
+ int i = blockIdx.y*blockDim.y + threadIdx.y;
int j = blockIdx.x*blockDim.x + threadIdx.x;
int idx;
if (j < ncolb && a0 == 0.0)
- for (int i = 0; i < nrow; ++i)
- {
- % for k in subdims:
- idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
- x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
- % endfor
- }
+ {
+ % for k in subdims:
+ idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
+ x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ % endfor
+ }
else if (j < ncolb && a0 == 1.0)
- for (int i = 0; i < nrow; ++i)
- {
- % for k in subdims:
- idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
- x0[idx] += ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
- % endfor
- }
+ {
+ % for k in subdims:
+ idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
+ x0[idx] += ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ % endfor
+ }
else if (j < ncolb)
- for (int i = 0; i < nrow; ++i)
- {
- % for k in subdims:
- idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
- x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=nv)};
- % endfor
- }
+ {
+ % for k in subdims:
+ idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
+ x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=nv)};
+ % endfor
+ }
}
diff --git a/pyfr/backends/opencl/blasext.py b/pyfr/backends/opencl/blasext.py
index f388770..e073ad2 100644
--- a/pyfr/backends/opencl/blasext.py
+++ b/pyfr/backends/opencl/blasext.py
@@ -31,7 +31,7 @@ class OpenCLBlasExtKernels(OpenCLKernelProvider):
class AxnpbyKernel(ComputeKernel):
def run(self, queue, *consts):
args = [x.data for x in arr] + list(consts)
- kern(queue.cl_queue_comp, (ncolb,), None, nrow, ncolb,
+ kern(queue.cl_queue_comp, (ncolb, nrow), None, nrow, ncolb,
ldim, *args)
return AxnpbyKernel()
diff --git a/pyfr/backends/opencl/kernels/axnpby.mako b/pyfr/backends/opencl/kernels/axnpby.mako
index 14c346e..731410d 100644
--- a/pyfr/backends/opencl/kernels/axnpby.mako
+++ b/pyfr/backends/opencl/kernels/axnpby.mako
@@ -3,35 +3,33 @@
<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
__kernel void
-axnpby(int nrow, int ncolb, int ldim,
- ${', '.join('__global fpdtype_t* restrict x' + str(i) for i in range(nv))},
+axnpby(int nrow, int ncolb, int ldim, __global fpdtype_t* restrict x0,
+ ${', '.join('__global const fpdtype_t* restrict x' + str(i)
+ for i in range(1, nv))},
${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
{
- int j = get_global_id(0);
+ int i = get_global_id(1), j = get_global_id(0);
int idx;
if (j < ncolb && a0 == 0.0)
- for (int i = 0; i < nrow; ++i)
- {
- % for k in subdims:
- idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
- x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
- % endfor
- }
+ {
+ % for k in subdims:
+ idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
+ x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ % endfor
+ }
else if (j < ncolb && a0 == 1.0)
- for (int i = 0; i < nrow; ++i)
- {
- % for k in subdims:
- idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
- x0[idx] += ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
- % endfor
- }
+ {
+ % for k in subdims:
+ idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
+ x0[idx] += ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ % endfor
+ }
else if (j < ncolb)
- for (int i = 0; i < nrow; ++i)
- {
- % for k in subdims:
- idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
- x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=nv)};
- % endfor
- }
+ {
+ % for k in subdims:
+ idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
+ x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=nv)};
+ % endfor
+ }
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/pyfr.git
More information about the debian-science-commits mailing list