[pyfr] 58/88: Improve the performance of OpenCL and CUDA pointwise kernels.
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Wed Nov 16 12:05:29 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository pyfr.
commit 00baa28cd13ce580789e0b8037d1272141857779
Author: Freddie Witherden <freddie at witherden.org>
Date: Sun Jul 3 17:11:52 2016 -0700
Improve the performance of OpenCL and CUDA pointwise kernels.
---
pyfr/backends/cuda/generator.py | 13 ++++++++-----
pyfr/backends/cuda/provider.py | 8 ++++----
pyfr/backends/opencl/generator.py | 12 +++++++-----
pyfr/backends/opencl/provider.py | 6 +++++-
4 files changed, 24 insertions(+), 15 deletions(-)
diff --git a/pyfr/backends/cuda/generator.py b/pyfr/backends/cuda/generator.py
index c2e7e28..66c2fab 100644
--- a/pyfr/backends/cuda/generator.py
+++ b/pyfr/backends/cuda/generator.py
@@ -9,21 +9,24 @@ class CUDAKernelGenerator(BaseKernelGenerator):
# Specialise
if self.ndim == 1:
+ self._ix = 'int _x = blockIdx.x*blockDim.x + threadIdx.x;'
self._limits = 'if (_x < _nx)'
else:
- self._limits = 'for (int _y = 0; _y < _ny && _x < _nx; ++_y)'
+ self._ix = ('int _x = blockIdx.x*blockDim.x + threadIdx.x;'
+ 'int _y = blockIdx.y*blockDim.y + threadIdx.y;')
+ self._limits = 'if (_x < _nx && _y < _ny)'
def render(self):
# Kernel spec
spec = self._render_spec()
- # Iteration limits (if statement/for loop)
- limits = self._limits
+ # Iteration indicies and limits
+ ix, limits = self._ix, self._limits
# Combine
return '''{spec}
{{
- int _x = blockIdx.x*blockDim.x + threadIdx.x;
+ {ix}
#define X_IDX (_x)
#define X_IDX_AOSOA(v, nv) SOA_IX(X_IDX, v, nv)
{limits}
@@ -32,7 +35,7 @@ class CUDAKernelGenerator(BaseKernelGenerator):
}}
#undef X_IDX
#undef X_IDX_AOSOA
- }}'''.format(spec=spec, limits=limits, body=self.body)
+ }}'''.format(spec=spec, ix=ix, limits=limits, body=self.body)
def _render_spec(self):
# We first need the argument list; starting with the dimensions
diff --git a/pyfr/backends/cuda/provider.py b/pyfr/backends/cuda/provider.py
index bf86f0b..46fc5a7 100644
--- a/pyfr/backends/cuda/provider.py
+++ b/pyfr/backends/cuda/provider.py
@@ -9,8 +9,8 @@ from pyfr.util import memoize
def get_grid_for_block(block, nrow, ncol=1):
- return ((nrow + (-nrow % block[0])) // block[0],
- (ncol + (-ncol % block[1])) // block[1])
+ return (int((nrow + (-nrow % block[0])) // block[0]),
+ int((ncol + (-ncol % block[1])) // block[1]))
class CUDAKernelProvider(BaseKernelProvider):
@@ -34,8 +34,8 @@ class CUDAPointwiseKernelProvider(CUDAKernelProvider,
def _instantiate_kernel(self, dims, fun, arglst):
# Determine the grid/block
- block = (128, 1, 1)
- grid = get_grid_for_block(block, dims[-1])
+ block = (128, 2, 1) if len(dims) == 2 else (16, 1, 1)
+ grid = get_grid_for_block(block, *dims[::-1])
class PointwiseKernel(ComputeKernel):
def run(self, queue, **kwargs):
diff --git a/pyfr/backends/opencl/generator.py b/pyfr/backends/opencl/generator.py
index d6e7d1e..e620c86 100644
--- a/pyfr/backends/opencl/generator.py
+++ b/pyfr/backends/opencl/generator.py
@@ -9,21 +9,23 @@ class OpenCLKernelGenerator(BaseKernelGenerator):
# Specialise
if self.ndim == 1:
+ self._ix = 'int _x = get_global_id(0);'
self._limits = 'if (_x < _nx)'
else:
- self._limits = 'for (int _y = 0; _y < _ny && _x < _nx; ++_y)'
+ self._ix = 'int _x = get_global_id(0), _y = get_global_id(1);'
+ self._limits = 'if (_x < _nx && _y < _ny)'
def render(self):
# Kernel spec
spec = self._render_spec()
- # Iteration limits (if statement/for loop)
- limits = self._limits
+ # Iteration indicies and limits
+ ix, limits = self._ix, self._limits
# Combine
return '''{spec}
{{
- int _x = get_global_id(0);
+ {ix}
#define X_IDX (_x)
#define X_IDX_AOSOA(v, nv) SOA_IX(X_IDX, v, nv)
{limits}
@@ -32,7 +34,7 @@ class OpenCLKernelGenerator(BaseKernelGenerator):
}}
#undef X_IDX
#undef X_IDX_AOSOA
- }}'''.format(spec=spec, limits=limits, body=self.body)
+ }}'''.format(spec=spec, ix=ix, limits=limits, body=self.body)
def _render_spec(self):
# We first need the argument list; starting with the dimensions
diff --git a/pyfr/backends/opencl/provider.py b/pyfr/backends/opencl/provider.py
index 0d16355..2ffa864 100644
--- a/pyfr/backends/opencl/provider.py
+++ b/pyfr/backends/opencl/provider.py
@@ -31,11 +31,15 @@ class OpenCLPointwiseKernelProvider(OpenCLKernelProvider,
kernel_generator_cls = generator.OpenCLKernelGenerator
def _instantiate_kernel(self, dims, fun, arglst):
+ # Global and local sizes
+ gs = tuple(dims[::-1])
+ ls = (128, 2) if len(dims) == 2 else (16,)
+
class PointwiseKernel(ComputeKernel):
def run(self, queue, **kwargs):
kwargs = {k: float(v) for k, v in kwargs.items()}
narglst = [kwargs.get(ka, ka) for ka in arglst]
narglst = [getattr(arg, 'data', arg) for arg in narglst]
- fun(queue.cl_queue_comp, (dims[-1],), None, *narglst)
+ fun(queue.cl_queue_comp, gs, ls, *narglst)
return PointwiseKernel()
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/pyfr.git
More information about the debian-science-commits mailing list