[pyfr] 01/88: Rework axnpby to operate on matrices and add support for masking.
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Wed Nov 16 12:05:23 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository pyfr.
commit a07bef79a1f3a044bb7109472785f1c2ff6d61d1
Author: Freddie Witherden <freddie at witherden.org>
Date: Wed Feb 3 21:34:31 2016 +0000
Rework axnpby to operate on matrices and add support for masking.
---
pyfr/backends/cuda/blasext.py | 33 ++++++++++++---------
pyfr/backends/cuda/kernels/axnpby.mako | 41 +++++++++++++++-----------
pyfr/backends/opencl/blasext.py | 22 +++++++-------
pyfr/backends/opencl/kernels/axnpby.mako | 39 +++++++++++++++----------
pyfr/backends/openmp/blasext.py | 22 ++++++++------
pyfr/backends/openmp/kernels/axnpby.mako | 49 +++++++++++++++-----------------
6 files changed, 115 insertions(+), 91 deletions(-)
diff --git a/pyfr/backends/cuda/blasext.py b/pyfr/backends/cuda/blasext.py
index 812f02f..1d89cf1 100644
--- a/pyfr/backends/cuda/blasext.py
+++ b/pyfr/backends/cuda/blasext.py
@@ -2,36 +2,43 @@
import numpy as np
import pycuda.driver as cuda
-from pycuda.gpuarray import GPUArray, splay
+from pycuda.gpuarray import GPUArray
from pycuda.reduction import ReductionKernel
-from pyfr.backends.cuda.provider import CUDAKernelProvider
+from pyfr.backends.cuda.provider import CUDAKernelProvider, get_grid_for_block
from pyfr.backends.base import ComputeKernel
from pyfr.nputil import npdtype_to_ctype
class CUDABlasExtKernels(CUDAKernelProvider):
- def axnpby(self, y, *xn):
- if any(y.traits != x.traits for x in xn):
+ def axnpby(self, *arr, subdims=None):
+ if any(arr[0].traits != x.traits for x in arr[1:]):
raise ValueError('Incompatible matrix types')
- nv, cnt = len(xn), y.leaddim*y.nrow
+ nv = len(arr)
+ ncola, ncolb = arr[0].datashape[1:]
+ nrow, ldim, lsdim, dtype = arr[0].traits
# Render the kernel template
- src = self.backend.lookup.get_template('axnpby').render(n=nv)
+ src = self.backend.lookup.get_template('axnpby').render(
+ subdims=subdims or range(ncola), nv=nv
+ )
- # Build
+ # Build the kernel
kern = self._build_kernel('axnpby', src,
- [np.int32] + [np.intp, y.dtype]*(1 + nv))
+ [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
- # Compute a suitable block and grid
- grid, block = splay(cnt)
+ # Determine the grid/block
+ block = (128, 1, 1)
+ grid = get_grid_for_block(block, ncolb)
class AxnpbyKernel(ComputeKernel):
- def run(self, queue, beta, *alphan):
- args = [i for axn in zip(xn, alphan) for i in axn]
+ def run(self, queue, *consts):
+ args = list(arr) + list(consts)
+
kern.prepared_async_call(grid, block, queue.cuda_stream_comp,
- cnt, y, beta, *args)
+ nrow, ncolb, ldim, lsdim,
+ *args)
return AxnpbyKernel()
diff --git a/pyfr/backends/cuda/kernels/axnpby.mako b/pyfr/backends/cuda/kernels/axnpby.mako
index 4b2db1e..d2fbfe5 100644
--- a/pyfr/backends/cuda/kernels/axnpby.mako
+++ b/pyfr/backends/cuda/kernels/axnpby.mako
@@ -3,22 +3,31 @@
<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
__global__ void
-axnpby(int n, fpdtype_t* y, fpdtype_t beta,
- ${', '.join('const fpdtype_t* x{0}, fpdtype_t a{0}'.format(i)
- for i in range(n))})
+axnpby(int nrow, int ncolb, int ldim, int lsdim,
+ ${', '.join('fpdtype_t* __restrict__ x' + str(i) for i in range(nv))},
+ ${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
{
- int strt = blockIdx.x*blockDim.x + threadIdx.x;
- int incr = gridDim.x*blockDim.x;
+ int j = blockIdx.x*blockDim.x + threadIdx.x;
+ int idx;
- for (int i = strt; i < n; i += incr)
- {
- fpdtype_t axn = ${pyfr.dot('a{j}', 'x{j}[i]', j=n)};
-
- if (beta == 0.0)
- y[i] = axn;
- else if (beta == 1.0)
- y[i] += axn;
- else
- y[i] = beta*y[i] + axn;
- }
+ % for k in subdims:
+ if (j < ncolb && a0 == 0.0)
+ for (int i = 0; i < nrow; ++i)
+ {
+ idx = i*ldim + ${k}*lsdim + j;
+ x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ }
+ else if (j < ncolb && a0 == 1.0)
+ for (int i = 0; i < nrow; ++i)
+ {
+ idx = i*ldim + ${k}*lsdim + j;
+ x0[idx] += ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ }
+ else if (j < ncolb)
+ for (int i = 0; i < nrow; ++i)
+ {
+ idx = i*ldim + ${k}*lsdim + j;
+ x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=nv)};
+ }
+ % endfor
}
diff --git a/pyfr/backends/opencl/blasext.py b/pyfr/backends/opencl/blasext.py
index 45be9e6..98faf5d 100644
--- a/pyfr/backends/opencl/blasext.py
+++ b/pyfr/backends/opencl/blasext.py
@@ -2,7 +2,7 @@
import numpy as np
import pyopencl as cl
-from pyopencl.array import Array, splay
+from pyopencl.array import Array
from pyopencl.reduction import ReductionKernel
from pyfr.backends.opencl.provider import OpenCLKernelProvider
@@ -11,30 +11,28 @@ from pyfr.nputil import npdtype_to_ctype
class OpenCLBlasExtKernels(OpenCLKernelProvider):
- def axnpby(self, *arr):
+ def axnpby(self, *arr, subdims=None):
if any(arr[0].traits != x.traits for x in arr[1:]):
raise ValueError('Incompatible matrix types')
nv = len(arr)
- nrow, leaddim, leadsubdim, dtype = arr[0].traits
+ ncola, ncolb = arr[0].datashape[1:]
+ nrow, ldim, lsdim, dtype = arr[0].traits
# Render the kernel template
- src = self.backend.lookup.get_template('axnpby').render(nv=nv)
+ src = self.backend.lookup.get_template('axnpby').render(
+ subdims=subdims or range(ncola), nv=nv
+ )
# Build the kernel
kern = self._build_kernel('axnpby', src,
- [np.int32] + [np.intp]*nv + [dtype]*nv)
-
- # Determine the total element count in the matrices
- cnt = leaddim*nrow
-
- # Compute a suitable global and local workgroup sizes
- gs, ls = splay(self.backend.qdflt, cnt)
+ [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
class AxnpbyKernel(ComputeKernel):
def run(self, queue, *consts):
args = [x.data for x in arr] + list(consts)
- kern(queue.cl_queue_comp, gs, ls, cnt, *args)
+ kern(queue.cl_queue_comp, (ncolb,), None, nrow, ncolb,
+ ldim, lsdim, *args)
return AxnpbyKernel()
diff --git a/pyfr/backends/opencl/kernels/axnpby.mako b/pyfr/backends/opencl/kernels/axnpby.mako
index b0079ca..4d00d3d 100644
--- a/pyfr/backends/opencl/kernels/axnpby.mako
+++ b/pyfr/backends/opencl/kernels/axnpby.mako
@@ -3,22 +3,31 @@
<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
__kernel void
-axnpby(int n,
+axnpby(int nrow, int ncolb, int ldim, int lsdim,
${', '.join('__global fpdtype_t* restrict x' + str(i) for i in range(nv))},
- ${', '.join('fpdtype_t alpha' + str(i) for i in range(nv))})
+ ${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
{
- int strt = get_global_id(0);
- int incr = get_global_size(0);
+ int j = get_global_id(0);
+ int idx;
- for (int i = strt; i < n; i += incr)
- {
- fpdtype_t axn = ${pyfr.dot('alpha{j}', 'x{j}[i]', j=(1, nv))};
-
- if (alpha0 == 0.0)
- x0[i] = axn;
- else if (alpha0 == 1.0)
- x0[i] += axn;
- else
- x0[i] = alpha0*x0[i] + axn;
- }
+ % for k in subdims:
+ if (j < ncolb && a0 == 0.0)
+ for (int i = 0; i < nrow; ++i)
+ {
+ idx = i*ldim + ${k}*lsdim + j;
+ x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ }
+ else if (j < ncolb && a0 == 1.0)
+ for (int i = 0; i < nrow; ++i)
+ {
+ idx = i*ldim + ${k}*lsdim + j;
+ x0[idx] += ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ }
+ else if (j < ncolb)
+ for (int i = 0; i < nrow; ++i)
+ {
+ idx = i*ldim + ${k}*lsdim + j;
+ x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=nv)};
+ }
+ % endfor
}
diff --git a/pyfr/backends/openmp/blasext.py b/pyfr/backends/openmp/blasext.py
index 278e026..c97bf4c 100644
--- a/pyfr/backends/openmp/blasext.py
+++ b/pyfr/backends/openmp/blasext.py
@@ -7,23 +7,27 @@ from pyfr.backends.base import ComputeKernel
class OpenMPBlasExtKernels(OpenMPKernelProvider):
- def axnpby(self, y, *xn):
- if any(y.traits != x.traits for x in xn):
+ def axnpby(self, *arr, subdims=None):
+ if any(arr[0].traits != x.traits for x in arr[1:]):
raise ValueError('Incompatible matrix types')
- nv, cnt = len(xn), y.leaddim*y.nrow
+ nv = len(arr)
+ ncola, ncolb = arr[0].datashape[1:]
+ nrow, ldim, lsdim, dtype = arr[0].traits
# Render the kernel template
- src = self.backend.lookup.get_template('axnpby').render(n=nv)
+ src = self.backend.lookup.get_template('axnpby').render(
+ subdims=subdims or range(ncola), nv=nv
+ )
- # Build
+ # Build the kernel
kern = self._build_kernel('axnpby', src,
- [np.int32] + [np.intp, y.dtype]*(1 + nv))
+ [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
class AxnpbyKernel(ComputeKernel):
- def run(self, queue, beta, *alphan):
- args = [i for axn in zip(xn, alphan) for i in axn]
- kern(cnt, y, beta, *args)
+ def run(self, queue, *consts):
+ args = list(arr) + list(consts)
+ kern(nrow, ncolb, ldim, lsdim, *args)
return AxnpbyKernel()
diff --git a/pyfr/backends/openmp/kernels/axnpby.mako b/pyfr/backends/openmp/kernels/axnpby.mako
index a72c0de..de55a3f 100644
--- a/pyfr/backends/openmp/kernels/axnpby.mako
+++ b/pyfr/backends/openmp/kernels/axnpby.mako
@@ -2,36 +2,33 @@
<%inherit file='base'/>
<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
-static PYFR_NOINLINE void
-axnpby_inner(int n, fpdtype_t *__restrict__ y, fpdtype_t beta,
- ${', '.join('const fpdtype_t *__restrict__ x{0}, '
- 'fpdtype_t a{0}'.format(i) for i in range(n))})
-{
- for (int i = 0; i < n; i++)
- {
- fpdtype_t axn = ${pyfr.dot('a{j}', 'x{j}[i]', j=n)};
-
- if (beta == 0.0)
- y[i] = axn;
- else if (beta == 1.0)
- y[i] += axn;
- else
- y[i] = beta*y[i] + axn;
- }
-}
-
void
-axnpby(int n, fpdtype_t *__restrict__ y, fpdtype_t beta,
- ${', '.join('const fpdtype_t *__restrict__ x{0}, '
- 'fpdtype_t a{0}'.format(i) for i in range(n))})
+axnpby(int nrow, int ncolb, int ldim, int lsdim,
+ ${', '.join('fpdtype_t *__restrict__ x' + str(i) for i in range(nv))},
+ ${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
{
#pragma omp parallel
{
- int begin, end;
- loop_sched_1d(n, PYFR_ALIGN_BYTES / sizeof(fpdtype_t), &begin, &end);
+ int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
+ int rb, re, cb, ce;
+ loop_sched_2d(nrow, ncolb, align, &rb, &re, &cb, &ce);
+
+ for (int r = rb; r < re; r++)
+ {
+ % for k in subdims:
+ for (int i = cb; i < ce; i++)
+ {
+ int idx = i + ldim*r + ${k}*lsdim;
+ fpdtype_t axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
- axnpby_inner(end - begin, y + begin, beta,
- ${', '.join('x{0} + begin, a{0}'.format(i)
- for i in range(n))});
+ if (a0 == 0.0)
+ x0[idx] = axn;
+ else if (a0 == 1.0)
+ x0[idx] += axn;
+ else
+ x0[idx] = a0*x0[idx] + axn;
+ }
+ % endfor
+ }
}
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/pyfr.git
More information about the debian-science-commits
mailing list