[pyfr] 56/88: Switch from SoA to AoSoA(k).
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Wed Nov 16 12:05:29 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository pyfr.
commit 61c45aa98cf0a41d72282cb3eca66edb5e4c8348
Author: Freddie Witherden <freddie at witherden.org>
Date: Sat Jun 25 21:43:07 2016 -0700
Switch from SoA to AoSoA(k).
This reduces memory usage and improves performance.
---
pyfr/backends/base/backend.py | 11 ++--
pyfr/backends/base/generator.py | 57 ++++++++++++--------
pyfr/backends/base/kernels.py | 5 +-
pyfr/backends/base/types.py | 73 ++++++++++++++++---------
pyfr/backends/cuda/base.py | 3 ++
pyfr/backends/cuda/blasext.py | 11 ++--
pyfr/backends/cuda/generator.py | 11 ++--
pyfr/backends/cuda/kernels/axnpby.mako | 16 +++---
pyfr/backends/cuda/kernels/base.mako | 4 ++
pyfr/backends/cuda/kernels/pack.mako | 7 ++-
pyfr/backends/cuda/packing.py | 6 +--
pyfr/backends/cuda/types.py | 12 ++---
pyfr/backends/mic/base.py | 3 ++
pyfr/backends/mic/blasext.py | 9 ++--
pyfr/backends/mic/generator.py | 92 ++++++++++++++++++--------------
pyfr/backends/mic/kernels/axnpby.mako | 40 ++++++++++----
pyfr/backends/mic/kernels/base.mako | 1 +
pyfr/backends/mic/kernels/pack.mako | 9 ++--
pyfr/backends/mic/packing.py | 4 +-
pyfr/backends/mic/types.py | 12 ++---
pyfr/backends/opencl/base.py | 3 ++
pyfr/backends/opencl/blasext.py | 10 ++--
pyfr/backends/opencl/generator.py | 10 ++--
pyfr/backends/opencl/kernels/axnpby.mako | 16 +++---
pyfr/backends/opencl/kernels/base.mako | 4 ++
pyfr/backends/opencl/kernels/pack.mako | 5 +-
pyfr/backends/opencl/packing.py | 4 +-
pyfr/backends/opencl/types.py | 12 ++---
pyfr/backends/openmp/base.py | 3 ++
pyfr/backends/openmp/blasext.py | 10 ++--
pyfr/backends/openmp/generator.py | 92 ++++++++++++++++++--------------
pyfr/backends/openmp/kernels/axnpby.mako | 37 ++++++++++---
pyfr/backends/openmp/kernels/base.mako | 1 +
pyfr/backends/openmp/kernels/pack.mako | 7 ++-
pyfr/backends/openmp/packing.py | 4 +-
pyfr/backends/openmp/types.py | 15 ++----
pyfr/solvers/base/elements.py | 7 ++-
37 files changed, 375 insertions(+), 251 deletions(-)
diff --git a/pyfr/backends/base/backend.py b/pyfr/backends/base/backend.py
index 40ab6a4..e28e9f2 100644
--- a/pyfr/backends/base/backend.py
+++ b/pyfr/backends/base/backend.py
@@ -56,7 +56,8 @@ class BaseBackend(object, metaclass=ABCMeta):
@lazyprop
def lookup(self):
pkg = 'pyfr.backends.{0}.kernels'.format(self.name)
- dfltargs = dict(alignb=self.alignb, fpdtype=self.fpdtype, math=math)
+ dfltargs = dict(alignb=self.alignb, fpdtype=self.fpdtype,
+ soasz=self.soasz, math=math)
return DottedTemplateLookup(pkg, dfltargs)
@@ -147,12 +148,12 @@ class BaseBackend(object, metaclass=ABCMeta):
def xchg_matrix_for_view(self, view, tags=set()):
return self.xchg_matrix((view.nvrow, view.nvcol, view.n), tags=tags)
- def view(self, matmap, rcmap, stridemap=None, vshape=tuple(), tags=set()):
- return self.view_cls(self, matmap, rcmap, stridemap, vshape, tags)
+ def view(self, matmap, rcmap, rstridemap=None, vshape=tuple(), tags=set()):
+ return self.view_cls(self, matmap, rcmap, rstridemap, vshape, tags)
- def xchg_view(self, matmap, rcmap, stridemap=None, vshape=tuple(),
+ def xchg_view(self, matmap, rcmap, rstridemap=None, vshape=tuple(),
tags=set()):
- return self.xchg_view_cls(self, matmap, rcmap, stridemap, vshape,
+ return self.xchg_view_cls(self, matmap, rcmap, rstridemap, vshape,
tags)
def kernel(self, name, *args, **kwargs):
diff --git a/pyfr/backends/base/generator.py b/pyfr/backends/base/generator.py
index 9a7e7ca..e9bf888 100644
--- a/pyfr/backends/base/generator.py
+++ b/pyfr/backends/base/generator.py
@@ -104,7 +104,7 @@ class BaseKernelGenerator(object, metaclass=ABCMeta):
# View
if va.isview:
- argt.append([np.intp]*(2 + va.ncdim))
+ argt.append([np.intp]*(2 + (va.ncdim == 2)))
# Non-stacked vector or MPI type
elif self.ndim == 1 and (va.ncdim == 0 or va.ismpi):
argt.append([np.intp])
@@ -118,7 +118,7 @@ class BaseKernelGenerator(object, metaclass=ABCMeta):
# Return
return self.ndim, argn, argt
- def needs_lsdim(self, arg):
+ def needs_ldim(self, arg):
return ((self.ndim == 2 and not arg.isbroadcast) or
(arg.ncdim > 0 and not arg.ismpi))
@@ -127,42 +127,55 @@ class BaseKernelGenerator(object, metaclass=ABCMeta):
pass
def _deref_arg_view(self, arg):
- ptns = ['{0}_v[{0}_vix[_x]]',
- r'{0}_v[{0}_vix[_x] + {0}_vcstri[_x]*\1]',
- r'{0}_v[{0}_vix[_x] + {0}_vrstri[_x]*\1 + {0}_vcstri[_x]*\2]']
+ ptns = ['{0}_v[{0}_vix[X_IDX]]',
+ r'{0}_v[{0}_vix[X_IDX] + SOA_SZ*\1]',
+ r'{0}_v[{0}_vix[X_IDX] + {0}_vrstri[X_IDX]*\1 + SOA_SZ*\2]']
return ptns[arg.ncdim].format(arg.name)
def _deref_arg_array_1d(self, arg):
- # Leading (sub) dimension
- lsdim = 'lsd' + arg.name if not arg.ismpi else '_nx'
+ # Leading dimension
+ ldim = 'ld' + arg.name if not arg.ismpi else '_nx'
- # Vector: name_v[_x]
+ # Vector:
+ # name => name_v[X_IDX]
if arg.ncdim == 0:
- ix = '_x'
- # Stacked vector: name_v[lsdim*\1 + _x]
+ ix = 'X_IDX'
+ # Stacked vector:
+ # name[\1] => name_v[ldim*\1 + X_IDX]
elif arg.ncdim == 1:
- ix = r'{0}*\1 + _x'.format(lsdim)
- # Doubly stacked vector: name_v[(nv*\1 + \2)*lsdim + _x]
+ ix = r'{0}*\1 + X_IDX'.format(ldim)
+ # Doubly stacked MPI vector:
+ # name[\1][\2] => name_v[(nv*\1 + \2)*ldim + X_IDX]
+ elif arg.ismpi:
+ ix = r'({0}*\1 + \2)*{1} + X_IDX'.format(arg.cdims[1], ldim)
+ # Doubly stacked vector:
+ # name[\1][\2] => name_v[ldim*\1 + X_IDX_AOSOA(\2, nv)]
else:
- ix = r'({0}*\1 + \2)*{1} + _x'.format(arg.cdims[1], lsdim)
+ ix = (r'ld{0}*\1 + X_IDX_AOSOA(\2, {1})'
+ .format(arg.name, arg.cdims[1]))
return '{0}_v[{1}]'.format(arg.name, ix)
def _deref_arg_array_2d(self, arg):
- # Broadcast vector: name_v[_x]
+ # Broadcast vector:
+ # name => name_v[X_IDX]
if arg.isbroadcast:
- ix = '_x'
- # Matrix: name_v[lsdim*_y + _x]
+ ix = 'X_IDX'
+ # Matrix:
+ # name => name_v[ldim*_y + X_IDX]
elif arg.ncdim == 0:
- ix = 'lsd{}*_y + _x'.format(arg.name)
- # Stacked matrix: name_v[(_y*nv + \1)*lsdim + _x]
+ ix = 'ld{0}*_y + X_IDX'.format(arg.name)
+ # Stacked matrix:
+ # name[\1] => name_v[ldim*_y + X_IDX_AOSOA(\1, nv)]
elif arg.ncdim == 1:
- ix = r'(_y*{0} + \1)*lsd{1} + _x'.format(arg.cdims[0], arg.name)
- # Doubly stacked matrix: name_v[((\1*_ny + _y)*nv + \2)*lsdim + _x]
+ ix = (r'ld{0}*_y + X_IDX_AOSOA(\1, {1})'
+ .format(arg.name, arg.cdims[0]))
+ # Doubly stacked matrix:
+ # name[\1][\2] => name_v[(\1*ny + _y)*ldim + X_IDX_AOSOA(\2, nv)]
else:
- ix = (r'((\1*_ny + _y)*{0} + \2)*lsd{1} + _x'
- .format(arg.cdims[1], arg.name))
+ ix = (r'(\1*_ny + _y)*ld{0} + X_IDX_AOSOA(\2, {1})'
+ .format(arg.name, arg.cdims[1]))
return '{0}_v[{1}]'.format(arg.name, ix)
diff --git a/pyfr/backends/base/kernels.py b/pyfr/backends/base/kernels.py
index fe07006..ef00a34 100644
--- a/pyfr/backends/base/kernels.py
+++ b/pyfr/backends/base/kernels.py
@@ -118,7 +118,7 @@ class BasePointwiseKernelProvider(BaseKernelProvider, metaclass=ABCMeta):
# Matrix
if isinstance(ka, mattypes):
- arglst += [ka, ka.leadsubdim] if len(atypes) == 2 else [ka]
+ arglst += [ka, ka.leaddim] if len(atypes) == 2 else [ka]
# View
elif isinstance(ka, viewtypes):
if isinstance(ka, self.backend.view_cls):
@@ -127,8 +127,7 @@ class BasePointwiseKernelProvider(BaseKernelProvider, metaclass=ABCMeta):
view = ka.view
arglst += [view.basedata, view.mapping]
- arglst += [view.cstrides] if len(atypes) >= 3 else []
- arglst += [view.rstrides] if len(atypes) == 4 else []
+ arglst += [view.rstrides] if len(atypes) == 3 else []
# Other; let the backend handle it
else:
arglst.append(ka)
diff --git a/pyfr/backends/base/types.py b/pyfr/backends/base/types.py
index 73f7c27..d1f3735 100644
--- a/pyfr/backends/base/types.py
+++ b/pyfr/backends/base/types.py
@@ -25,23 +25,26 @@ class MatrixBase(object, metaclass=ABCMeta):
if ndim == 2:
nrow, ncol = shape
- elif ndim == 3 or ndim == 4:
- nrow = shape[0] if ndim == 3 else shape[0]*shape[1]
- ncol = shape[-2]*shape[-1] + (1 - shape[-2])*(shape[-1] % -ldmod)
+ aosoashape = shape
+ else:
+ nvar, narr, k = shape[-2], shape[-1], backend.soasz
+ nparr = narr - narr % -k
- # Pad the final dimension
- shape[-1] -= shape[-1] % -ldmod
+ nrow = shape[0] if ndim == 3 else shape[0]*shape[1]
+ ncol = nvar*nparr
+ aosoashape = shape[:-2] + [nparr // k, nvar, k]
# Assign
self.nrow, self.ncol = int(nrow), int(ncol)
- self.ioshape, self.datashape = ioshape, shape
+
+ self.datashape = aosoashape
+ self.ioshape = ioshape
self.leaddim = self.ncol - (self.ncol % -ldmod)
- self.leadsubdim = shape[-1]
self.pitch = self.leaddim*self.itemsize
self.nbytes = self.nrow*self.pitch
- self.traits = (self.nrow, self.leaddim, self.leadsubdim, self.dtype)
+ self.traits = (self.nrow, self.leaddim, self.dtype)
# Process the initial value
if initval is not None:
@@ -76,6 +79,28 @@ class MatrixBase(object, metaclass=ABCMeta):
def _get(self):
pass
+ def _pack(self, ary):
+ # If necessary convert from SoA to AoSoA packing
+ if ary.ndim > 2:
+ n, k = ary.shape[-1], self.backend.soasz
+
+ ary = np.pad(ary, [(0, 0)]*(ary.ndim - 1) + [(0, -n % k)],
+ mode='constant')
+ ary = ary.reshape(ary.shape[:-1] + (-1, k)).swapaxes(-2, -3)
+ ary = ary.reshape(self.nrow, self.ncol)
+
+ return np.ascontiguousarray(ary, dtype=self.dtype)
+
+ def _unpack(self, ary):
+ # If necessary unpack from AoSoA to SoA
+ if len(self.ioshape) > 2:
+ ary = ary.reshape(self.datashape)
+ ary = ary.swapaxes(-2, -3)
+ ary = ary.reshape(self.ioshape[:-1] + (-1,))
+ ary = ary[..., :self.ioshape[-1]]
+
+ return ary
+
class Matrix(MatrixBase):
_base_tags = {'dense'}
@@ -114,10 +139,10 @@ class MatrixRSlice(object):
self.p, self.q = int(p), int(q)
self.nrow, self.ncol = self.q - self.p, mat.ncol
self.dtype, self.itemsize = mat.dtype, mat.itemsize
- self.leaddim, self.leadsubdim = mat.leaddim, mat.leadsubdim
+ self.leaddim = mat.leaddim
self.pitch = self.leaddim*self.itemsize
- self.traits = (self.nrow, self.leaddim, self.leadsubdim, self.dtype)
+ self.traits = (self.nrow, self.leaddim, self.dtype)
self.tags = mat.tags | {'slice'}
@@ -184,11 +209,11 @@ class MatrixBank(Sequence):
class View(object):
- def __init__(self, backend, matmap, rcmap, stridemap, vshape, tags):
+ def __init__(self, backend, matmap, rcmap, rstridemap, vshape, tags):
self.n = len(matmap)
self.nvrow = vshape[-2] if len(vshape) == 2 else 1
self.nvcol = vshape[-1] if len(vshape) >= 1 else 1
- self.rstrides = self.cstrides = None
+ self.rstrides = None
# Get the different matrices which we map onto
self._mats = [backend.mats[i] for i in np.unique(matmap)]
@@ -211,6 +236,9 @@ class View(object):
if any(m.dtype != self.refdtype for m in self._mats):
raise TypeError('Mixed data types are not supported')
+ # SoA size
+ k = backend.soasz
+
# Base offsets and leading dimensions for each point
offset = np.empty(self.n, dtype=np.int32)
leaddim = np.empty(self.n, dtype=np.int32)
@@ -219,32 +247,27 @@ class View(object):
ix = np.where(matmap == m.mid)
offset[ix], leaddim[ix] = m.offset // m.itemsize, m.leaddim
- # Go from matrices + (row, column) indcies to displacements
- # relative to the base allocation address
- mapping = (offset + rcmap[:,0]*leaddim + rcmap[:,1])[None,:]
+ # Row/column displacements
+ rowdisp = rcmap[:, 0]*leaddim
+ coldisp = (rcmap[:, 1] // k)*(self.nvcol*k) + rcmap[:, 1] % k
+
+ mapping = (offset + rowdisp + coldisp)[None,:]
self.mapping = backend.base_matrix_cls(
backend, np.int32, (1, self.n), mapping, None, None, tags
)
# Row strides
if self.nvrow > 1:
- rstrides = (stridemap[:,0]*leaddim)[None,:]
+ rstrides = (rstridemap*leaddim)[None,:]
self.rstrides = backend.base_matrix_cls(
backend, np.int32, (1, self.n), rstrides, None, None, tags
)
- # Column strides
- if self.nvcol > 1:
- cstrides = stridemap[:,-1][None,:]
- self.cstrides = backend.base_matrix_cls(
- backend, np.int32, (1, self.n), cstrides, None, None, tags
- )
-
class XchgView(object):
- def __init__(self, backend, matmap, rcmap, stridemap, vshape, tags):
+ def __init__(self, backend, matmap, rcmap, rstridemap, vshape, tags):
# Create a normal view
- self.view = backend.view(matmap, rcmap, stridemap, vshape, tags)
+ self.view = backend.view(matmap, rcmap, rstridemap, vshape, tags)
# Dimensions
self.n = n = self.view.n
diff --git a/pyfr/backends/cuda/base.py b/pyfr/backends/cuda/base.py
index 2296df4..2541f9a 100644
--- a/pyfr/backends/cuda/base.py
+++ b/pyfr/backends/cuda/base.py
@@ -35,6 +35,9 @@ class CUDABackend(BaseBackend):
# Take the required alignment to be 128 bytes
self.alignb = 128
+ # Take the SoA size to be 32 elements
+ self.soasz = 32
+
# Get the MPI runtime type
self.mpitype = cfg.get('backend-cuda', 'mpi-type', 'standard')
if self.mpitype not in {'standard', 'cuda-aware'}:
diff --git a/pyfr/backends/cuda/blasext.py b/pyfr/backends/cuda/blasext.py
index bc4df16..96b94fe 100644
--- a/pyfr/backends/cuda/blasext.py
+++ b/pyfr/backends/cuda/blasext.py
@@ -16,17 +16,17 @@ class CUDABlasExtKernels(CUDAKernelProvider):
raise ValueError('Incompatible matrix types')
nv = len(arr)
- ncola, ncolb = arr[0].datashape[1:]
- nrow, ldim, lsdim, dtype = arr[0].traits
+ nrow, ldim, dtype = arr[0].traits
+ ncola, ncolb = arr[0].ioshape[1:]
# Render the kernel template
src = self.backend.lookup.get_template('axnpby').render(
- subdims=subdims or range(ncola), nv=nv
+ subdims=subdims or range(ncola), ncola=ncola, nv=nv
)
# Build the kernel
kern = self._build_kernel('axnpby', src,
- [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
+ [np.int32]*3 + [np.intp]*nv + [dtype]*nv)
# Determine the grid/block
block = (128, 1, 1)
@@ -37,8 +37,7 @@ class CUDABlasExtKernels(CUDAKernelProvider):
args = list(arr) + list(consts)
kern.prepared_async_call(grid, block, queue.cuda_stream_comp,
- nrow, ncolb, ldim, lsdim,
- *args)
+ nrow, ncolb, ldim, *args)
return AxnpbyKernel()
diff --git a/pyfr/backends/cuda/generator.py b/pyfr/backends/cuda/generator.py
index 46ae875..c2e7e28 100644
--- a/pyfr/backends/cuda/generator.py
+++ b/pyfr/backends/cuda/generator.py
@@ -24,10 +24,14 @@ class CUDAKernelGenerator(BaseKernelGenerator):
return '''{spec}
{{
int _x = blockIdx.x*blockDim.x + threadIdx.x;
+ #define X_IDX (_x)
+ #define X_IDX_AOSOA(v, nv) SOA_IX(X_IDX, v, nv)
{limits}
{{
{body}
}}
+ #undef X_IDX
+ #undef X_IDX_AOSOA
}}'''.format(spec=spec, limits=limits, body=self.body)
def _render_spec(self):
@@ -45,9 +49,6 @@ class CUDAKernelGenerator(BaseKernelGenerator):
kargs.append('const int* __restrict__ {0.name}_vix'
.format(va))
- if va.ncdim >= 1:
- kargs.append('const int* __restrict__ {0.name}_vcstri'
- .format(va))
if va.ncdim == 2:
kargs.append('const int* __restrict__ {0.name}_vrstri'
.format(va))
@@ -59,7 +60,7 @@ class CUDAKernelGenerator(BaseKernelGenerator):
kargs.append('{0} {1.dtype}* __restrict__ {1.name}_v'
.format(const, va).strip())
- if self.needs_lsdim(va):
- kargs.append('int lsd{0.name}'.format(va))
+ if self.needs_ldim(va):
+ kargs.append('int ld{0.name}'.format(va))
return '__global__ void {0}({1})'.format(self.name, ', '.join(kargs))
diff --git a/pyfr/backends/cuda/kernels/axnpby.mako b/pyfr/backends/cuda/kernels/axnpby.mako
index d2fbfe5..c19f273 100644
--- a/pyfr/backends/cuda/kernels/axnpby.mako
+++ b/pyfr/backends/cuda/kernels/axnpby.mako
@@ -3,31 +3,35 @@
<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
__global__ void
-axnpby(int nrow, int ncolb, int ldim, int lsdim,
+axnpby(int nrow, int ncolb, int ldim,
${', '.join('fpdtype_t* __restrict__ x' + str(i) for i in range(nv))},
${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
{
int j = blockIdx.x*blockDim.x + threadIdx.x;
int idx;
- % for k in subdims:
if (j < ncolb && a0 == 0.0)
for (int i = 0; i < nrow; ++i)
{
- idx = i*ldim + ${k}*lsdim + j;
+ % for k in subdims:
+ idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ % endfor
}
else if (j < ncolb && a0 == 1.0)
for (int i = 0; i < nrow; ++i)
{
- idx = i*ldim + ${k}*lsdim + j;
+ % for k in subdims:
+ idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
x0[idx] += ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ % endfor
}
else if (j < ncolb)
for (int i = 0; i < nrow; ++i)
{
- idx = i*ldim + ${k}*lsdim + j;
+ % for k in subdims:
+ idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=nv)};
+ % endfor
}
- % endfor
}
diff --git a/pyfr/backends/cuda/kernels/base.mako b/pyfr/backends/cuda/kernels/base.mako
index 4b6e3b7..5eeff1e 100644
--- a/pyfr/backends/cuda/kernels/base.mako
+++ b/pyfr/backends/cuda/kernels/base.mako
@@ -1,6 +1,10 @@
# -*- coding: utf-8 -*-
<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
+// AoSoA macros
+#define SOA_SZ ${soasz}
+#define SOA_IX(a, v, nv) ((((a) / SOA_SZ)*(nv) + (v))*SOA_SZ + (a) % SOA_SZ)
+
// Typedefs
typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t;
diff --git a/pyfr/backends/cuda/kernels/pack.mako b/pyfr/backends/cuda/kernels/pack.mako
index a0e503f..1e987e3 100644
--- a/pyfr/backends/cuda/kernels/pack.mako
+++ b/pyfr/backends/cuda/kernels/pack.mako
@@ -5,9 +5,8 @@ __global__ void
pack_view(int n, int nrv, int ncv,
const fpdtype_t* __restrict__ v,
const int* __restrict__ vix,
- const int* __restrict__ vcstri,
const int* __restrict__ vrstri,
- fpdtype_t* __restrict__ pmat)
+ fpdtype_t* __restrict__ pmat)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
@@ -15,9 +14,9 @@ pack_view(int n, int nrv, int ncv,
pmat[i] = v[vix[i]];
else if (i < n && nrv == 1)
for (int c = 0; c < ncv; ++c)
- pmat[c*n + i] = v[vix[i] + vcstri[i]*c];
+ pmat[c*n + i] = v[vix[i] + SOA_SZ*c];
else if (i < n)
for (int r = 0; r < nrv; ++r)
for (int c = 0; c < ncv; ++c)
- pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r + vcstri[i]*c];
+ pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r + SOA_SZ*c];
}
diff --git a/pyfr/backends/cuda/packing.py b/pyfr/backends/cuda/packing.py
index d2c1654..a8168f3 100644
--- a/pyfr/backends/cuda/packing.py
+++ b/pyfr/backends/cuda/packing.py
@@ -16,7 +16,7 @@ class CUDAPackingKernels(CUDAKernelProvider, BasePackingKernels):
src = self.backend.lookup.get_template('pack').render()
# Build
- kern = self._build_kernel('pack_view', src, 'iiiPPPPP')
+ kern = self._build_kernel('pack_view', src, 'iiiPPPP')
# Compute the grid and thread-block size
block = (128, 1, 1)
@@ -31,7 +31,7 @@ class CUDAPackingKernels(CUDAKernelProvider, BasePackingKernels):
# Pack
kern.prepared_async_call(
grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata,
- v.mapping, v.cstrides or 0, v.rstrides or 0, m
+ v.mapping, v.rstrides or 0, m
)
# Otherwise, we need to both pack the buffer and copy it back
else:
@@ -46,7 +46,7 @@ class CUDAPackingKernels(CUDAKernelProvider, BasePackingKernels):
# Pack
kern.prepared_async_call(
grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata,
- v.mapping, v.cstrides or 0, v.rstrides or 0, m
+ v.mapping, v.rstrides or 0, m
)
# Copy the packed buffer to the host
diff --git a/pyfr/backends/cuda/types.py b/pyfr/backends/cuda/types.py
index 8ca891e..70e7aa2 100644
--- a/pyfr/backends/cuda/types.py
+++ b/pyfr/backends/cuda/types.py
@@ -29,18 +29,18 @@ class CUDAMatrixBase(base.MatrixBase):
def _get(self):
# Allocate an empty buffer
- buf = np.empty(self.datashape, dtype=self.dtype)
+ buf = np.empty((self.nrow, self.leaddim), dtype=self.dtype)
# Copy
cuda.memcpy_dtoh(buf, self.data)
- # Slice to give the expected I/O shape
- return buf[...,:self.ioshape[-1]]
+ # Unpack
+ return self._unpack(buf[:, :self.ncol])
def _set(self, ary):
- # Allocate a new buffer with suitable padding and assign
- buf = np.zeros(self.datashape, dtype=self.dtype)
- buf[...,:self.ioshape[-1]] = ary
+ # Allocate a new buffer with suitable padding and pack it
+ buf = np.zeros((self.nrow, self.leaddim), dtype=self.dtype)
+ buf[:, :self.ncol] = self._pack(ary)
# Copy
cuda.memcpy_htod(self.data, buf)
diff --git a/pyfr/backends/mic/base.py b/pyfr/backends/mic/base.py
index 2a74f15..cbb1b31 100644
--- a/pyfr/backends/mic/base.py
+++ b/pyfr/backends/mic/base.py
@@ -30,6 +30,9 @@ class MICBackend(BaseBackend):
# Take the alignment requirement to be 64-bytes
self.alignb = 64
+ # Compute the SoA size
+ self.soasz = self.alignb // np.dtype(self.fpdtype).itemsize
+
from pyfr.backends.mic import (blasext, cblas, packing, provider,
types)
diff --git a/pyfr/backends/mic/blasext.py b/pyfr/backends/mic/blasext.py
index f00a284..e0aba5b 100644
--- a/pyfr/backends/mic/blasext.py
+++ b/pyfr/backends/mic/blasext.py
@@ -12,8 +12,8 @@ class MICBlasExtKernels(MICKernelProvider):
raise ValueError('Incompatible matrix types')
nv = len(arr)
- ncola, ncolb = arr[0].datashape[1:]
- nrow, ldim, lsdim, dtype = arr[0].traits
+ nrow, ldim, dtype = arr[0].traits
+ ncola, ncolb = arr[0].ioshape[1:]
# Render the kernel template
src = self.backend.lookup.get_template('axnpby').render(
@@ -22,13 +22,12 @@ class MICBlasExtKernels(MICKernelProvider):
# Build the kernel
kern = self._build_kernel('axnpby', src,
- [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
+ [np.int32]*3 + [np.intp]*nv + [dtype]*nv)
class AxnpbyKernel(ComputeKernel):
def run(self, queue, *consts):
args = [x.data for x in arr] + list(consts)
- queue.mic_stream_comp.invoke(kern, nrow, ncolb, ldim, lsdim,
- *args)
+ queue.mic_stream_comp.invoke(kern, nrow, ncolb, ldim, *args)
return AxnpbyKernel()
diff --git a/pyfr/backends/mic/generator.py b/pyfr/backends/mic/generator.py
index 8befc78..dc2ab76 100644
--- a/pyfr/backends/mic/generator.py
+++ b/pyfr/backends/mic/generator.py
@@ -9,42 +9,58 @@ class MICKernelGenerator(BaseKernelGenerator):
spec, unpack = self._render_spec_unpack()
if self.ndim == 1:
- tpl = '''{spec}
- {{
- {unpack}
- #pragma omp parallel
- {{
- int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
- int cb, ce;
- loop_sched_1d(_nx, align, &cb, &ce);
- #pragma omp simd
- for (int _x = cb; _x < ce; _x++)
- {{
- {body}
- }}
- }}
- }}'''
+ inner = '''
+ int cb, ce;
+ loop_sched_1d(_nx, align, &cb, &ce);
+ int nci = ((ce - cb) / SOA_SZ)*SOA_SZ;
+ for (int _xi = cb; _xi < cb + nci; _xi += SOA_SZ)
+ {{
+ #pragma omp simd
+ for (int _xj = 0; _xj < SOA_SZ; _xj++)
+ {{
+ {body}
+ }}
+ }}
+ for (int _xi = cb + nci, _xj = 0; _xj < ce - _xi; _xj++)
+ {{
+ {body}
+ }}'''.format(body=self.body)
else:
- tpl = '''{spec}
- {{
- {unpack}
- #pragma omp parallel
- {{
- int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
- int rb, re, cb, ce;
- loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);
- for (int _y = rb; _y < re; _y++)
- {{
- #pragma omp simd
- for (int _x = cb; _x < ce; _x++)
- {{
- {body}
- }}
- }}
- }}
- }}'''
+ inner = '''
+ int rb, re, cb, ce;
+ loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);
+ int nci = ((ce - cb) / SOA_SZ)*SOA_SZ;
+ for (int _y = rb; _y < re; _y++)
+ {{
+ for (int _xi = cb; _xi < cb + nci; _xi += SOA_SZ)
+ {{
+ #pragma omp simd
+ for (int _xj = 0; _xj < SOA_SZ; _xj++)
+ {{
+ {body}
+ }}
+ }}
+ for (int _xi = cb + nci, _xj = 0; _xj < ce - _xi;
+ _xj++)
+ {{
+ {body}
+ }}
+ }}'''.format(body=self.body)
- return tpl.format(spec=spec, unpack=unpack, body=self.body)
+ return '''{spec}
+ {{
+ {unpack}
+ #define X_IDX (_xi + _xj)
+ #define X_IDX_AOSOA(v, nv)\
+ ((_xi/SOA_SZ*(nv) + (v))*SOA_SZ + _xj)
+ int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
+ #pragma omp parallel
+ {{
+ {inner}
+ }}
+ #undef X_IDX
+ #undef X_IDX_AOSOA
+ }}'''.format(spec=spec, unpack=unpack, inner=inner)
def _render_spec_unpack(self):
# Start by unpacking the dimensions
@@ -68,10 +84,6 @@ class MICKernelGenerator(BaseKernelGenerator):
kpack.append('const int *{0.name}_vix = *arg{{0}};'
.format(va))
- if va.ncdim >= 1:
- kspec.append('void **arg{0}')
- kpack.append('const int *{0.name}_vcstri = *arg{{0}};'
- .format(va))
if va.ncdim == 2:
kspec.append('void **arg{0}')
kpack.append('const int *{0.name}_vrstri = *arg{{0}};'
@@ -85,9 +97,9 @@ class MICKernelGenerator(BaseKernelGenerator):
kpack.append('{0} {1.dtype}* {1.name}_v = *arg{{0}};'
.format(const, va).strip())
- if self.needs_lsdim(va):
+ if self.needs_ldim(va):
kspec.append('long *arg{0}')
- kpack.append('int lsd{0.name} = *arg{{0}};'.format(va))
+ kpack.append('int ld{0.name} = *arg{{0}};'.format(va))
# Number the arguments
params = ', '.join(a.format(i) for i, a in enumerate(kspec))
diff --git a/pyfr/backends/mic/kernels/axnpby.mako b/pyfr/backends/mic/kernels/axnpby.mako
index ddbea4b..96f1211 100644
--- a/pyfr/backends/mic/kernels/axnpby.mako
+++ b/pyfr/backends/mic/kernels/axnpby.mako
@@ -3,12 +3,12 @@
<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
void
-axnpby(long *nrowp, long *ncolbp, long *ldimp, long *lsdimp,
+axnpby(long *nrowp, long *ncolbp, long *ldimp,
${', '.join('fpdtype_t **xp' + str(i) for i in range(nv))},
${', '.join('double *ap' + str(i) for i in range(nv))})
{
+ #define X_IDX_AOSOA(v, nv) ((ci/SOA_SZ*(nv) + (v))*SOA_SZ + cj)
int ldim = *ldimp;
- int lsdim = *lsdimp;
% for i in range(nv):
fpdtype_t *x${i} = *xp${i};
@@ -18,16 +18,37 @@ axnpby(long *nrowp, long *ncolbp, long *ldimp, long *lsdimp,
#pragma omp parallel
{
int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
- int rb, re, cb, ce;
- loop_sched_2d(*nrowp, *ncolbp, align, &rb, &re, &cb, &ce);
+ int rb, re, cb, ce, idx;
+ fpdtype_t axn;
+ loop_sched_2d(nrow, ncolb, align, &rb, &re, &cb, &ce);
+ int nci = ((ce - cb) / SOA_SZ)*SOA_SZ;
for (int r = rb; r < re; r++)
{
- % for k in subdims:
- for (int i = cb; i < ce; i++)
+ for (int ci = cb; ci < cb + nci; ci += SOA_SZ)
{
- int idx = i + ldim*r + ${k}*lsdim;
- fpdtype_t axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ #pragma omp simd
+ for (int cj = 0; cj < SOA_SZ; cj++)
+ {
+ % for k in subdims:
+ idx = r*ldim + X_IDX_AOSOA(${k}, ${ncola});
+ axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+
+ if (a0 == 0.0)
+ x0[idx] = axn;
+ else if (a0 == 1.0)
+ x0[idx] += axn;
+ else
+ x0[idx] = a0*x0[idx] + axn;
+ % endfor
+ }
+ }
+
+ for (int ci = cb + nci, cj = 0; cj < ce - ci; cj++)
+ {
+ % for k in subdims:
+ idx = r*ldim + X_IDX_AOSOA(${k}, ${ncola});
+ axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
if (a0 == 0.0)
x0[idx] = axn;
@@ -35,8 +56,9 @@ axnpby(long *nrowp, long *ncolbp, long *ldimp, long *lsdimp,
x0[idx] += axn;
else
x0[idx] = a0*x0[idx] + axn;
+ % endfor
}
- % endfor
}
}
+ #undef X_IDX_AOSOA
}
diff --git a/pyfr/backends/mic/kernels/base.mako b/pyfr/backends/mic/kernels/base.mako
index 7cd4d75..2980406 100644
--- a/pyfr/backends/mic/kernels/base.mako
+++ b/pyfr/backends/mic/kernels/base.mako
@@ -6,6 +6,7 @@
#include <tgmath.h>
#define PYFR_ALIGN_BYTES ${alignb}
+#define SOA_SZ ${soasz}
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
diff --git a/pyfr/backends/mic/kernels/pack.mako b/pyfr/backends/mic/kernels/pack.mako
index 9f32a92..8b7874d 100644
--- a/pyfr/backends/mic/kernels/pack.mako
+++ b/pyfr/backends/mic/kernels/pack.mako
@@ -3,7 +3,7 @@
void
pack_view(long *n_a, long *nrv_a, long *ncv_a,
- void **v_a, void **vix_a, void **vcstri_a, void **vrstri_a,
+ void **v_a, void **vix_a, void **vrstri_a,
void **pmat_a)
{
int n = *n_a;
@@ -12,7 +12,6 @@ pack_view(long *n_a, long *nrv_a, long *ncv_a,
fpdtype_t *v = *v_a;
int *vix = *vix_a;
- int *vcstri = (vcstri_a) ? *vcstri_a : 0;
int *vrstri = (vrstri_a) ? *vrstri_a : 0;
fpdtype_t *pmat = *pmat_a;
@@ -22,11 +21,11 @@ pack_view(long *n_a, long *nrv_a, long *ncv_a,
else if (nrv == 1)
for (int i = 0; i < n; i++)
for (int c = 0; c < ncv; c++)
- pmat[c*n + i] = v[vix[i] + vcstri[i]*c];
+ pmat[c*n + i] = v[vix[i] + SOA_SZ*c];
else
for (int i = 0; i < n; i++)
for (int r = 0; r < nrv; r++)
for (int c = 0; c < ncv; c++)
- pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r
- + vcstri[i]*c];
+ pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r +
+ SOA_SZ*c];
}
diff --git a/pyfr/backends/mic/packing.py b/pyfr/backends/mic/packing.py
index 361965c..11dd7e6 100644
--- a/pyfr/backends/mic/packing.py
+++ b/pyfr/backends/mic/packing.py
@@ -14,13 +14,13 @@ class MICPackingKernels(MICKernelProvider, BasePackingKernels):
src = self.backend.lookup.get_template('pack').render()
# Build
- kern = self._build_kernel('pack_view', src, 'iiiPPPPP')
+ kern = self._build_kernel('pack_view', src, 'iiiPPPP')
class PackXchgViewKernel(ComputeKernel):
def run(self, queue):
# Kernel arguments
args = [v.n, v.nvrow, v.nvcol, v.basedata.dev_ptr,
- v.mapping, v.cstrides, v.rstrides, m]
+ v.mapping, v.rstrides, m]
args = [getattr(arg, 'data', arg) for arg in args]
# Pack
diff --git a/pyfr/backends/mic/types.py b/pyfr/backends/mic/types.py
index 7abe53f..ec83c77 100644
--- a/pyfr/backends/mic/types.py
+++ b/pyfr/backends/mic/types.py
@@ -21,7 +21,7 @@ class MICMatrixBase(base.MatrixBase):
def _get(self):
# Allocate an empty buffer
- buf = np.empty(self.datashape, dtype=self.dtype)
+ buf = np.empty((self.nrow, self.leaddim), dtype=self.dtype)
# Copy using the default stream
self.backend.sdflt.transfer_device2host(
@@ -30,13 +30,13 @@ class MICMatrixBase(base.MatrixBase):
)
self.backend.sdflt.sync()
- # Slice to give the expected I/O shape
- return buf[...,:self.ioshape[-1]]
+ # Unpack
+ return self._unpack(buf[:, :self.ncol])
def _set(self, ary):
- # Allocate a new buffer with suitable padding and assign
- buf = np.zeros(self.datashape, dtype=self.dtype)
- buf[...,:self.ioshape[-1]] = ary
+ # Allocate a new buffer with suitable padding and pack it
+ buf = np.zeros((self.nrow, self.leaddim), dtype=self.dtype)
+ buf[:, :self.ncol] = self._pack(ary)
# Copy using the default stream
self.backend.sdflt.transfer_host2device(
diff --git a/pyfr/backends/opencl/base.py b/pyfr/backends/opencl/base.py
index fea11c7..03a67b4 100644
--- a/pyfr/backends/opencl/base.py
+++ b/pyfr/backends/opencl/base.py
@@ -53,6 +53,9 @@ class OpenCLBackend(BaseBackend):
# Compute the alignment requirement for the context
self.alignb = device.mem_base_addr_align // 8
+ # Compute the SoA size
+ self.soasz = 2*self.alignb // np.dtype(self.fpdtype).itemsize
+
from pyfr.backends.opencl import (blasext, clblas, gimmik, packing,
provider, types)
diff --git a/pyfr/backends/opencl/blasext.py b/pyfr/backends/opencl/blasext.py
index c4e378a..f388770 100644
--- a/pyfr/backends/opencl/blasext.py
+++ b/pyfr/backends/opencl/blasext.py
@@ -16,23 +16,23 @@ class OpenCLBlasExtKernels(OpenCLKernelProvider):
raise ValueError('Incompatible matrix types')
nv = len(arr)
- ncola, ncolb = arr[0].datashape[1:]
- nrow, ldim, lsdim, dtype = arr[0].traits
+ nrow, ldim, dtype = arr[0].traits
+ ncola, ncolb = arr[0].ioshape[1:]
# Render the kernel template
src = self.backend.lookup.get_template('axnpby').render(
- subdims=subdims or range(ncola), nv=nv
+ subdims=subdims or range(ncola), ncola=ncola, nv=nv
)
# Build the kernel
kern = self._build_kernel('axnpby', src,
- [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
+ [np.int32]*3 + [np.intp]*nv + [dtype]*nv)
class AxnpbyKernel(ComputeKernel):
def run(self, queue, *consts):
args = [x.data for x in arr] + list(consts)
kern(queue.cl_queue_comp, (ncolb,), None, nrow, ncolb,
- ldim, lsdim, *args)
+ ldim, *args)
return AxnpbyKernel()
diff --git a/pyfr/backends/opencl/generator.py b/pyfr/backends/opencl/generator.py
index 94320ba..d6e7d1e 100644
--- a/pyfr/backends/opencl/generator.py
+++ b/pyfr/backends/opencl/generator.py
@@ -24,10 +24,14 @@ class OpenCLKernelGenerator(BaseKernelGenerator):
return '''{spec}
{{
int _x = get_global_id(0);
+ #define X_IDX (_x)
+ #define X_IDX_AOSOA(v, nv) SOA_IX(X_IDX, v, nv)
{limits}
{{
{body}
}}
+ #undef X_IDX
+ #undef X_IDX_AOSOA
}}'''.format(spec=spec, limits=limits, body=self.body)
def _render_spec(self):
@@ -46,8 +50,6 @@ class OpenCLKernelGenerator(BaseKernelGenerator):
ka.append('__global {0.dtype}* restrict {0.name}_v')
ka.append('__global const int* restrict {0.name}_vix')
- if va.ncdim >= 1:
- ka.append('__global const int* restrict {0.name}_vcstri')
if va.ncdim == 2:
ka.append('__global const int* restrict {0.name}_vrstri')
# Arrays
@@ -57,8 +59,8 @@ class OpenCLKernelGenerator(BaseKernelGenerator):
else:
ka.append('__global {0.dtype}* restrict {0.name}_v')
- if self.needs_lsdim(va):
- ka.append('int lsd{0.name}')
+ if self.needs_ldim(va):
+ ka.append('int ld{0.name}')
# Format
kargs.extend(k.format(va) for k in ka)
diff --git a/pyfr/backends/opencl/kernels/axnpby.mako b/pyfr/backends/opencl/kernels/axnpby.mako
index 4d00d3d..14c346e 100644
--- a/pyfr/backends/opencl/kernels/axnpby.mako
+++ b/pyfr/backends/opencl/kernels/axnpby.mako
@@ -3,31 +3,35 @@
<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
__kernel void
-axnpby(int nrow, int ncolb, int ldim, int lsdim,
+axnpby(int nrow, int ncolb, int ldim,
${', '.join('__global fpdtype_t* restrict x' + str(i) for i in range(nv))},
${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
{
int j = get_global_id(0);
int idx;
- % for k in subdims:
if (j < ncolb && a0 == 0.0)
for (int i = 0; i < nrow; ++i)
{
- idx = i*ldim + ${k}*lsdim + j;
+ % for k in subdims:
+ idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ % endfor
}
else if (j < ncolb && a0 == 1.0)
for (int i = 0; i < nrow; ++i)
{
- idx = i*ldim + ${k}*lsdim + j;
+ % for k in subdims:
+ idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
x0[idx] += ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ % endfor
}
else if (j < ncolb)
for (int i = 0; i < nrow; ++i)
{
- idx = i*ldim + ${k}*lsdim + j;
+ % for k in subdims:
+ idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=nv)};
+ % endfor
}
- % endfor
}
diff --git a/pyfr/backends/opencl/kernels/base.mako b/pyfr/backends/opencl/kernels/base.mako
index 13ce4d8..a4ae785 100644
--- a/pyfr/backends/opencl/kernels/base.mako
+++ b/pyfr/backends/opencl/kernels/base.mako
@@ -6,6 +6,10 @@
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
+// AoSoA macros
+#define SOA_SZ ${soasz}
+#define SOA_IX(a, v, nv) ((((a) / SOA_SZ)*(nv) + (v))*SOA_SZ + (a) % SOA_SZ)
+
// Typedefs
typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t;
diff --git a/pyfr/backends/opencl/kernels/pack.mako b/pyfr/backends/opencl/kernels/pack.mako
index 04e3762..64647fd 100644
--- a/pyfr/backends/opencl/kernels/pack.mako
+++ b/pyfr/backends/opencl/kernels/pack.mako
@@ -5,7 +5,6 @@ __kernel void
pack_view(int n, int nrv, int ncv,
__global const fpdtype_t* restrict v,
__global const int* restrict vix,
- __global const int* restrict vcstri,
__global const int* restrict vrstri,
__global fpdtype_t* restrict pmat)
{
@@ -15,9 +14,9 @@ pack_view(int n, int nrv, int ncv,
pmat[i] = v[vix[i]];
else if (i < n && nrv == 1)
for (int c = 0; c < ncv; ++c)
- pmat[c*n + i] = v[vix[i] + vcstri[i]*c];
+ pmat[c*n + i] = v[vix[i] + SOA_SZ*c];
else if (i < n)
for (int r = 0; r < nrv; ++r)
for (int c = 0; c < ncv; ++c)
- pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r + vcstri[i]*c];
+ pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r + SOA_SZ*c];
}
diff --git a/pyfr/backends/opencl/packing.py b/pyfr/backends/opencl/packing.py
index c0b88b0..8a731fe 100644
--- a/pyfr/backends/opencl/packing.py
+++ b/pyfr/backends/opencl/packing.py
@@ -17,13 +17,13 @@ class OpenCLPackingKernels(OpenCLKernelProvider, BasePackingKernels):
src = self.backend.lookup.get_template('pack').render()
# Build
- kern = self._build_kernel('pack_view', src, [np.int32]*3 + [np.intp]*5)
+ kern = self._build_kernel('pack_view', src, [np.int32]*3 + [np.intp]*4)
class PackXchgViewKernel(ComputeKernel):
def run(self, queue):
# Kernel arguments
args = [v.n, v.nvrow, v.nvcol, v.basedata, v.mapping,
- v.cstrides, v.rstrides, m]
+ v.rstrides, m]
args = [getattr(arg, 'data', arg) for arg in args]
# Pack
diff --git a/pyfr/backends/opencl/types.py b/pyfr/backends/opencl/types.py
index 31abf18..3cc5cfd 100644
--- a/pyfr/backends/opencl/types.py
+++ b/pyfr/backends/opencl/types.py
@@ -22,18 +22,18 @@ class OpenCLMatrixBase(base.MatrixBase):
def _get(self):
# Allocate an empty buffer
- buf = np.empty(self.datashape, dtype=self.dtype)
+ buf = np.empty((self.nrow, self.leaddim), dtype=self.dtype)
# Copy
cl.enqueue_copy(self.backend.qdflt, buf, self.data)
- # Slice to give the expected I/O shape
- return buf[...,:self.ioshape[-1]]
+ # Unpack
+ return self._unpack(buf[:, :self.ncol])
def _set(self, ary):
- # Allocate a new buffer with suitable padding and assign
- buf = np.zeros(self.datashape, dtype=self.dtype)
- buf[...,:self.ioshape[-1]] = ary
+ # Allocate a new buffer with suitable padding and pack it
+ buf = np.zeros((self.nrow, self.leaddim), dtype=self.dtype)
+ buf[:, :self.ncol] = self._pack(ary)
# Copy
cl.enqueue_copy(self.backend.qdflt, self.data, buf)
diff --git a/pyfr/backends/openmp/base.py b/pyfr/backends/openmp/base.py
index 115d657..6c6a6e5 100644
--- a/pyfr/backends/openmp/base.py
+++ b/pyfr/backends/openmp/base.py
@@ -14,6 +14,9 @@ class OpenMPBackend(BaseBackend):
# Take the alignment requirement to be 32-bytes
self.alignb = 32
+ # Compute the SoA size
+ self.soasz = self.alignb // np.dtype(self.fpdtype).itemsize
+
from pyfr.backends.openmp import (blasext, cblas, gimmik, packing,
provider, types)
diff --git a/pyfr/backends/openmp/blasext.py b/pyfr/backends/openmp/blasext.py
index 8343004..ebc29a9 100644
--- a/pyfr/backends/openmp/blasext.py
+++ b/pyfr/backends/openmp/blasext.py
@@ -12,22 +12,22 @@ class OpenMPBlasExtKernels(OpenMPKernelProvider):
raise ValueError('Incompatible matrix types')
nv = len(arr)
- ncola, ncolb = arr[0].datashape[1:]
- nrow, ldim, lsdim, dtype = arr[0].traits
+ nrow, ldim, dtype = arr[0].traits
+ ncola, ncolb = arr[0].ioshape[1:]
# Render the kernel template
src = self.backend.lookup.get_template('axnpby').render(
- subdims=subdims or range(ncola), nv=nv
+ subdims=subdims or range(ncola), ncola=ncola, nv=nv
)
# Build the kernel
kern = self._build_kernel('axnpby', src,
- [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
+ [np.int32]*3 + [np.intp]*nv + [dtype]*nv)
class AxnpbyKernel(ComputeKernel):
def run(self, queue, *consts):
args = list(arr) + list(consts)
- kern(nrow, ncolb, ldim, lsdim, *args)
+ kern(nrow, ncolb, ldim, *args)
return AxnpbyKernel()
diff --git a/pyfr/backends/openmp/generator.py b/pyfr/backends/openmp/generator.py
index c9677b4..2b0874c 100644
--- a/pyfr/backends/openmp/generator.py
+++ b/pyfr/backends/openmp/generator.py
@@ -5,45 +5,58 @@ from pyfr.backends.base.generator import BaseKernelGenerator
class OpenMPKernelGenerator(BaseKernelGenerator):
def render(self):
- # Kernel spec
- spec = self._render_spec()
-
if self.ndim == 1:
- tpl = '''
- {spec}
- {{
- #pragma omp parallel
- {{
- int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
- int cb, ce;
- loop_sched_1d(_nx, align, &cb, &ce);
- #pragma omp simd
- for (int _x = cb; _x < ce; _x++)
- {{
- {body}
- }}
- }}
- }}'''
+ inner = '''
+ int cb, ce;
+ loop_sched_1d(_nx, align, &cb, &ce);
+ int nci = ((ce - cb) / SOA_SZ)*SOA_SZ;
+ for (int _xi = cb; _xi < cb + nci; _xi += SOA_SZ)
+ {{
+ #pragma omp simd
+ for (int _xj = 0; _xj < SOA_SZ; _xj++)
+ {{
+ {body}
+ }}
+ }}
+ for (int _xi = cb + nci, _xj = 0; _xj < ce - _xi; _xj++)
+ {{
+ {body}
+ }}'''.format(body=self.body)
else:
- tpl = '''{spec}
- {{
- #pragma omp parallel
- {{
- int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
- int rb, re, cb, ce;
- loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);
- for (int _y = rb; _y < re; _y++)
- {{
- #pragma omp simd
- for (int _x = cb; _x < ce; _x++)
- {{
- {body}
- }}
- }}
- }}
- }}'''
+ inner = '''
+ int rb, re, cb, ce;
+ loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);
+ int nci = ((ce - cb) / SOA_SZ)*SOA_SZ;
+ for (int _y = rb; _y < re; _y++)
+ {{
+ for (int _xi = cb; _xi < cb + nci; _xi += SOA_SZ)
+ {{
+ #pragma omp simd
+ for (int _xj = 0; _xj < SOA_SZ; _xj++)
+ {{
+ {body}
+ }}
+ }}
+ for (int _xi = cb + nci, _xj = 0; _xj < ce - _xi;
+ _xj++)
+ {{
+ {body}
+ }}
+ }}'''.format(body=self.body)
- return tpl.format(spec=spec, body=self.body)
+ return '''{spec}
+ {{
+ #define X_IDX (_xi + _xj)
+ #define X_IDX_AOSOA(v, nv)\
+ ((_xi/SOA_SZ*(nv) + (v))*SOA_SZ + _xj)
+ int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
+ #pragma omp parallel
+ {{
+ {inner}
+ }}
+ #undef X_IDX
+ #undef X_IDX_AOSOA
+ }}'''.format(spec=self._render_spec(), inner=inner)
def _render_spec(self):
# We first need the argument list; starting with the dimensions
@@ -60,9 +73,6 @@ class OpenMPKernelGenerator(BaseKernelGenerator):
kargs.append('const int* __restrict__ {0.name}_vix'
.format(va))
- if va.ncdim >= 1:
- kargs.append('const int* __restrict__ {0.name}_vcstri'
- .format(va))
if va.ncdim == 2:
kargs.append('const int* __restrict__ {0.name}_vrstri'
.format(va))
@@ -74,7 +84,7 @@ class OpenMPKernelGenerator(BaseKernelGenerator):
kargs.append('{0} {1.dtype}* __restrict__ {1.name}_v'
.format(const, va).strip())
- if self.needs_lsdim(va):
- kargs.append('int lsd{0.name}'.format(va))
+ if self.needs_ldim(va):
+ kargs.append('int ld{0.name}'.format(va))
return 'void {0}({1})'.format(self.name, ', '.join(kargs))
diff --git a/pyfr/backends/openmp/kernels/axnpby.mako b/pyfr/backends/openmp/kernels/axnpby.mako
index de55a3f..7e2d591 100644
--- a/pyfr/backends/openmp/kernels/axnpby.mako
+++ b/pyfr/backends/openmp/kernels/axnpby.mako
@@ -3,23 +3,45 @@
<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
void
-axnpby(int nrow, int ncolb, int ldim, int lsdim,
+axnpby(int nrow, int ncolb, int ldim,
${', '.join('fpdtype_t *__restrict__ x' + str(i) for i in range(nv))},
${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
{
+ #define X_IDX_AOSOA(v, nv) ((ci/SOA_SZ*(nv) + (v))*SOA_SZ + cj)
#pragma omp parallel
{
int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
- int rb, re, cb, ce;
+ int rb, re, cb, ce, idx;
+ fpdtype_t axn;
loop_sched_2d(nrow, ncolb, align, &rb, &re, &cb, &ce);
+ int nci = ((ce - cb) / SOA_SZ)*SOA_SZ;
for (int r = rb; r < re; r++)
{
- % for k in subdims:
- for (int i = cb; i < ce; i++)
+ for (int ci = cb; ci < cb + nci; ci += SOA_SZ)
+ {
+ #pragma omp simd
+ for (int cj = 0; cj < SOA_SZ; cj++)
+ {
+ % for k in subdims:
+ idx = r*ldim + X_IDX_AOSOA(${k}, ${ncola});
+ axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+
+ if (a0 == 0.0)
+ x0[idx] = axn;
+ else if (a0 == 1.0)
+ x0[idx] += axn;
+ else
+ x0[idx] = a0*x0[idx] + axn;
+ % endfor
+ }
+ }
+
+ for (int ci = cb + nci, cj = 0; cj < ce - ci; cj++)
{
- int idx = i + ldim*r + ${k}*lsdim;
- fpdtype_t axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+ % for k in subdims:
+ idx = r*ldim + X_IDX_AOSOA(${k}, ${ncola});
+ axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
if (a0 == 0.0)
x0[idx] = axn;
@@ -27,8 +49,9 @@ axnpby(int nrow, int ncolb, int ldim, int lsdim,
x0[idx] += axn;
else
x0[idx] = a0*x0[idx] + axn;
- }
% endfor
+ }
}
}
+ #undef X_IDX_AOSOA
}
diff --git a/pyfr/backends/openmp/kernels/base.mako b/pyfr/backends/openmp/kernels/base.mako
index 7cd4d75..2980406 100644
--- a/pyfr/backends/openmp/kernels/base.mako
+++ b/pyfr/backends/openmp/kernels/base.mako
@@ -6,6 +6,7 @@
#include <tgmath.h>
#define PYFR_ALIGN_BYTES ${alignb}
+#define SOA_SZ ${soasz}
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
diff --git a/pyfr/backends/openmp/kernels/pack.mako b/pyfr/backends/openmp/kernels/pack.mako
index 2ccf3d1..ba72df5 100644
--- a/pyfr/backends/openmp/kernels/pack.mako
+++ b/pyfr/backends/openmp/kernels/pack.mako
@@ -5,7 +5,6 @@ void
pack_view(int n, int nrv, int ncv,
const fpdtype_t *__restrict__ v,
const int *__restrict__ vix,
- const int *__restrict__ vcstri,
const int *__restrict__ vrstri,
fpdtype_t *__restrict__ pmat)
{
@@ -15,11 +14,11 @@ pack_view(int n, int nrv, int ncv,
else if (nrv == 1)
for (int i = 0; i < n; i++)
for (int c = 0; c < ncv; c++)
- pmat[c*n + i] = v[vix[i] + vcstri[i]*c];
+ pmat[c*n + i] = v[vix[i] + SOA_SZ*c];
else
for (int i = 0; i < n; i++)
for (int r = 0; r < nrv; r++)
for (int c = 0; c < ncv; c++)
- pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r
- + vcstri[i]*c];
+ pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r +
+ SOA_SZ*c];
}
diff --git a/pyfr/backends/openmp/packing.py b/pyfr/backends/openmp/packing.py
index ef916c4..ed1548a 100644
--- a/pyfr/backends/openmp/packing.py
+++ b/pyfr/backends/openmp/packing.py
@@ -14,12 +14,12 @@ class OpenMPPackingKernels(OpenMPKernelProvider, BasePackingKernels):
src = self.backend.lookup.get_template('pack').render()
# Build
- kern = self._build_kernel('pack_view', src, 'iiiPPPPP')
+ kern = self._build_kernel('pack_view', src, 'iiiPPPP')
class PackXchgViewKernel(ComputeKernel):
def run(self, queue):
kern(v.n, v.nvrow, v.nvcol, v.basedata, v.mapping,
- v.cstrides or 0, v.rstrides or 0, m)
+ v.rstrides or 0, m)
return PackXchgViewKernel()
diff --git a/pyfr/backends/openmp/types.py b/pyfr/backends/openmp/types.py
index a2726aa..a888a65 100644
--- a/pyfr/backends/openmp/types.py
+++ b/pyfr/backends/openmp/types.py
@@ -9,7 +9,8 @@ class OpenMPMatrixBase(base.MatrixBase):
self.basedata = basedata.ctypes.data
self.data = basedata[offset:offset + self.nrow*self.pitch]
- self.data = self.data.view(self.dtype).reshape(self.datashape)
+ self.data = self.data.view(self.dtype)
+ self.data = self.data.reshape(self.nrow, self.leaddim)
self.offset = offset
@@ -24,12 +25,10 @@ class OpenMPMatrixBase(base.MatrixBase):
del self._initval
def _get(self):
- # Trim any padding in the final dimension
- return self.data[...,:self.ioshape[-1]]
+ return self._unpack(self.data[:, :self.ncol])
def _set(self, ary):
- # Assign
- self.data[...,:ary.shape[-1]] = ary
+ self.data[:, :self.ncol] = self._pack(ary)
class OpenMPMatrix(OpenMPMatrixBase, base.Matrix):
@@ -41,11 +40,7 @@ class OpenMPMatrix(OpenMPMatrixBase, base.Matrix):
class OpenMPMatrixRSlice(base.MatrixRSlice):
@lazyprop
def data(self):
- # Since slices do not retain any information about the
- # high-order structure of an array it is fine to compact mat
- # down to two dimensions and simply slice this
- mat = self.parent
- return mat.data.reshape(mat.nrow, mat.leaddim)[self.p:self.q]
+ return self.parent.data[self.p:self.q]
@lazyprop
def _as_parameter_(self):
diff --git a/pyfr/solvers/base/elements.py b/pyfr/solvers/base/elements.py
index 2bab10a..4b453f2 100644
--- a/pyfr/solvers/base/elements.py
+++ b/pyfr/solvers/base/elements.py
@@ -347,17 +347,16 @@ class BaseElements(object, metaclass=ABCMeta):
nfp = self.nfacefpts[fidx]
rcmap = [(fpidx, eidx) for fpidx in self._srtd_face_fpts[fidx][eidx]]
- cstri = ((self._scal_fpts.leadsubdim,),)*nfp
- return (self._scal_fpts.mid,)*nfp, rcmap, cstri
+ return (self._scal_fpts.mid,)*nfp, rcmap
def get_vect_fpts_for_inter(self, eidx, fidx):
nfp = self.nfacefpts[fidx]
rcmap = [(fpidx, eidx) for fpidx in self._srtd_face_fpts[fidx][eidx]]
- rcstri = ((self.nfpts, self._vect_fpts.leadsubdim),)*nfp
+ rstri = (self.nfpts,)*nfp
- return (self._vect_fpts.mid,)*nfp, rcmap, rcstri
+ return (self._vect_fpts.mid,)*nfp, rcmap, rstri
def get_ploc_for_inter(self, eidx, fidx):
fpts_idx = self._srtd_face_fpts[fidx][eidx]
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/pyfr.git
More information about the debian-science-commits
mailing list