[pyfr] 16/32: Add a backend for the MIC based around pyMIC.
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Apr 21 08:21:51 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository pyfr.
commit ee26db758a6e0c32c24c56b139766d665f0589e9
Author: Freddie Witherden <freddie at witherden.org>
Date: Tue Jan 19 16:38:05 2016 +0000
Add a backend for the MIC based around pyMIC.
---
pyfr/backends/__init__.py | 1 +
pyfr/backends/mic/__init__.py | 3 +
pyfr/backends/mic/base.py | 78 +++++++++++
pyfr/backends/mic/blasext.py | 77 +++++++++++
pyfr/backends/mic/cblas.py | 44 ++++++
pyfr/backends/mic/compiler.py | 54 ++++++++
pyfr/backends/mic/generator.py | 217 ++++++++++++++++++++++++++++++
pyfr/backends/mic/kernels/__init__.py | 1 +
pyfr/backends/mic/kernels/axnpby.mako | 37 +++++
pyfr/backends/mic/kernels/base.mako | 20 +++
pyfr/backends/mic/kernels/errest.mako | 20 +++
pyfr/backends/mic/kernels/gemm.mako | 14 ++
pyfr/backends/mic/kernels/loop-sched.mako | 69 ++++++++++
pyfr/backends/mic/kernels/pack.mako | 32 +++++
pyfr/backends/mic/packing.py | 45 +++++++
pyfr/backends/mic/provider.py | 29 ++++
pyfr/backends/mic/types.py | 141 +++++++++++++++++++
setup.py | 4 +
18 files changed, 886 insertions(+)
diff --git a/pyfr/backends/__init__.py b/pyfr/backends/__init__.py
index d875db9..0891085 100644
--- a/pyfr/backends/__init__.py
+++ b/pyfr/backends/__init__.py
@@ -2,6 +2,7 @@
from pyfr.backends.base import BaseBackend
from pyfr.backends.cuda import CUDABackend
+from pyfr.backends.mic import MICBackend
from pyfr.backends.opencl import OpenCLBackend
from pyfr.backends.openmp import OpenMPBackend
from pyfr.util import subclass_where
diff --git a/pyfr/backends/mic/__init__.py b/pyfr/backends/mic/__init__.py
new file mode 100644
index 0000000..46f201c
--- /dev/null
+++ b/pyfr/backends/mic/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+
+from pyfr.backends.mic.base import MICBackend
diff --git a/pyfr/backends/mic/base.py b/pyfr/backends/mic/base.py
new file mode 100644
index 0000000..512ef81
--- /dev/null
+++ b/pyfr/backends/mic/base.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+from pyfr.backends.base import BaseBackend
+from pyfr.mpiutil import get_local_rank
+from pyfr.template import DottedTemplateLookup
+
+
+class MICBackend(BaseBackend):
+ name = 'mic'
+
+ def __init__(self, cfg):
+ super().__init__(cfg)
+
+ import pymic as mic
+
+ # Get the device ID to use
+ devid = cfg.get('backend-mic', 'device-id', 'local-rank')
+
+ # Handle the local-rank case
+ if devid == 'local-rank':
+ devid = str(get_local_rank())
+
+ # Get a handle to the desired device
+ self.dev = mic.devices[int(devid)]
+
+ # Default stream
+ self.sdflt = self.dev.get_default_stream()
+
+ # Take the alignment requirement to be 64-bytes
+ self.alignb = 64
+
+ from pyfr.backends.mic import (blasext, cblas, packing, provider,
+ types)
+
+ # Register our data types
+ self.base_matrix_cls = types.MICMatrixBase
+ self.const_matrix_cls = types.MICConstMatrix
+ self.matrix_cls = types.MICMatrix
+ self.matrix_bank_cls = types.MICMatrixBank
+ self.matrix_rslice_cls = types.MICMatrixRSlice
+ self.queue_cls = types.MICQueue
+ self.view_cls = types.MICView
+ self.xchg_matrix_cls = types.MICXchgMatrix
+ self.xchg_view_cls = types.MICXchgView
+
+ # Template lookup
+ self.lookup = DottedTemplateLookup(
+ 'pyfr.backends.mic.kernels',
+ fpdtype=self.fpdtype, alignb=self.alignb
+ )
+
+ # Kernel provider classes
+ kprovcls = [provider.MICPointwiseKernelProvider,
+ blasext.MICBlasExtKernels,
+ packing.MICPackingKernels,
+ cblas.MICCBLASKernels]
+ self._providers = [k(self) for k in kprovcls]
+
+ # Pointwise kernels
+ self.pointwise = self._providers[0]
+
+ def _malloc_impl(self, nbytes):
+ stream = self.sdflt
+
+ # Allocate an empty buffer on the device
+ buf = stream.allocate_device_memory(nbytes)
+
+ # Attach the raw device pointer
+ buf.dev_ptr = stream.translate_device_pointer(buf)
+
+ # Zero the buffer
+ zeros = np.zeros(nbytes, dtype=np.uint8)
+ stream.transfer_host2device(zeros.ctypes.data, buf, nbytes)
+ stream.sync()
+
+ return buf
diff --git a/pyfr/backends/mic/blasext.py b/pyfr/backends/mic/blasext.py
new file mode 100644
index 0000000..cb740fb
--- /dev/null
+++ b/pyfr/backends/mic/blasext.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+from pyfr.backends.mic.provider import MICKernelProvider
+from pyfr.backends.base import ComputeKernel
+
+
+class MICBlasExtKernels(MICKernelProvider):
+ def axnpby(self, *arr):
+ if any(arr[0].traits != x.traits for x in arr[1:]):
+ raise ValueError('Incompatible matrix types')
+
+ nv = len(arr)
+ nrow, leaddim, leadsubdim, dtype = arr[0].traits
+
+ # Render the kernel template
+ src = self.backend.lookup.get_template('axnpby').render(nv=nv)
+
+ # Build the kernel
+ kern = self._build_kernel('axnpby', src,
+ [np.int32] + [np.intp]*nv + [dtype]*nv)
+
+ # Determine the total element count in the matrices
+ cnt = leaddim*nrow
+
+ class AxnpbyKernel(ComputeKernel):
+ def run(self, queue, *consts):
+ args = [x.data for x in arr] + list(consts)
+ queue.mic_stream_comp.invoke(kern, cnt, *args)
+
+ return AxnpbyKernel()
+
+ def copy(self, dst, src):
+ if dst.traits != src.traits:
+ raise ValueError('Incompatible matrix types')
+
+ class CopyKernel(ComputeKernel):
+ def run(self, queue):
+ queue.mic_stream_comp.transfer_device2device(
+ src.basedata, dst.basedata, dst.nbytes, src.offset,
+ dst.offset
+ )
+
+ return CopyKernel()
+
+ def errest(self, x, y, z):
+ if x.traits != y.traits != z.traits:
+ raise ValueError('Incompatible matrix types')
+
+ cnt = x.leaddim*x.nrow
+ dtype = x.dtype
+
+ # Allocate space for the return value
+ reth = np.zeros(1)
+ retd = self.backend.sdflt.bind(reth, update_device=False)
+
+ # Render the reduction kernel template
+ src = self.backend.lookup.get_template('errest').render()
+
+ # Build
+ rkern = self._build_kernel(
+ 'errest', src, [np.int32] + [np.intp]*4 + [dtype]*2, restype=dtype
+ )
+
+ class ErrestKernel(ComputeKernel):
+ @property
+ def retval(self):
+ return float(reth[0])
+
+ def run(self, queue, atol, rtol):
+ queue.mic_stream_comp.invoke(
+ rkern, cnt, retd, x.data, y.data, z.data, atol, rtol
+ )
+ retd.update_host()
+
+ return ErrestKernel()
diff --git a/pyfr/backends/mic/cblas.py b/pyfr/backends/mic/cblas.py
new file mode 100644
index 0000000..2037e77
--- /dev/null
+++ b/pyfr/backends/mic/cblas.py
@@ -0,0 +1,44 @@
+ # -*- coding: utf-8 -*-
+
+import numpy as np
+
+from pyfr.backends.base import ComputeKernel
+from pyfr.backends.mic.provider import MICKernelProvider
+
+
+class MICCBLASKernels(MICKernelProvider):
+ def mul(self, a, b, out, alpha=1.0, beta=0.0):
+ # Ensure the matrices are compatible
+ if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
+ raise ValueError('Incompatible matrices for out = a*b')
+
+ m, n, k = a.nrow, b.ncol, a.ncol
+
+ # Data type
+ if a.dtype == np.float64:
+ cblas_gemm = 'cblas_dgemm'
+ else:
+ cblas_gemm = 'cblas_sgemm'
+
+ # Render the kernel template
+ tpl = self.backend.lookup.get_template('gemm')
+ src = tpl.render(cblas_gemm=cblas_gemm)
+
+ # Argument types for gemm
+ argt = [
+ np.intp, np.int32, np.int32, np.int32,
+ a.dtype, np.intp, np.int32, np.intp, np.int32,
+ a.dtype, np.intp, np.int32
+ ]
+
+ # Build
+ gemm = self._build_kernel('gemm', src, argt)
+
+ class MulKernel(ComputeKernel):
+ def run(self, queue):
+ queue.mic_stream_comp.invoke(
+ gemm, m, n, k, a.data, b.data, out.data, a.leaddim,
+ b.leaddim, out.leaddim, alpha, beta
+ )
+
+ return MulKernel()
diff --git a/pyfr/backends/mic/compiler.py b/pyfr/backends/mic/compiler.py
new file mode 100644
index 0000000..f550ce7
--- /dev/null
+++ b/pyfr/backends/mic/compiler.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+import itertools as it
+import os
+import tempfile
+
+from pytools.prefork import call_capture_output
+
+from pyfr.util import rm
+
+
+class MICSourceModule(object):
+ _dir_seq = it.count()
+
+ def __init__(self, src, dev, cfg):
+ # Create a scratch directory
+ tmpidx = next(self._dir_seq)
+ tmpdir = tempfile.mkdtemp(prefix='pyfr-{0}-'.format(tmpidx))
+
+ # Find MKL
+ mklroot = cfg.get('backend-mic', 'mkl-root', '$MKLROOT')
+
+ try:
+ # File names
+ cn, ln = 'tmp.c', 'libtmp.so'
+
+ # Write the source code out
+ with open(os.path.join(tmpdir, cn), 'w') as f:
+ f.write(src)
+
+ # Compile and link
+ cmd = [
+ 'icc',
+ '-shared', # Create a shared library
+ '-std=c99', # Enable C99 support
+ '-Ofast', # Optimise
+ '-mmic', # Compile for the MIC
+ '-fopenmp', # Enable OpenMP support
+ '-L{0}/lib/mic'.format(mklroot), # MKL stuff
+ '-lmkl_intel_lp64', # ...
+ '-lmkl_core', # ...
+ '-lmkl_intel_thread', # ...
+ '-fPIC', # Position-independent code
+ '-o', ln, cn
+ ]
+ call_capture_output(cmd, cwd=tmpdir)
+
+ # Load
+ self._mod = dev.load_library(os.path.join(tmpdir, ln))
+ finally:
+ pass #rm(tmpdir)
+
+ def function(self, name, argtypes, restype=None):
+ return getattr(self._mod, name)
diff --git a/pyfr/backends/mic/generator.py b/pyfr/backends/mic/generator.py
new file mode 100644
index 0000000..4608acb
--- /dev/null
+++ b/pyfr/backends/mic/generator.py
@@ -0,0 +1,217 @@
+# -*- coding: utf-8 -*-
+
+import re
+
+from pyfr.backends.base.generator import BaseKernelGenerator
+from pyfr.util import ndrange
+
+
+class MICKernelGenerator(BaseKernelGenerator):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # Specialise
+ self._dims = ['_nx'] if self.ndim == 1 else ['_ny', '_nx']
+
+ def render(self):
+ # Argument unpacking
+ spec, unpack = self._emit_spec_unpack()
+
+ if self.ndim == 1:
+ body = self._emit_body_1d()
+ return '''
+ void {name}({spec})
+ {{
+ {unpack}
+ #pragma omp parallel
+ {{
+ int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
+ int cb, ce;
+ loop_sched_1d(_nx, align, &cb, &ce);
+ for (int _x = cb; _x < ce; _x++)
+ {{
+ {body}
+ }}
+ }}
+ }}'''.format(name=self.name, spec=spec, unpack=unpack,
+ body=body)
+ else:
+ innerfn = self._emit_inner_func()
+ innercall = self._emit_inner_call()
+ return '''{innerfn}
+ void {name}({spec})
+ {{
+ {unpack}
+ #pragma omp parallel
+ {{
+ int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
+ int rb, re, cb, ce;
+ loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);
+ for (int _y = rb; _y < re; _y++)
+ {{
+ {innercall}
+ }}
+ }}
+ }}'''.format(innerfn=innerfn, spec=spec, unpack=unpack,
+ name=self.name, innercall=innercall)
+
+ def _emit_inner_func(self):
+ # Get the specification and body
+ spec = self._emit_inner_spec()
+ body = self._emit_body_2d()
+
+ # Combine
+ return '''{spec}
+ {{
+ for (int _x = 0; _x < _nx; _x++)
+ {{
+ {body}
+ }}
+ }}'''.format(spec=spec, body=body)
+
+ def _emit_inner_call(self):
+ # Arguments for the inner function
+ iargs = ['ce - cb']
+ iargs.extend(sa.name for sa in self.scalargs)
+
+ for va in self.vectargs:
+ iargs.extend(self._offset_arg_array_2d(va))
+
+ return '{0}_inner({1});'.format(self.name, ', '.join(iargs))
+
+ def _emit_inner_spec(self):
+ # Inner dimension
+ ikargs = ['int _nx']
+
+ # Add any scalar arguments
+ ikargs.extend('{0.dtype} {0.name}'.format(sa) for sa in self.scalargs)
+
+ # Vector arguments (always arrays as we're 2D)
+ for va in self.vectargs:
+ const = 'const' if va.intent == 'in' else ''
+ stmt = '{0} {1.dtype} *__restrict__ {1.name}_v'.format(const, va)
+ stmt = stmt.strip()
+
+ if va.ncdim == 0:
+ ikargs.append(stmt)
+ else:
+ for ij in ndrange(*va.cdims):
+ ikargs.append(stmt + 'v'.join(str(n) for n in ij))
+
+ return ('static PYFR_NOINLINE void {0}_inner({1})'
+ .format(self.name, ', '.join(ikargs)))
+
+ def _emit_spec_unpack(self):
+ # Start by unpacking the dimensions
+ kspec = ['long *arg{0}' for d in self._dims]
+ kpack = ['int {0} = *arg{{0}};'.format(d) for d in self._dims]
+
+ # Next unpack any scalar arguments
+ kspec.extend('double *arg{0}' for sa in self.scalargs)
+ kpack.extend('{0.dtype} {0.name} = *arg{{0}};'.format(sa)
+ for sa in self.scalargs)
+
+ # Finally, add the vector arguments
+ for va in self.vectargs:
+ # Views
+ if va.isview:
+ kspec.append('void **arg{0}')
+ kpack.append('{0.dtype} *{0.name}_v = *arg{{0}};'
+ .format(va))
+
+ kspec.append('void **arg{0}')
+ kpack.append('const int *{0.name}_vix = *arg{{0}};'
+ .format(va))
+
+ if va.ncdim >= 1:
+ kspec.append('void **arg{0}')
+ kpack.append('const int *{0.name}_vcstri = *arg{{0}};'
+ .format(va))
+ if va.ncdim == 2:
+ kspec.append('void **arg{0}')
+ kpack.append('const int *{0.name}_vrstri = *arg{{0}};'
+ .format(va))
+ # Arrays
+ else:
+ # Intent in arguments should be marked constant
+ const = 'const' if va.intent == 'in' else ''
+
+ kspec.append('void **arg{0}')
+ kpack.append('{0} {1.dtype}* {1.name}_v = *arg{{0}};'
+ .format(const, va).strip())
+
+ # If we are a matrix (ndim = 2) or a non-MPI stacked
+ # vector then a leading (sub) dimension is required
+ if self.ndim == 2 or (va.ncdim > 0 and not va.ismpi):
+ kspec.append('long *arg{0}')
+ kpack.append('int lsd{0.name} = *arg{{0}};'.format(va))
+
+ return (', '.join(a.format(i) for i, a in enumerate(kspec)),
+ '\n'.join(a.format(i) for i, a in enumerate(kpack)))
+
+ def _emit_body_1d(self):
+ body = self.body
+ ptns = [r'\b{0}\b', r'\b{0}\[(\d+)\]', r'\b{0}\[(\d+)\]\[(\d+)\]']
+
+ for va in self.vectargs:
+ # Dereference the argument
+ darg = self._deref_arg(va)
+
+ # Substitute
+ body = re.sub(ptns[va.ncdim].format(va.name), darg, body)
+
+ return body
+
+ def _emit_body_2d(self):
+ body = self.body
+ ptns = [r'\b{0}\b', r'\b{0}\[(\d+)\]', r'\b{0}\[(\d+)\]\[(\d+)\]']
+ subs = ['{0}_v[_x]', r'{0}_v\1[_x]', r'{0}_v\1v\2[_x]']
+
+ for va in self.vectargs:
+ body = re.sub(ptns[va.ncdim].format(va.name),
+ subs[va.ncdim].format(va.name), body)
+
+ return body
+
+ def _deref_arg(self, arg):
+ if arg.isview:
+ ptns = ['{0}_v[{0}_vix[_x]]',
+ r'{0}_v[{0}_vix[_x] + {0}_vcstri[_x]*\1]',
+ r'{0}_v[{0}_vix[_x] + {0}_vrstri[_x]*\1'
+ r' + {0}_vcstri[_x]*\2]']
+
+ return ptns[arg.ncdim].format(arg.name)
+ else:
+ # Leading (sub) dimension
+ lsdim = 'lsd' + arg.name if not arg.ismpi else '_nx'
+
+ # Vector name_v[_x]
+ if arg.ncdim == 0:
+ ix = '_x'
+ # Stacked vector; name_v[lsdim*\1 + _x]
+ elif arg.ncdim == 1:
+ ix = r'{0}*\1 + _x'.format(lsdim)
+ # Doubly stacked vector; name_v[lsdim*nv*\1 + lsdim*\2 + _x]
+ else:
+ ix = r'{0}*{1}*\1 + {0}*\2 + _x'.format(lsdim, arg.cdims[1])
+
+ return '{0}_v[{1}]'.format(arg.name, ix)
+
+ def _offset_arg_array_2d(self, arg):
+ stmts = []
+
+ # Matrix; name + _y*lsdim + cb
+ if arg.ncdim == 0:
+ stmts.append('{0}_v + _y*lsd{0} + cb'.format(arg.name))
+ # Stacked matrix; name + (_y*nv + <0>)*lsdim + cb
+ elif arg.ncdim == 1:
+ stmts.extend('{0}_v + (_y*{1} + {2})*lsd{0} + cb'
+ .format(arg.name, arg.cdims[0], i)
+ for i in range(arg.cdims[0]))
+ # Doubly stacked matrix; name + ((<0>*_ny + _y)*nv + <1>)*lsdim + cb
+ else:
+ stmts.extend('{0}_v + (({1}*_ny + _y)*{2} + {3})*lsd{0} + cb'
+ .format(arg.name, i, arg.cdims[1], j)
+ for i, j in ndrange(*arg.cdims))
+
+ return stmts
diff --git a/pyfr/backends/mic/kernels/__init__.py b/pyfr/backends/mic/kernels/__init__.py
new file mode 100644
index 0000000..40a96af
--- /dev/null
+++ b/pyfr/backends/mic/kernels/__init__.py
@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-
diff --git a/pyfr/backends/mic/kernels/axnpby.mako b/pyfr/backends/mic/kernels/axnpby.mako
new file mode 100644
index 0000000..a21160e
--- /dev/null
+++ b/pyfr/backends/mic/kernels/axnpby.mako
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+<%inherit file='base'/>
+<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
+
+static PYFR_NOINLINE void
+axnpby_inner(int n,
+ ${', '.join('fpdtype_t *__restrict__ x{0}, '
+ 'fpdtype_t a{0}'.format(i) for i in range(nv))})
+{
+ for (int i = 0; i < n; i++)
+ {
+ fpdtype_t axn = ${pyfr.dot('a{j}', 'x{j}[i]', j=(1, nv))};
+
+ if (a0 == 0.0)
+ x0[i] = axn;
+ else if (a0 == 1.0)
+ x0[i] += axn;
+ else
+ x0[i] = a0*x0[i] + axn;
+ }
+}
+
+void
+axnpby(long *n,
+ ${', '.join('fpdtype_t **x{0}'.format(i) for i in range(nv))},
+ ${', '.join('double *a{0}'.format(i) for i in range(nv))})
+{
+ #pragma omp parallel
+ {
+ int begin, end;
+ loop_sched_1d(*n, PYFR_ALIGN_BYTES / sizeof(fpdtype_t), &begin, &end);
+
+ axnpby_inner(end - begin,
+ ${', '.join('*x{0} + begin, *a{0}'.format(i)
+ for i in range(nv))});
+ }
+}
diff --git a/pyfr/backends/mic/kernels/base.mako b/pyfr/backends/mic/kernels/base.mako
new file mode 100644
index 0000000..1f4d487
--- /dev/null
+++ b/pyfr/backends/mic/kernels/base.mako
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
+
+#include <omp.h>
+#include <stdlib.h>
+#include <tgmath.h>
+
+#define PYFR_ALIGN_BYTES ${alignb}
+#define PYFR_NOINLINE __attribute__ ((noinline))
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define max(a, b) ((a) > (b) ? (a) : (b))
+
+// Typedefs
+typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t;
+
+// OpenMP static loop scheduling functions
+<%include file='loop-sched'/>
+
+${next.body()}
diff --git a/pyfr/backends/mic/kernels/errest.mako b/pyfr/backends/mic/kernels/errest.mako
new file mode 100644
index 0000000..78b17a2
--- /dev/null
+++ b/pyfr/backends/mic/kernels/errest.mako
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+<%inherit file='base'/>
+<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
+
+void
+errest(long *n, double *out,
+ fpdtype_t **xp, fpdtype_t **yp, fpdtype_t **zp,
+ double *atolp, double *rtolp)
+{
+ fpdtype_t *x = *xp, *y = *yp, *z = *zp;
+ fpdtype_t atol = *atolp, rtol = *rtolp;
+
+ fpdtype_t sum = 0.0;
+
+ #pragma omp parallel for reduction(+:sum)
+ for (int i = 0; i < *n; i++)
+ sum += pow(x[i]/(atol + rtol*max(fabs(y[i]), fabs(z[i]))), 2);
+
+ *out = sum;
+}
diff --git a/pyfr/backends/mic/kernels/gemm.mako b/pyfr/backends/mic/kernels/gemm.mako
new file mode 100644
index 0000000..0018239
--- /dev/null
+++ b/pyfr/backends/mic/kernels/gemm.mako
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+<%inherit file='base'/>
+
+#include <mkl.h>
+
+void
+gemm(long *M, long *N, long *K,
+ fpdtype_t **A, fpdtype_t **B, fpdtype_t **C,
+ long *lda, long *ldb, long *ldc,
+ double *alpha, double *beta)
+{
+ ${cblas_gemm}(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+ *M, *N, *K, *alpha, *A, *lda, *B, *ldb, *beta, *C, *ldc);
+}
diff --git a/pyfr/backends/mic/kernels/loop-sched.mako b/pyfr/backends/mic/kernels/loop-sched.mako
new file mode 100644
index 0000000..9e3f4c0
--- /dev/null
+++ b/pyfr/backends/mic/kernels/loop-sched.mako
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+
+static inline int
+gcd(int a, int b)
+{
+ return (a == 0) ? b : gcd(b % a, a);
+}
+
+static inline void
+loop_sched_1d(int n, int align, int *b, int *e)
+{
+ int tid = omp_get_thread_num();
+ int nth = omp_get_num_threads();
+
+ // Round up n to be a multiple of nth
+ int rn = n + nth - 1 - (n - 1) % nth;
+
+ // Nominal tile size
+ int sz = rn / nth;
+
+ // Handle alignment
+ sz += align - 1 - (sz - 1) % align;
+
+ // Assign the starting and ending index
+ *b = sz * tid;
+ *e = min(*b + sz, n);
+
+ // Clamp
+ if (*b >= n)
+ *b = *e = 0;
+}
+
+static inline void
+loop_sched_2d(int nrow, int ncol, int colalign,
+ int *rowb, int *rowe, int *colb, int *cole)
+{
+ int tid = omp_get_thread_num();
+ int nth = omp_get_num_threads();
+
+ // Distribute threads
+ int nrowth = gcd(nrow, nth);
+ int ncolth = nth / nrowth;
+
+ // Row and column indices for our thread
+ int rowix = tid / ncolth;
+ int colix = tid % ncolth;
+
+ // Round up ncol to be a multiple of ncolth
+ int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;
+
+ // Nominal tile size
+ int ntilerow = nrow / nrowth;
+ int ntilecol = rncol / ncolth;
+
+ // Handle column alignment
+ ntilecol += colalign - 1 - (ntilecol - 1) % colalign;
+
+ // Assign the starting and ending row to each thread
+ *rowb = ntilerow * rowix;
+ *rowe = *rowb + ntilerow;
+
+ // Assign the starting and ending column to each thread
+ *colb = ntilecol * colix;
+ *cole = min(*colb + ntilecol, ncol);
+
+ // Clamp
+ if (*colb >= ncol)
+ *colb = *cole = 0;
+}
diff --git a/pyfr/backends/mic/kernels/pack.mako b/pyfr/backends/mic/kernels/pack.mako
new file mode 100644
index 0000000..9f32a92
--- /dev/null
+++ b/pyfr/backends/mic/kernels/pack.mako
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+<%inherit file='base'/>
+
+void
+pack_view(long *n_a, long *nrv_a, long *ncv_a,
+ void **v_a, void **vix_a, void **vcstri_a, void **vrstri_a,
+ void **pmat_a)
+{
+ int n = *n_a;
+ int nrv = *nrv_a;
+ int ncv = *ncv_a;
+
+ fpdtype_t *v = *v_a;
+ int *vix = *vix_a;
+ int *vcstri = (vcstri_a) ? *vcstri_a : 0;
+ int *vrstri = (vrstri_a) ? *vrstri_a : 0;
+ fpdtype_t *pmat = *pmat_a;
+
+ if (ncv == 1)
+ for (int i = 0; i < n; i++)
+ pmat[i] = v[vix[i]];
+ else if (nrv == 1)
+ for (int i = 0; i < n; i++)
+ for (int c = 0; c < ncv; c++)
+ pmat[c*n + i] = v[vix[i] + vcstri[i]*c];
+ else
+ for (int i = 0; i < n; i++)
+ for (int r = 0; r < nrv; r++)
+ for (int c = 0; c < ncv; c++)
+ pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r
+ + vcstri[i]*c];
+}
diff --git a/pyfr/backends/mic/packing.py b/pyfr/backends/mic/packing.py
new file mode 100644
index 0000000..043975d
--- /dev/null
+++ b/pyfr/backends/mic/packing.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+
+from pyfr.backends.base import ComputeKernel, NullComputeKernel
+from pyfr.backends.base.packing import BasePackingKernels
+from pyfr.backends.mic.provider import MICKernelProvider
+
+
+class MICPackingKernels(MICKernelProvider, BasePackingKernels):
+ def pack(self, mv):
+ # An exchange view is simply a regular view plus an exchange matrix
+ m, v = mv.xchgmat, mv.view
+
+ # Render the kernel template
+ src = self.backend.lookup.get_template('pack').render()
+
+ # Build
+ kern = self._build_kernel('pack_view', src, 'iiiPPPPP')
+
+ class PackXchgViewKernel(ComputeKernel):
+ def run(self, queue):
+ # Kernel arguments
+ args = [v.n, v.nvrow, v.nvcol, v.basedata.dev_ptr,
+ v.mapping, v.cstrides, v.rstrides, m]
+ args = [getattr(arg, 'data', arg) for arg in args]
+
+ # Pack
+ queue.mic_stream_comp.invoke(kern, *args)
+
+ # Copy the packed buffer to the host
+ queue.mic_stream_comp.transfer_device2host(
+ m.basedata, m.hdata.ctypes.data, m.nbytes,
+ offset_device=m.offset
+ )
+
+ return PackXchgViewKernel()
+
+ def unpack(self, mv):
+ class UnpackXchgMatrixKernel(ComputeKernel):
+ def run(self, queue):
+ queue.mic_stream_comp.transfer_host2device(
+ mv.hdata.ctypes.data, mv.basedata, mv.nbytes,
+ offset_device=mv.offset
+ )
+
+ return UnpackXchgMatrixKernel()
diff --git a/pyfr/backends/mic/provider.py b/pyfr/backends/mic/provider.py
new file mode 100644
index 0000000..545c919
--- /dev/null
+++ b/pyfr/backends/mic/provider.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+
+from pyfr.backends.base import (BaseKernelProvider,
+ BasePointwiseKernelProvider, ComputeKernel)
+from pyfr.backends.mic.compiler import MICSourceModule
+import pyfr.backends.mic.generator as generator
+from pyfr.util import memoize
+
+
+class MICKernelProvider(BaseKernelProvider):
+ @memoize
+ def _build_kernel(self, name, src, argtypes, restype=None):
+ mod = MICSourceModule(src, self.backend.dev, self.backend.cfg)
+ return mod.function(name, argtypes, restype)
+
+
+class MICPointwiseKernelProvider(MICKernelProvider,
+ BasePointwiseKernelProvider):
+ kernel_generator_cls = generator.MICKernelGenerator
+
+ def _instantiate_kernel(self, dims, fun, arglst):
+ class PointwiseKernel(ComputeKernel):
+ def run(self, queue, **kwargs):
+ narglst = [kwargs.get(ka, ka) for ka in arglst]
+ narglst = [getattr(arg, 'dev_ptr', arg) for arg in narglst]
+ narglst = [getattr(arg, 'data', arg) for arg in narglst]
+ queue.mic_stream_comp.invoke(fun, *narglst)
+
+ return PointwiseKernel()
diff --git a/pyfr/backends/mic/types.py b/pyfr/backends/mic/types.py
new file mode 100644
index 0000000..c29ccf4
--- /dev/null
+++ b/pyfr/backends/mic/types.py
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+import pyfr.backends.base as base
+from pyfr.util import lazyprop
+
+
+class MICMatrixBase(base.MatrixBase):
+ def onalloc(self, basedata, offset):
+ self.basedata = basedata
+ self.data = basedata.dev_ptr + offset
+
+ self.offset = offset
+
+ # Process any initial value
+ if self._initval is not None:
+ self._set(self._initval)
+
+ # Remove
+ del self._initval
+
+ def _get(self):
+ # Allocate an empty buffer
+ buf = np.empty(self.datashape, dtype=self.dtype)
+
+ # Copy using the default stream
+ self.backend.sdflt.transfer_device2host(
+ self.basedata, buf.ctypes.data, self.nbytes,
+ offset_device=self.offset
+ )
+ self.backend.sdflt.sync()
+
+ # Slice to give the expected I/O shape
+ return buf[...,:self.ioshape[-1]]
+
+ def _set(self, ary):
+ # Allocate a new buffer with suitable padding and assign
+ buf = np.zeros(self.datashape, dtype=self.dtype)
+ buf[...,:self.ioshape[-1]] = ary
+
+ # Copy using the default stream
+ self.backend.sdflt.transfer_host2device(
+ buf.ctypes.data, self.basedata, self.nbytes,
+ offset_device=self.offset
+ )
+ self.backend.sdflt.sync()
+
+
+class MICMatrix(MICMatrixBase, base.Matrix):
+ pass
+
+
+class MICMatrixRSlice(base.MatrixRSlice):
+ @property
+ def data(self):
+ return self.basedata.dev_ptr + self.offset
+
+
+class MICMatrixBank(base.MatrixBank):
+ pass
+
+
+class MICConstMatrix(MICMatrixBase, base.ConstMatrix):
+ pass
+
+
+class MICXchgMatrix(MICMatrix, base.XchgMatrix):
+ def __init__(self, backend, ioshape, initval, extent, aliases, tags):
+ # Call the standard matrix constructor
+ super().__init__(backend, ioshape, initval, extent, aliases, tags)
+
+ # Allocate an empty buffer on the host for MPI to send/recv from
+ self.hdata = np.empty((self.nrow, self.ncol), self.dtype)
+
+
+class MICXchgView(base.XchgView):
+ pass
+
+
+class MICView(base.View):
+ pass
+
+
+class MICQueue(base.Queue):
+ def __init__(self, backend):
+ super().__init__(backend)
+
+ # MIC stream
+ self.mic_stream_comp = backend.sdflt
+
+ def _exec_item(self, item, args, kwargs):
+ item.run(self, *args, **kwargs)
+
+ self.mic_stream_comp.sync()
+
+ self._last = item
+
+ def _exec_nonblock(self):
+ while self._items:
+ kern = self._items[0][0]
+
+ # See if kern will block
+ if self._at_sequence_point(kern) or kern.ktype == 'compute':
+ break
+
+ self._exec_item(*self._items.popleft())
+
+ def _wait(self):
+ if self._last and self._last.ktype == 'mpi':
+ from mpi4py import MPI
+
+ MPI.Prequest.Waitall(self.mpi_reqs)
+ self.mpi_reqs = []
+
+ self._last = None
+
+ def _at_sequence_point(self, item):
+ last = self._last
+
+ return last and last.ktype == 'mpi' and item.ktype != 'mpi'
+
+ @staticmethod
+ def runall(queues):
+ # Fire off any non-blocking kernels
+ for q in queues:
+ q._exec_nonblock()
+
+ while any(queues):
+ # Execute a (potentially) blocking item from each queue
+ for q in filter(None, queues):
+ q._exec_nowait()
+
+ # Now consider kernels which will wait
+ for q in filter(None, queues):
+ q._exec_next()
+ q._exec_nonblock()
+
+ # Wait for all tasks to complete
+ for q in queues:
+ q._wait()
diff --git a/setup.py b/setup.py
index 3739294..ef4b03b 100755
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,8 @@ modules = [
'pyfr.backends.base',
'pyfr.backends.cuda',
'pyfr.backends.cuda.kernels',
+ 'pyfr.backends.mic',
+ 'pyfr.backends.mic.kernels',
'pyfr.backends.opencl',
'pyfr.backends.opencl.kernels',
'pyfr.backends.openmp',
@@ -60,6 +62,7 @@ tests = [
# Data
package_data = {
'pyfr.backends.cuda.kernels': ['*.mako'],
+ 'pyfr.backends.mic.kernels': ['*.mako'],
'pyfr.backends.opencl.kernels': ['*.mako'],
'pyfr.backends.openmp.kernels': ['*.mako'],
'pyfr.quadrules': [
@@ -98,6 +101,7 @@ install_requires = [
# Soft dependencies
extras_require = {
'cuda': ['pycuda >= 2011.2'],
+ 'mic': ['pymic >= 0.7'],
'opencl': ['pyopencl >= 2015.2.4']
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/pyfr.git
More information about the debian-science-commits
mailing list