[pyfr] 16/32: Add a backend for the MIC based around pyMIC.

Thu Apr 21 08:21:51 UTC 2016

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository pyfr.

commit ee26db758a6e0c32c24c56b139766d665f0589e9
Author: Freddie Witherden <freddie at witherden.org>
Date:   Tue Jan 19 16:38:05 2016 +0000

    Add a backend for the MIC based around pyMIC.
---
 pyfr/backends/__init__.py                 |   1 +
 pyfr/backends/mic/__init__.py             |   3 +
 pyfr/backends/mic/base.py                 |  78 +++++++++++
 pyfr/backends/mic/blasext.py              |  77 +++++++++++
 pyfr/backends/mic/cblas.py                |  44 ++++++
 pyfr/backends/mic/compiler.py             |  54 ++++++++
 pyfr/backends/mic/generator.py            | 217 ++++++++++++++++++++++++++++++
 pyfr/backends/mic/kernels/__init__.py     |   1 +
 pyfr/backends/mic/kernels/axnpby.mako     |  37 +++++
 pyfr/backends/mic/kernels/base.mako       |  20 +++
 pyfr/backends/mic/kernels/errest.mako     |  20 +++
 pyfr/backends/mic/kernels/gemm.mako       |  14 ++
 pyfr/backends/mic/kernels/loop-sched.mako |  69 ++++++++++
 pyfr/backends/mic/kernels/pack.mako       |  32 +++++
 pyfr/backends/mic/packing.py              |  45 +++++++
 pyfr/backends/mic/provider.py             |  29 ++++
 pyfr/backends/mic/types.py                | 141 +++++++++++++++++++
 setup.py                                  |   4 +
 18 files changed, 886 insertions(+)

diff --git a/pyfr/backends/__init__.py b/pyfr/backends/__init__.py
index d875db9..0891085 100644
--- a/pyfr/backends/__init__.py
+++ b/pyfr/backends/__init__.py
@@ -2,6 +2,7 @@
 
 from pyfr.backends.base import BaseBackend
 from pyfr.backends.cuda import CUDABackend
+from pyfr.backends.mic import MICBackend
 from pyfr.backends.opencl import OpenCLBackend
 from pyfr.backends.openmp import OpenMPBackend
 from pyfr.util import subclass_where
diff --git a/pyfr/backends/mic/__init__.py b/pyfr/backends/mic/__init__.py
new file mode 100644
index 0000000..46f201c
--- /dev/null
+++ b/pyfr/backends/mic/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+
+from pyfr.backends.mic.base import MICBackend
diff --git a/pyfr/backends/mic/base.py b/pyfr/backends/mic/base.py
new file mode 100644
index 0000000..512ef81
--- /dev/null
+++ b/pyfr/backends/mic/base.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+from pyfr.backends.base import BaseBackend
+from pyfr.mpiutil import get_local_rank
+from pyfr.template import DottedTemplateLookup
+
+
+class MICBackend(BaseBackend):
+    name = 'mic'
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+        import pymic as mic
+
+        # Get the device ID to use
+        devid = cfg.get('backend-mic', 'device-id', 'local-rank')
+
+        # Handle the local-rank case
+        if devid == 'local-rank':
+            devid = str(get_local_rank())
+
+        # Get a handle to the desired device
+        self.dev = mic.devices[int(devid)]
+
+        # Default stream
+        self.sdflt = self.dev.get_default_stream()
+
+        # Take the alignment requirement to be 64-bytes
+        self.alignb = 64
+
+        from pyfr.backends.mic import (blasext, cblas, packing, provider,
+                                       types)
+
+        # Register our data types
+        self.base_matrix_cls = types.MICMatrixBase
+        self.const_matrix_cls = types.MICConstMatrix
+        self.matrix_cls = types.MICMatrix
+        self.matrix_bank_cls = types.MICMatrixBank
+        self.matrix_rslice_cls = types.MICMatrixRSlice
+        self.queue_cls = types.MICQueue
+        self.view_cls = types.MICView
+        self.xchg_matrix_cls = types.MICXchgMatrix
+        self.xchg_view_cls = types.MICXchgView
+
+        # Template lookup
+        self.lookup = DottedTemplateLookup(
+            'pyfr.backends.mic.kernels',
+            fpdtype=self.fpdtype, alignb=self.alignb
+        )
+
+        # Kernel provider classes
+        kprovcls = [provider.MICPointwiseKernelProvider,
+                    blasext.MICBlasExtKernels,
+                    packing.MICPackingKernels,
+                    cblas.MICCBLASKernels]
+        self._providers = [k(self) for k in kprovcls]
+
+        # Pointwise kernels
+        self.pointwise = self._providers[0]
+
+    def _malloc_impl(self, nbytes):
+        stream = self.sdflt
+
+        # Allocate an empty buffer on the device
+        buf = stream.allocate_device_memory(nbytes)
+
+        # Attach the raw device pointer
+        buf.dev_ptr = stream.translate_device_pointer(buf)
+
+        # Zero the buffer
+        zeros = np.zeros(nbytes, dtype=np.uint8)
+        stream.transfer_host2device(zeros.ctypes.data, buf, nbytes)
+        stream.sync()
+
+        return buf
diff --git a/pyfr/backends/mic/blasext.py b/pyfr/backends/mic/blasext.py
new file mode 100644
index 0000000..cb740fb
--- /dev/null
+++ b/pyfr/backends/mic/blasext.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+from pyfr.backends.mic.provider import MICKernelProvider
+from pyfr.backends.base import ComputeKernel
+
+
+class MICBlasExtKernels(MICKernelProvider):
+    def axnpby(self, *arr):
+        if any(arr[0].traits != x.traits for x in arr[1:]):
+            raise ValueError('Incompatible matrix types')
+
+        nv = len(arr)
+        nrow, leaddim, leadsubdim, dtype = arr[0].traits
+
+        # Render the kernel template
+        src = self.backend.lookup.get_template('axnpby').render(nv=nv)
+
+        # Build the kernel
+        kern = self._build_kernel('axnpby', src,
+                                  [np.int32] + [np.intp]*nv + [dtype]*nv)
+
+        # Determine the total element count in the matrices
+        cnt = leaddim*nrow
+
+        class AxnpbyKernel(ComputeKernel):
+            def run(self, queue, *consts):
+                args = [x.data for x in arr] + list(consts)
+                queue.mic_stream_comp.invoke(kern, cnt, *args)
+
+        return AxnpbyKernel()
+
+    def copy(self, dst, src):
+        if dst.traits != src.traits:
+            raise ValueError('Incompatible matrix types')
+
+        class CopyKernel(ComputeKernel):
+            def run(self, queue):
+                queue.mic_stream_comp.transfer_device2device(
+                    src.basedata, dst.basedata, dst.nbytes, src.offset,
+                    dst.offset
+                )
+
+        return CopyKernel()
+
+    def errest(self, x, y, z):
+        if x.traits != y.traits != z.traits:
+            raise ValueError('Incompatible matrix types')
+
+        cnt = x.leaddim*x.nrow
+        dtype = x.dtype
+
+        # Allocate space for the return value
+        reth = np.zeros(1)
+        retd = self.backend.sdflt.bind(reth, update_device=False)
+
+        # Render the reduction kernel template
+        src = self.backend.lookup.get_template('errest').render()
+
+        # Build
+        rkern = self._build_kernel(
+            'errest', src, [np.int32] + [np.intp]*4 + [dtype]*2, restype=dtype
+        )
+
+        class ErrestKernel(ComputeKernel):
+            @property
+            def retval(self):
+                return float(reth[0])
+
+            def run(self, queue, atol, rtol):
+                queue.mic_stream_comp.invoke(
+                    rkern, cnt, retd, x.data, y.data, z.data, atol, rtol
+                )
+                retd.update_host()
+
+        return ErrestKernel()
diff --git a/pyfr/backends/mic/cblas.py b/pyfr/backends/mic/cblas.py
new file mode 100644
index 0000000..2037e77
--- /dev/null
+++ b/pyfr/backends/mic/cblas.py
@@ -0,0 +1,44 @@
+ # -*- coding: utf-8 -*-
+
+import numpy as np
+
+from pyfr.backends.base import ComputeKernel
+from pyfr.backends.mic.provider import MICKernelProvider
+
+
+class MICCBLASKernels(MICKernelProvider):
+    def mul(self, a, b, out, alpha=1.0, beta=0.0):
+        # Ensure the matrices are compatible
+        if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
+            raise ValueError('Incompatible matrices for out = a*b')
+
+        m, n, k = a.nrow, b.ncol, a.ncol
+
+        # Data type
+        if a.dtype == np.float64:
+            cblas_gemm = 'cblas_dgemm'
+        else:
+            cblas_gemm = 'cblas_sgemm'
+
+        # Render the kernel template
+        tpl = self.backend.lookup.get_template('gemm')
+        src = tpl.render(cblas_gemm=cblas_gemm)
+
+        # Argument types for gemm
+        argt = [
+            np.intp, np.int32, np.int32, np.int32,
+            a.dtype, np.intp, np.int32, np.intp, np.int32,
+            a.dtype, np.intp, np.int32
+        ]
+
+        # Build
+        gemm = self._build_kernel('gemm', src, argt)
+
+        class MulKernel(ComputeKernel):
+            def run(self, queue):
+                queue.mic_stream_comp.invoke(
+                    gemm, m, n, k, a.data, b.data, out.data, a.leaddim,
+                    b.leaddim, out.leaddim, alpha, beta
+                )
+
+        return MulKernel()
diff --git a/pyfr/backends/mic/compiler.py b/pyfr/backends/mic/compiler.py
new file mode 100644
index 0000000..f550ce7
--- /dev/null
+++ b/pyfr/backends/mic/compiler.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+import itertools as it
+import os
+import tempfile
+
+from pytools.prefork import call_capture_output
+
+from pyfr.util import rm
+
+
+class MICSourceModule(object):
+    _dir_seq = it.count()
+
+    def __init__(self, src, dev, cfg):
+        # Create a scratch directory
+        tmpidx = next(self._dir_seq)
+        tmpdir = tempfile.mkdtemp(prefix='pyfr-{0}-'.format(tmpidx))
+
+        # Find MKL
+        mklroot = cfg.get('backend-mic', 'mkl-root', '$MKLROOT')
+
+        try:
+            # File names
+            cn, ln = 'tmp.c', 'libtmp.so'
+
+            # Write the source code out
+            with open(os.path.join(tmpdir, cn), 'w') as f:
+                f.write(src)
+
+            # Compile and link
+            cmd = [
+                'icc',
+                '-shared',                       # Create a shared library
+                '-std=c99',                      # Enable C99 support
+                '-Ofast',                        # Optimise
+                '-mmic',                         # Compile for the MIC
+                '-fopenmp',                      # Enable OpenMP support
+                '-L{0}/lib/mic'.format(mklroot), # MKL stuff
+                '-lmkl_intel_lp64',              #  ...
+                '-lmkl_core',                    #  ...
+                '-lmkl_intel_thread',            #  ...
+                '-fPIC',                         # Position-independent code
+                '-o', ln, cn
+            ]
+            call_capture_output(cmd, cwd=tmpdir)
+
+            # Load
+            self._mod = dev.load_library(os.path.join(tmpdir, ln))
+        finally:
+            pass #rm(tmpdir)
+
+    def function(self, name, argtypes, restype=None):
+        return getattr(self._mod, name)
diff --git a/pyfr/backends/mic/generator.py b/pyfr/backends/mic/generator.py
new file mode 100644
index 0000000..4608acb
--- /dev/null
+++ b/pyfr/backends/mic/generator.py
@@ -0,0 +1,217 @@
+# -*- coding: utf-8 -*-
+
+import re
+
+from pyfr.backends.base.generator import BaseKernelGenerator
+from pyfr.util import ndrange
+
+
+class MICKernelGenerator(BaseKernelGenerator):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Specialise
+        self._dims = ['_nx'] if self.ndim == 1 else ['_ny', '_nx']
+
+    def render(self):
+        # Argument unpacking
+        spec, unpack = self._emit_spec_unpack()
+
+        if self.ndim == 1:
+            body = self._emit_body_1d()
+            return '''
+                   void {name}({spec})
+                   {{
+                       {unpack}
+                       #pragma omp parallel
+                       {{
+                           int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
+                           int cb, ce;
+                           loop_sched_1d(_nx, align, &cb, &ce);
+                           for (int _x = cb; _x < ce; _x++)
+                           {{
+                               {body}
+                           }}
+                       }}
+                   }}'''.format(name=self.name, spec=spec, unpack=unpack,
+                                body=body)
+        else:
+            innerfn = self._emit_inner_func()
+            innercall = self._emit_inner_call()
+            return '''{innerfn}
+                   void {name}({spec})
+                   {{
+                       {unpack}
+                       #pragma omp parallel
+                       {{
+                           int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
+                           int rb, re, cb, ce;
+                           loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);
+                           for (int _y = rb; _y < re; _y++)
+                           {{
+                               {innercall}
+                           }}
+                       }}
+                   }}'''.format(innerfn=innerfn, spec=spec, unpack=unpack,
+                                name=self.name, innercall=innercall)
+
+    def _emit_inner_func(self):
+        # Get the specification and body
+        spec = self._emit_inner_spec()
+        body = self._emit_body_2d()
+
+        # Combine
+        return '''{spec}
+               {{
+                   for (int _x = 0; _x < _nx; _x++)
+                   {{
+                       {body}
+                   }}
+               }}'''.format(spec=spec, body=body)
+
+    def _emit_inner_call(self):
+        # Arguments for the inner function
+        iargs = ['ce - cb']
+        iargs.extend(sa.name for sa in self.scalargs)
+
+        for va in self.vectargs:
+            iargs.extend(self._offset_arg_array_2d(va))
+
+        return '{0}_inner({1});'.format(self.name, ', '.join(iargs))
+
+    def _emit_inner_spec(self):
+        # Inner dimension
+        ikargs = ['int _nx']
+
+        # Add any scalar arguments
+        ikargs.extend('{0.dtype} {0.name}'.format(sa) for sa in self.scalargs)
+
+        # Vector arguments (always arrays as we're 2D)
+        for va in self.vectargs:
+            const = 'const' if va.intent == 'in' else ''
+            stmt = '{0} {1.dtype} *__restrict__ {1.name}_v'.format(const, va)
+            stmt = stmt.strip()
+
+            if va.ncdim == 0:
+                ikargs.append(stmt)
+            else:
+                for ij in ndrange(*va.cdims):
+                    ikargs.append(stmt + 'v'.join(str(n) for n in ij))
+
+        return ('static PYFR_NOINLINE void {0}_inner({1})'
+                .format(self.name, ', '.join(ikargs)))
+
+    def _emit_spec_unpack(self):
+        # Start by unpacking the dimensions
+        kspec = ['long *arg{0}' for d in self._dims]
+        kpack = ['int {0} = *arg{{0}};'.format(d) for d in self._dims]
+
+        # Next unpack any scalar arguments
+        kspec.extend('double *arg{0}' for sa in self.scalargs)
+        kpack.extend('{0.dtype} {0.name} = *arg{{0}};'.format(sa)
+                     for sa in self.scalargs)
+
+        # Finally, add the vector arguments
+        for va in self.vectargs:
+            # Views
+            if va.isview:
+                kspec.append('void **arg{0}')
+                kpack.append('{0.dtype} *{0.name}_v = *arg{{0}};'
+                             .format(va))
+
+                kspec.append('void **arg{0}')
+                kpack.append('const int *{0.name}_vix = *arg{{0}};'
+                             .format(va))
+
+                if va.ncdim >= 1:
+                    kspec.append('void **arg{0}')
+                    kpack.append('const int *{0.name}_vcstri = *arg{{0}};'
+                                 .format(va))
+                if va.ncdim == 2:
+                    kspec.append('void **arg{0}')
+                    kpack.append('const int *{0.name}_vrstri = *arg{{0}};'
+                                 .format(va))
+            # Arrays
+            else:
+                # Intent in arguments should be marked constant
+                const = 'const' if va.intent == 'in' else ''
+
+                kspec.append('void **arg{0}')
+                kpack.append('{0} {1.dtype}* {1.name}_v = *arg{{0}};'
+                             .format(const, va).strip())
+
+                # If we are a matrix (ndim = 2) or a non-MPI stacked
+                # vector then a leading (sub) dimension is required
+                if self.ndim == 2 or (va.ncdim > 0 and not va.ismpi):
+                    kspec.append('long *arg{0}')
+                    kpack.append('int lsd{0.name} = *arg{{0}};'.format(va))
+
+        return (', '.join(a.format(i) for i, a in enumerate(kspec)),
+                '\n'.join(a.format(i) for i, a in enumerate(kpack)))
+
+    def _emit_body_1d(self):
+        body = self.body
+        ptns = [r'\b{0}\b', r'\b{0}\[(\d+)\]', r'\b{0}\[(\d+)\]\[(\d+)\]']
+
+        for va in self.vectargs:
+            # Dereference the argument
+            darg = self._deref_arg(va)
+
+            # Substitute
+            body = re.sub(ptns[va.ncdim].format(va.name), darg, body)
+
+        return body
+
+    def _emit_body_2d(self):
+        body = self.body
+        ptns = [r'\b{0}\b', r'\b{0}\[(\d+)\]', r'\b{0}\[(\d+)\]\[(\d+)\]']
+        subs = ['{0}_v[_x]', r'{0}_v\1[_x]', r'{0}_v\1v\2[_x]']
+
+        for va in self.vectargs:
+            body = re.sub(ptns[va.ncdim].format(va.name),
+                          subs[va.ncdim].format(va.name), body)
+
+        return body
+
+    def _deref_arg(self, arg):
+        if arg.isview:
+            ptns = ['{0}_v[{0}_vix[_x]]',
+                    r'{0}_v[{0}_vix[_x] + {0}_vcstri[_x]*\1]',
+                    r'{0}_v[{0}_vix[_x] + {0}_vrstri[_x]*\1'
+                    r' + {0}_vcstri[_x]*\2]']
+
+            return ptns[arg.ncdim].format(arg.name)
+        else:
+            # Leading (sub) dimension
+            lsdim = 'lsd' + arg.name if not arg.ismpi else '_nx'
+
+            # Vector name_v[_x]
+            if arg.ncdim == 0:
+                ix = '_x'
+            # Stacked vector; name_v[lsdim*\1 + _x]
+            elif arg.ncdim == 1:
+                ix = r'{0}*\1 + _x'.format(lsdim)
+            # Doubly stacked vector; name_v[lsdim*nv*\1 + lsdim*\2 + _x]
+            else:
+                ix = r'{0}*{1}*\1 + {0}*\2 + _x'.format(lsdim, arg.cdims[1])
+
+            return '{0}_v[{1}]'.format(arg.name, ix)
+
+    def _offset_arg_array_2d(self, arg):
+        stmts = []
+
+        # Matrix; name + _y*lsdim + cb
+        if arg.ncdim == 0:
+            stmts.append('{0}_v + _y*lsd{0} + cb'.format(arg.name))
+        # Stacked matrix; name + (_y*nv + <0>)*lsdim + cb
+        elif arg.ncdim == 1:
+            stmts.extend('{0}_v + (_y*{1} + {2})*lsd{0} + cb'
+                         .format(arg.name, arg.cdims[0], i)
+                         for i in range(arg.cdims[0]))
+        # Doubly stacked matrix; name + ((<0>*_ny + _y)*nv + <1>)*lsdim + cb
+        else:
+            stmts.extend('{0}_v + (({1}*_ny + _y)*{2} + {3})*lsd{0} + cb'
+                         .format(arg.name, i, arg.cdims[1], j)
+                         for i, j in ndrange(*arg.cdims))
+
+        return stmts
diff --git a/pyfr/backends/mic/kernels/__init__.py b/pyfr/backends/mic/kernels/__init__.py
new file mode 100644
index 0000000..40a96af
--- /dev/null
+++ b/pyfr/backends/mic/kernels/__init__.py
@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-
diff --git a/pyfr/backends/mic/kernels/axnpby.mako b/pyfr/backends/mic/kernels/axnpby.mako
new file mode 100644
index 0000000..a21160e
--- /dev/null
+++ b/pyfr/backends/mic/kernels/axnpby.mako
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+<%inherit file='base'/>
+<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
+
+static PYFR_NOINLINE void
+axnpby_inner(int n,
+             ${', '.join('fpdtype_t *__restrict__ x{0}, '
+                         'fpdtype_t a{0}'.format(i) for i in range(nv))})
+{
+    for (int i = 0; i < n; i++)
+    {
+        fpdtype_t axn = ${pyfr.dot('a{j}', 'x{j}[i]', j=(1, nv))};
+
+        if (a0 == 0.0)
+            x0[i] = axn;
+        else if (a0 == 1.0)
+            x0[i] += axn;
+        else
+            x0[i] = a0*x0[i] + axn;
+    }
+}
+
+void
+axnpby(long *n,
+       ${', '.join('fpdtype_t **x{0}'.format(i) for i in range(nv))},
+       ${', '.join('double *a{0}'.format(i) for i in range(nv))})
+{
+    #pragma omp parallel
+    {
+        int begin, end;
+        loop_sched_1d(*n, PYFR_ALIGN_BYTES / sizeof(fpdtype_t), &begin, &end);
+
+        axnpby_inner(end - begin,
+                     ${', '.join('*x{0} + begin, *a{0}'.format(i)
+                                 for i in range(nv))});
+    }
+}
diff --git a/pyfr/backends/mic/kernels/base.mako b/pyfr/backends/mic/kernels/base.mako
new file mode 100644
index 0000000..1f4d487
--- /dev/null
+++ b/pyfr/backends/mic/kernels/base.mako
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
+
+#include <omp.h>
+#include <stdlib.h>
+#include <tgmath.h>
+
+#define PYFR_ALIGN_BYTES ${alignb}
+#define PYFR_NOINLINE __attribute__ ((noinline))
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define max(a, b) ((a) > (b) ? (a) : (b))
+
+// Typedefs
+typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t;
+
+// OpenMP static loop scheduling functions
+<%include file='loop-sched'/>
+
+${next.body()}
diff --git a/pyfr/backends/mic/kernels/errest.mako b/pyfr/backends/mic/kernels/errest.mako
new file mode 100644
index 0000000..78b17a2
--- /dev/null
+++ b/pyfr/backends/mic/kernels/errest.mako
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+<%inherit file='base'/>
+<%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
+
+void
+errest(long *n, double *out,
+       fpdtype_t **xp, fpdtype_t **yp, fpdtype_t **zp,
+       double *atolp, double *rtolp)
+{
+    fpdtype_t *x = *xp, *y = *yp, *z = *zp;
+    fpdtype_t atol = *atolp, rtol = *rtolp;
+
+    fpdtype_t sum = 0.0;
+
+    #pragma omp parallel for reduction(+:sum)
+    for (int i = 0; i < *n; i++)
+        sum += pow(x[i]/(atol + rtol*max(fabs(y[i]), fabs(z[i]))), 2);
+
+    *out = sum;
+}
diff --git a/pyfr/backends/mic/kernels/gemm.mako b/pyfr/backends/mic/kernels/gemm.mako
new file mode 100644
index 0000000..0018239
--- /dev/null
+++ b/pyfr/backends/mic/kernels/gemm.mako
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+<%inherit file='base'/>
+
+#include <mkl.h>
+
+void
+gemm(long *M, long *N, long *K,
+     fpdtype_t **A, fpdtype_t **B, fpdtype_t **C,
+     long *lda, long *ldb, long *ldc,
+     double *alpha, double *beta)
+{
+    ${cblas_gemm}(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+            *M, *N, *K, *alpha, *A, *lda, *B, *ldb, *beta, *C, *ldc);
+}
diff --git a/pyfr/backends/mic/kernels/loop-sched.mako b/pyfr/backends/mic/kernels/loop-sched.mako
new file mode 100644
index 0000000..9e3f4c0
--- /dev/null
+++ b/pyfr/backends/mic/kernels/loop-sched.mako
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+
+static inline int
+gcd(int a, int b)
+{
+    return (a == 0) ? b : gcd(b % a, a);
+}
+
+static inline void
+loop_sched_1d(int n, int align, int *b, int *e)
+{
+    int tid = omp_get_thread_num();
+    int nth = omp_get_num_threads();
+
+    // Round up n to be a multiple of nth
+    int rn = n + nth - 1 - (n - 1) % nth;
+
+    // Nominal tile size
+    int sz = rn / nth;
+
+    // Handle alignment
+    sz += align - 1 - (sz - 1) % align;
+
+    // Assign the starting and ending index
+    *b = sz * tid;
+    *e = min(*b + sz, n);
+
+    // Clamp
+    if (*b >= n)
+        *b = *e = 0;
+}
+
+static inline void
+loop_sched_2d(int nrow, int ncol, int colalign,
+              int *rowb, int *rowe, int *colb, int *cole)
+{
+    int tid = omp_get_thread_num();
+    int nth = omp_get_num_threads();
+
+    // Distribute threads
+    int nrowth = gcd(nrow, nth);
+    int ncolth = nth / nrowth;
+
+    // Row and column indices for our thread
+    int rowix = tid / ncolth;
+    int colix = tid % ncolth;
+
+    // Round up ncol to be a multiple of ncolth
+    int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;
+
+    // Nominal tile size
+    int ntilerow = nrow / nrowth;
+    int ntilecol = rncol / ncolth;
+
+    // Handle column alignment
+    ntilecol += colalign - 1 - (ntilecol - 1) % colalign;
+
+    // Assign the starting and ending row to each thread
+    *rowb = ntilerow * rowix;
+    *rowe = *rowb + ntilerow;
+
+    // Assign the starting and ending column to each thread
+    *colb = ntilecol * colix;
+    *cole = min(*colb + ntilecol, ncol);
+
+    // Clamp
+    if (*colb >= ncol)
+        *colb = *cole = 0;
+}
diff --git a/pyfr/backends/mic/kernels/pack.mako b/pyfr/backends/mic/kernels/pack.mako
new file mode 100644
index 0000000..9f32a92
--- /dev/null
+++ b/pyfr/backends/mic/kernels/pack.mako
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+<%inherit file='base'/>
+
+void
+pack_view(long *n_a, long *nrv_a, long *ncv_a,
+          void **v_a, void **vix_a, void **vcstri_a, void **vrstri_a,
+          void **pmat_a)
+{
+    int n = *n_a;
+    int nrv = *nrv_a;
+    int ncv = *ncv_a;
+
+    fpdtype_t *v = *v_a;
+    int *vix = *vix_a;
+    int *vcstri = (vcstri_a) ? *vcstri_a : 0;
+    int *vrstri = (vrstri_a) ? *vrstri_a : 0;
+    fpdtype_t *pmat = *pmat_a;
+
+    if (ncv == 1)
+        for (int i = 0; i < n; i++)
+            pmat[i] = v[vix[i]];
+    else if (nrv == 1)
+        for (int i = 0; i < n; i++)
+            for (int c = 0; c < ncv; c++)
+                pmat[c*n + i] = v[vix[i] + vcstri[i]*c];
+    else
+        for (int i = 0; i < n; i++)
+            for (int r = 0; r < nrv; r++)
+                for (int c = 0; c < ncv; c++)
+                    pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r
+                                                + vcstri[i]*c];
+}
diff --git a/pyfr/backends/mic/packing.py b/pyfr/backends/mic/packing.py
new file mode 100644
index 0000000..043975d
--- /dev/null
+++ b/pyfr/backends/mic/packing.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+
+from pyfr.backends.base import ComputeKernel, NullComputeKernel
+from pyfr.backends.base.packing import BasePackingKernels
+from pyfr.backends.mic.provider import MICKernelProvider
+
+
+class MICPackingKernels(MICKernelProvider, BasePackingKernels):
+    def pack(self, mv):
+        # An exchange view is simply a regular view plus an exchange matrix
+        m, v = mv.xchgmat, mv.view
+
+        # Render the kernel template
+        src = self.backend.lookup.get_template('pack').render()
+
+        # Build
+        kern = self._build_kernel('pack_view', src, 'iiiPPPPP')
+
+        class PackXchgViewKernel(ComputeKernel):
+            def run(self, queue):
+                # Kernel arguments
+                args = [v.n, v.nvrow, v.nvcol, v.basedata.dev_ptr,
+                        v.mapping, v.cstrides, v.rstrides, m]
+                args = [getattr(arg, 'data', arg) for arg in args]
+
+                # Pack
+                queue.mic_stream_comp.invoke(kern, *args)
+
+                # Copy the packed buffer to the host
+                queue.mic_stream_comp.transfer_device2host(
+                    m.basedata, m.hdata.ctypes.data, m.nbytes,
+                    offset_device=m.offset
+                )
+
+        return PackXchgViewKernel()
+
+    def unpack(self, mv):
+        class UnpackXchgMatrixKernel(ComputeKernel):
+            def run(self, queue):
+                queue.mic_stream_comp.transfer_host2device(
+                    mv.hdata.ctypes.data, mv.basedata, mv.nbytes,
+                    offset_device=mv.offset
+                )
+
+        return UnpackXchgMatrixKernel()
diff --git a/pyfr/backends/mic/provider.py b/pyfr/backends/mic/provider.py
new file mode 100644
index 0000000..545c919
--- /dev/null
+++ b/pyfr/backends/mic/provider.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+
+from pyfr.backends.base import (BaseKernelProvider,
+                                BasePointwiseKernelProvider, ComputeKernel)
+from pyfr.backends.mic.compiler import MICSourceModule
+import pyfr.backends.mic.generator as generator
+from pyfr.util import memoize
+
+
+class MICKernelProvider(BaseKernelProvider):
+    @memoize
+    def _build_kernel(self, name, src, argtypes, restype=None):
+        mod = MICSourceModule(src, self.backend.dev, self.backend.cfg)
+        return mod.function(name, argtypes, restype)
+
+
+class MICPointwiseKernelProvider(MICKernelProvider,
+                                 BasePointwiseKernelProvider):
+    kernel_generator_cls = generator.MICKernelGenerator
+
+    def _instantiate_kernel(self, dims, fun, arglst):
+        class PointwiseKernel(ComputeKernel):
+            def run(self, queue, **kwargs):
+                narglst = [kwargs.get(ka, ka) for ka in arglst]
+                narglst = [getattr(arg, 'dev_ptr', arg) for arg in narglst]
+                narglst = [getattr(arg, 'data', arg) for arg in narglst]
+                queue.mic_stream_comp.invoke(fun, *narglst)
+
+        return PointwiseKernel()
diff --git a/pyfr/backends/mic/types.py b/pyfr/backends/mic/types.py
new file mode 100644
index 0000000..c29ccf4
--- /dev/null
+++ b/pyfr/backends/mic/types.py
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+import pyfr.backends.base as base
+from pyfr.util import lazyprop
+
+
+class MICMatrixBase(base.MatrixBase):
+    def onalloc(self, basedata, offset):
+        self.basedata = basedata
+        self.data = basedata.dev_ptr + offset
+
+        self.offset = offset
+
+        # Process any initial value
+        if self._initval is not None:
+            self._set(self._initval)
+
+        # Remove
+        del self._initval
+
+    def _get(self):
+        # Allocate an empty buffer
+        buf = np.empty(self.datashape, dtype=self.dtype)
+
+        # Copy using the default stream
+        self.backend.sdflt.transfer_device2host(
+            self.basedata, buf.ctypes.data, self.nbytes,
+            offset_device=self.offset
+        )
+        self.backend.sdflt.sync()
+
+        # Slice to give the expected I/O shape
+        return buf[...,:self.ioshape[-1]]
+
+    def _set(self, ary):
+        # Allocate a new buffer with suitable padding and assign
+        buf = np.zeros(self.datashape, dtype=self.dtype)
+        buf[...,:self.ioshape[-1]] = ary
+
+        # Copy using the default stream
+        self.backend.sdflt.transfer_host2device(
+            buf.ctypes.data, self.basedata, self.nbytes,
+            offset_device=self.offset
+        )
+        self.backend.sdflt.sync()
+
+
+class MICMatrix(MICMatrixBase, base.Matrix):
+    pass
+
+
+class MICMatrixRSlice(base.MatrixRSlice):
+    @property
+    def data(self):
+        return self.basedata.dev_ptr + self.offset
+
+
+class MICMatrixBank(base.MatrixBank):
+    pass
+
+
+class MICConstMatrix(MICMatrixBase, base.ConstMatrix):
+    pass
+
+
+class MICXchgMatrix(MICMatrix, base.XchgMatrix):
+    def __init__(self, backend, ioshape, initval, extent, aliases, tags):
+        # Call the standard matrix constructor
+        super().__init__(backend, ioshape, initval, extent, aliases, tags)
+
+        # Allocate an empty buffer on the host for MPI to send/recv from
+        self.hdata = np.empty((self.nrow, self.ncol), self.dtype)
+
+
+class MICXchgView(base.XchgView):
+    pass
+
+
+class MICView(base.View):
+    pass
+
+
+class MICQueue(base.Queue):
+    def __init__(self, backend):
+        super().__init__(backend)
+
+        # MIC stream
+        self.mic_stream_comp = backend.sdflt
+
+    def _exec_item(self, item, args, kwargs):
+        item.run(self, *args, **kwargs)
+
+        self.mic_stream_comp.sync()
+
+        self._last = item
+
+    def _exec_nonblock(self):
+        while self._items:
+            kern = self._items[0][0]
+
+            # See if kern will block
+            if self._at_sequence_point(kern) or kern.ktype == 'compute':
+                break
+
+            self._exec_item(*self._items.popleft())
+
+    def _wait(self):
+        if self._last and self._last.ktype == 'mpi':
+            from mpi4py import MPI
+
+            MPI.Prequest.Waitall(self.mpi_reqs)
+            self.mpi_reqs = []
+
+        self._last = None
+
+    def _at_sequence_point(self, item):
+        last = self._last
+
+        return last and last.ktype == 'mpi' and item.ktype != 'mpi'
+
+    @staticmethod
+    def runall(queues):
+        # Fire off any non-blocking kernels
+        for q in queues:
+            q._exec_nonblock()
+
+        while any(queues):
+            # Execute a (potentially) blocking item from each queue
+            for q in filter(None, queues):
+                q._exec_nowait()
+
+            # Now consider kernels which will wait
+            for q in filter(None, queues):
+                q._exec_next()
+                q._exec_nonblock()
+
+        # Wait for all tasks to complete
+        for q in queues:
+            q._wait()
diff --git a/setup.py b/setup.py
index 3739294..ef4b03b 100755
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,8 @@ modules = [
     'pyfr.backends.base',
     'pyfr.backends.cuda',
     'pyfr.backends.cuda.kernels',
+    'pyfr.backends.mic',
+    'pyfr.backends.mic.kernels',
     'pyfr.backends.opencl',
     'pyfr.backends.opencl.kernels',
     'pyfr.backends.openmp',
@@ -60,6 +62,7 @@ tests = [
 # Data
 package_data = {
     'pyfr.backends.cuda.kernels': ['*.mako'],
+    'pyfr.backends.mic.kernels': ['*.mako'],
     'pyfr.backends.opencl.kernels': ['*.mako'],
     'pyfr.backends.openmp.kernels': ['*.mako'],
     'pyfr.quadrules': [
@@ -98,6 +101,7 @@ install_requires = [
 # Soft dependencies
 extras_require = {
     'cuda': ['pycuda >= 2011.2'],
+    'mic': ['pymic >= 0.7'],
     'opencl': ['pyopencl >= 2015.2.4']
 }
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/pyfr.git