[arrayfire] 164/284: Added matmul offloading to CPU

Sun Feb 7 18:59:29 UTC 2016

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/experimental
in repository arrayfire.

commit 3c1ab9f0902a37bd7b3c31bc533790d950407b52
Author: Shehzan Mohammed <shehzan at arrayfire.com>
Date:   Thu Jan 7 16:10:43 2016 -0500

    Added matmul offloading to CPU
---
 src/backend/cpu/blas.hpp                           |   6 +
 src/backend/opencl/CMakeLists.txt                  |  12 +
 src/backend/opencl/blas.cpp                        |   6 +
 src/backend/opencl/cpu/cpu_blas.cpp                | 268 +++++++++++++++++++++
 src/backend/opencl/cpu/cpu_blas.hpp                |  23 ++
 .../{cpu/blas.hpp => opencl/cpu/cpu_helper.hpp}    |  14 +-
 test/blas.cpp                                      |   1 +
 7 files changed, 322 insertions(+), 8 deletions(-)

diff --git a/src/backend/cpu/blas.hpp b/src/backend/cpu/blas.hpp
index 117d3a2..934a2c6 100644
--- a/src/backend/cpu/blas.hpp
+++ b/src/backend/cpu/blas.hpp
@@ -45,4 +45,10 @@ template<typename T>
 Array<T> dot(const Array<T> &lhs, const Array<T> &rhs,
              af_mat_prop optLhs, af_mat_prop optRhs);
 
+typedef std::complex<float>     cfloat;
+typedef std::complex<double>    cdouble;
+// is_complex<T>::value is true only for cfloat and cdouble, false for every other T.
+template<typename T> struct is_complex          { static const bool value = false;  };
+template<> struct           is_complex<cfloat>  { static const bool value = true;   };
+template<> struct           is_complex<cdouble> { static const bool value = true;   };
 }
diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt
index 86ba1b2..c9c47d0 100644
--- a/src/backend/opencl/CMakeLists.txt
+++ b/src/backend/opencl/CMakeLists.txt
@@ -123,6 +123,12 @@ FILE(GLOB conv_ker_headers
 FILE(GLOB conv_ker_sources
      "kernel/convolve/*.cpp")
 
+FILE(GLOB cpu_headers
+     "cpu/*.hpp")
+
+FILE(GLOB cpu_sources
+     "cpu/*.cpp")
+
 source_group(backend\\opencl\\Headers FILES ${opencl_headers})
 source_group(backend\\opencl\\Sources FILES ${opencl_sources})
 source_group(backend\\opencl\\JIT FILES ${jit_sources})
@@ -131,6 +137,8 @@ source_group(backend\\opencl\\kernel\\cl FILES ${opencl_kernels})
 source_group(backend\\opencl\\kernel\\Sources FILES ${kernel_sources})
 source_group(backend\\opencl\\kernel\\convolve\\Headers FILES ${conv_ker_headers})
 source_group(backend\\opencl\\kernel\\convolve\\Sources FILES ${conv_ker_sources})
+source_group(backend\\opencl\\cpu\\Headers FILES ${cpu_headers})
+source_group(backend\\opencl\\cpu\\Sources FILES ${cpu_sources})
 
 IF(LAPACK_FOUND)
     FILE(GLOB magma_sources
@@ -206,6 +214,8 @@ IF(DEFINED BLAS_SYM_FILE)
                 ${kernel_sources}
                 ${conv_ker_headers}
                 ${conv_ker_sources}
+                ${cpu_headers}
+                ${cpu_sources}
                 ${backend_headers}
                 ${backend_sources}
                 ${magma_sources}
@@ -244,6 +254,8 @@ ELSE(DEFINED BLAS_SYM_FILE)
                 ${kernel_sources}
                 ${conv_ker_headers}
                 ${conv_ker_sources}
+                ${cpu_headers}
+                ${cpu_sources}
                 ${backend_headers}
                 ${backend_sources}
                 ${c_headers}
diff --git a/src/backend/opencl/blas.cpp b/src/backend/opencl/blas.cpp
index 6173a68..f9f8af1 100644
--- a/src/backend/opencl/blas.cpp
+++ b/src/backend/opencl/blas.cpp
@@ -20,6 +20,8 @@
 #include <math.hpp>
 #include <transpose.hpp>
 
+#include <cpu/cpu_blas.hpp>
+
 namespace opencl
 {
 
@@ -113,6 +115,10 @@ template<typename T>
 Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
                 af_mat_prop optLhs, af_mat_prop optRhs)
 {
+    if(OpenCLCPUOffload()) {
+        return cpu::matmul(lhs, rhs, optLhs, optRhs);
+    }
+
     initBlas();
     clblasTranspose lOpts = toClblasTranspose(optLhs);
     clblasTranspose rOpts = toClblasTranspose(optRhs);
diff --git a/src/backend/opencl/cpu/cpu_blas.cpp b/src/backend/opencl/cpu/cpu_blas.cpp
new file mode 100644
index 0000000..524777a
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_blas.cpp
@@ -0,0 +1,268 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <cpu/cpu_blas.hpp>
+#include <af/dim4.hpp>
+#include <handle.hpp>
+#include <cassert>
+#include <err_common.hpp>
+#include <platform.hpp>
+#include <af/macros.h>
+
+namespace opencl
+{
+namespace cpu
+{
+
+using std::add_const;
+using std::add_pointer;
+using std::enable_if;
+using std::is_floating_point;
+using std::remove_const;
+using std::conditional;
+
+// Some implementations of BLAS require void* for complex pointers while others use float*/double*
+//
+// Sample cgemm API
+// OpenBLAS
+// void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB,
+//                  OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+//                  OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda,
+//                  OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta,
+//                  float *C, OPENBLAS_CONST blasint ldc);
+//
+// MKL
+// void cblas_cgemm(const  CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, const  CBLAS_TRANSPOSE TransB,
+//                  const MKL_INT M, const MKL_INT N, const MKL_INT K,
+//                  const void *alpha, const void *A, const MKL_INT lda,
+//                  const void *B, const MKL_INT ldb, const void *beta,
+//                  void *C, const MKL_INT ldc);
+// atlas cblas
+// void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+//                  const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+//                  const void *alpha, const void *A, const int lda,
+//                  const void *B, const int ldb, const void *beta,
+//                  void *C, const int ldc);
+//
+// LAPACKE
+// void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+//                  const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+//                  const void *alpha, const void *A, const int lda,
+//                  const void *B, const int ldb, const void *beta,
+//                  void *C, const int ldc);
+#if defined(IS_OPENBLAS)
+    static const bool cplx_void_ptr = false;    // OpenBLAS prototype above: complex arguments are float*/double*
+#else
+    static const bool cplx_void_ptr = true;     // MKL / ATLAS / LAPACKE prototypes above: complex arguments are void*
+#endif
+
+template<typename T, class Enable = void>
+struct blas_base {
+    using type = typename dtype_traits<T>::base_type;   // default: pointee type is T's base scalar type
+};
+
+template<typename T>
+struct blas_base <T, typename enable_if<is_complex<T>::value && cplx_void_ptr>::type> {
+    using type = void;                                  // complex T with a void*-style CBLAS: pointee type is void
+};
+
+
+template<typename T>                            // read-only operand pointer as it appears in the CBLAS signature
+using cptr_type     =   typename conditional<   is_complex<T>::value,
+                                                const typename blas_base<T>::type *,
+                                                const T*>::type;
+template<typename T>                            // writable output pointer as it appears in the CBLAS signature
+using ptr_type     =    typename conditional<   is_complex<T>::value,
+                                                typename blas_base<T>::type *,
+                                                T*>::type;
+template<typename T>                            // scale factor: by value for real T, by pointer for complex T
+using scale_type   =    typename conditional<   is_complex<T>::value,
+                                                const typename blas_base<T>::type *,
+                                                const T>::type;
+
+template<typename T>    // function-pointer type matching cblas_?gemm for element type T
+using gemm_func_def = void (*)( const CBLAS_ORDER, const CBLAS_TRANSPOSE, const CBLAS_TRANSPOSE,
+                                const blasint, const blasint, const blasint,
+                                scale_type<T>, cptr_type<T>, const blasint,
+                                cptr_type<T>, const blasint,
+                                scale_type<T>, ptr_type<T>, const blasint);
+
+template<typename T>    // function-pointer type matching cblas_?gemv for element type T
+using gemv_func_def = void (*)( const CBLAS_ORDER, const CBLAS_TRANSPOSE,
+                                const blasint, const blasint,
+                                scale_type<T>, cptr_type<T>, const blasint,
+                                cptr_type<T>, const blasint,
+                                scale_type<T>, ptr_type<T>, const blasint);
+
+#define BLAS_FUNC_DEF( FUNC )                           \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+#define BLAS_FUNC( FUNC, TYPE, PREFIX )                 \
+  template<> FUNC##_func_def<TYPE> FUNC##_func<TYPE>()  \
+{ return &cblas_##PREFIX##FUNC; }
+
+BLAS_FUNC_DEF( gemm )            // declares gemm_func<T>(); the lines below specialize it per element type
+BLAS_FUNC(gemm , float   , s)    // gemm_func<float>() returns &cblas_sgemm, and so on for d/c/z
+BLAS_FUNC(gemm , double  , d)
+BLAS_FUNC(gemm , cfloat  , c)
+BLAS_FUNC(gemm , cdouble , z)
+
+BLAS_FUNC_DEF(gemv)              // declares gemv_func<T>(); the lines below specialize it per element type
+BLAS_FUNC(gemv , float   , s)    // gemv_func<float>() returns &cblas_sgemv, and so on for d/c/z
+BLAS_FUNC(gemv , double  , d)
+BLAS_FUNC(gemv , cfloat  , c)
+BLAS_FUNC(gemv , cdouble , z)
+
+template<typename T, int value>   // real T: the scale factor is handed to CBLAS by value
+typename enable_if<is_floating_point<T>::value, scale_type<T>>::type
+getScale() { return static_cast<T>(value); }
+
+template<typename T, int value>   // complex T: CBLAS takes a pointer to the scale factor
+typename enable_if<is_complex<T>::value, scale_type<T>>::type
+getScale()
+{
+    static const T factor = scalar<T>(value);   // static storage keeps the returned pointer valid
+    return reinterpret_cast<const typename blas_base<T>::type *>(&factor);
+}
+
+// Translate an ArrayFire matrix property into the matching CBLAS transpose flag.
+// Anything other than NONE / TRANS / CTRANS is reported through AF_ERROR(AF_ERR_ARG).
+CBLAS_TRANSPOSE toCblasTranspose(af_mat_prop opt)
+{
+    switch(opt) {
+        case AF_MAT_NONE   : return CblasNoTrans;
+        case AF_MAT_TRANS  : return CblasTrans;
+        case AF_MAT_CTRANS : return CblasConjTrans;
+        default            : AF_ERROR("INVALID af_mat_prop", AF_ERR_ARG);
+    }
+    return CblasNoTrans;
+}
+
+// CPU-offload matmul: maps the operands to host pointers and runs CBLAS gemv (vector rhs) or gemm on them.
+template<typename T>
+Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
+                af_mat_prop optLhs, af_mat_prop optRhs)
+{
+    CBLAS_TRANSPOSE lOpts = toCblasTranspose(optLhs);
+    CBLAS_TRANSPOSE rOpts = toCblasTranspose(optRhs);
+
+    int aRowDim = (lOpts == CblasNoTrans) ? 0 : 1;
+    int aColDim = (lOpts == CblasNoTrans) ? 1 : 0;
+    int bColDim = (rOpts == CblasNoTrans) ? 1 : 0;
+
+    dim4 lDims = lhs.dims();
+    dim4 rDims = rhs.dims();
+    int M = lDims[aRowDim];
+    int N = rDims[bColDim];
+    int K = lDims[aColDim];
+
+    //FIXME: Leaks on errors (nothing below unmaps the pointers if a later step throws).
+    Array<T> out = createValueArray<T>(af::dim4(M, N, 1, 1), scalar<T>(0));
+    auto alpha = getScale<T, 1>();
+    auto beta  = getScale<T, 0>();
+
+    dim4 lStrides = lhs.strides();
+    dim4 rStrides = rhs.strides();
+    using BT  =       typename blas_base<T>::type;
+
+    // get host pointers from mapped memory
+    BT *lPtr = getMappedPtr<BT>(lhs.get());
+    BT *rPtr = getMappedPtr<BT>(rhs.get());
+    BT *oPtr = getMappedPtr<BT>(out.get());
+
+    if(rDims[bColDim] == 1) {
+        const blasint incr = (rOpts == CblasNoTrans) ? rStrides[0] : rStrides[1];  // x runs along dim 1 when rhs is transposed
+        gemv_func<T>()(
+            CblasColMajor, lOpts,
+            lDims[0], lDims[1],
+            alpha,
+            lPtr, lStrides[1],
+            rPtr, incr,
+            beta,
+            oPtr, 1);
+    } else {
+        gemm_func<T>()(
+            CblasColMajor, lOpts, rOpts,
+            M, N, K,
+            alpha,
+            lPtr, lStrides[1],
+            rPtr, rStrides[1],
+            beta,
+            oPtr, out.dims()[0]);
+    }
+
+    unmapPtr(lhs.get(), lPtr);
+    unmapPtr(rhs.get(), rPtr);
+    unmapPtr(out.get(), oPtr);
+
+    return out;
+}
+
+//template<typename T> T
+//conj(T  x) { return x; }
+//
+//template<> cfloat  conj<cfloat> (cfloat  c) { return std::conj(c); }
+//template<> cdouble conj<cdouble>(cdouble c) { return std::conj(c); }
+//
+//template<typename T, bool conjugate, bool both_conjugate>
+//Array<T> dot_(const Array<T> &lhs, const Array<T> &rhs,
+//        af_mat_prop optLhs, af_mat_prop optRhs)
+//{
+//    int N = lhs.dims()[0];
+//
+//    T out = 0;
+//    const T *pL = lhs.get();
+//    const T *pR = rhs.get();
+//
+//    for(int i = 0; i < N; i++)
+//        out += (conjugate ? cpu::conj(pL[i]) : pL[i]) * pR[i];
+//
+//    if(both_conjugate) out = cpu::conj(out);
+//
+//    return createValueArray(af::dim4(1), out);
+//}
+//
+//template<typename T>
+//Array<T> dot(const Array<T> &lhs, const Array<T> &rhs,
+//        af_mat_prop optLhs, af_mat_prop optRhs)
+//{
+//    if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) {
+//        return dot_<T, false, true>(lhs, rhs, optLhs, optRhs);
+//    } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) {
+//        return dot_<T, true, false>(lhs, rhs, optLhs, optRhs);
+//    } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) {
+//        return dot_<T, true, false>(rhs, lhs, optRhs, optLhs);
+//    } else {
+//        return dot_<T, false, false>(lhs, rhs, optLhs, optRhs);
+//    }
+//}
+
+#undef BT               // NOTE(review): no BT macro is defined in this file (BT is a type alias inside matmul) - looks like a leftover
+#undef REINTEPRET_CAST  // NOTE(review): no macro of this name is defined in this file either - looks like a leftover
+
+#define INSTANTIATE_BLAS(TYPE)                                                          \
+    template Array<TYPE> matmul<TYPE>(const Array<TYPE> &lhs, const Array<TYPE> &rhs,   \
+                                      af_mat_prop optLhs, af_mat_prop optRhs);
+
+INSTANTIATE_BLAS(float)     // explicit instantiations: matmul is defined in this .cpp, so each supported type is emitted here
+INSTANTIATE_BLAS(cfloat)
+INSTANTIATE_BLAS(double)
+INSTANTIATE_BLAS(cdouble)
+
+//#define INSTANTIATE_DOT(TYPE)                                                               \
+//    template Array<TYPE> dot<TYPE>(const Array<TYPE> &lhs, const Array<TYPE> &rhs,          \
+//                                   af_mat_prop optLhs, af_mat_prop optRhs);
+//
+//INSTANTIATE_DOT(float)
+//INSTANTIATE_DOT(double)
+//INSTANTIATE_DOT(cfloat)
+//INSTANTIATE_DOT(cdouble)
+
+}
+}
diff --git a/src/backend/opencl/cpu/cpu_blas.hpp b/src/backend/opencl/cpu/cpu_blas.hpp
new file mode 100644
index 0000000..303b60c
--- /dev/null
+++ b/src/backend/opencl/cpu/cpu_blas.hpp
@@ -0,0 +1,23 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+#pragma once
+#include <cpu/cpu_helper.hpp>
+
+namespace opencl
+{
+namespace cpu
+{
+    template<typename T>    // host-BLAS matmul for the CPU-offload path; defined and instantiated in cpu_blas.cpp
+    Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
+                    af_mat_prop optLhs, af_mat_prop optRhs);
+//    template<typename T>
+//    Array<T> dot(const Array<T> &lhs, const Array<T> &rhs,
+//                 af_mat_prop optLhs, af_mat_prop optRhs);
+}
+}
diff --git a/src/backend/cpu/blas.hpp b/src/backend/opencl/cpu/cpu_helper.hpp
similarity index 77%
copy from src/backend/cpu/blas.hpp
copy to src/backend/opencl/cpu/cpu_helper.hpp
index 117d3a2..afc60d3 100644
--- a/src/backend/cpu/blas.hpp
+++ b/src/backend/opencl/cpu/cpu_helper.hpp
@@ -9,7 +9,9 @@
 
 #include <af/defines.h>
 #include <af/blas.h>
+#include <af/lapack.h>
 #include <Array.hpp>
+#include <memory.hpp>
 
 #ifdef __APPLE__
 #include <Accelerate/Accelerate.h>
@@ -35,14 +37,10 @@ extern "C" {
 typedef int blasint;
 #endif
 
+namespace opencl
+{
 namespace cpu
 {
-
-template<typename T>
-Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
-                af_mat_prop optLhs, af_mat_prop optRhs);
-template<typename T>
-Array<T> dot(const Array<T> &lhs, const Array<T> &rhs,
-             af_mat_prop optLhs, af_mat_prop optRhs);
-
 }
+}
+
diff --git a/test/blas.cpp b/test/blas.cpp
index 507cc6d..b5d92f1 100644
--- a/test/blas.cpp
+++ b/test/blas.cpp
@@ -36,6 +36,7 @@ template<typename T, bool isBVector>
 void MatMulCheck(string TestFile)
 {
     if (noDoubleTests<T>()) return;
+    af::info();
 
     using std::vector;
     vector<af::dim4> numDims;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git