[arrayfire] 110/248: Added CPU fallback for CUDA LU when CUDA older than 7

Tue Nov 17 15:54:11 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch dfsg-clean
in repository arrayfire.

commit 0ca7aebe5d6670490fd0095306017c42a9aea097
Author: Shehzan Mohammed <shehzan at arrayfire.com>
Date:   Thu Oct 8 16:04:12 2015 -0400

    Added CPU fallback for CUDA LU when CUDA older than 7
---
 src/backend/cuda/CMakeLists.txt               |  35 ++++-
 src/backend/cuda/cpu_lapack/cpu_lu.cpp        | 197 ++++++++++++++++++++++++++
 src/backend/cuda/cpu_lapack/cpu_lu.hpp        |  22 +++
 src/backend/cuda/cpu_lapack/lapack_helper.hpp |  35 +++++
 src/backend/cuda/lu.cu                        |  30 ++++
 5 files changed, 318 insertions(+), 1 deletion(-)

diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt
index 696aba7..02e0b1a 100644
--- a/src/backend/cuda/CMakeLists.txt
+++ b/src/backend/cuda/CMakeLists.txt
@@ -69,7 +69,20 @@ ENDIF()
 ADD_DEFINITIONS(-DAF_CUDA)
 
 IF(${CUDA_VERSION_MAJOR} LESS 7)
-    MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available." )
+    ## Try to use CPU side lapack
+    IF(APPLE)
+        FIND_PACKAGE(LAPACK)
+    ELSE(APPLE) # Linux and Windows
+        FIND_PACKAGE(LAPACKE)
+    ENDIF(APPLE)
+
+    IF(NOT LAPACK_FOUND)
+        MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available.")
+    ELSE(NOT LAPACK_FOUND)
+        MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. But CPU LAPACK libraries are available. Will fallback to using host side code.")
+        SET(CUDA_LAPACK_CPU_FALLBACK ON)
+        ADD_DEFINITIONS(-DWITH_CPU_LINEAR_ALGEBRA)
+    ENDIF()
     IF(CMAKE_VERSION VERSION_LESS 3.2)
         SET(CUDA_cusolver_LIBRARY)
     ENDIF(CMAKE_VERSION VERSION_LESS 3.2)
@@ -97,6 +110,10 @@ INCLUDE_DIRECTORIES(
     ${CUDA_NVVM_INCLUDE_DIR}
     )
 
+IF(CUDA_LAPACK_CPU_FALLBACK)
+  INCLUDE_DIRECTORIES(${LAPACK_INCLUDE_DIR})
+ENDIF()
+
 FILE(GLOB cuda_headers
      "*.hpp"
      "*.h")
@@ -121,6 +138,16 @@ SOURCE_GROUP(backend\\cuda\\Sources FILES ${cuda_sources})
 SOURCE_GROUP(backend\\cuda\\JIT FILES ${jit_sources})
 SOURCE_GROUP(backend\\cuda\\kernel\\Headers FILES ${kernel_headers})
 
+IF(CUDA_LAPACK_CPU_FALLBACK)
+    FILE(GLOB cpu_lapack_sources
+        "cpu_lapack/*.cpp")
+    FILE(GLOB cpu_lapack_headers
+        "cpu_lapack/*.hpp")
+
+    SOURCE_GROUP(backend\\cuda\\cpu_lapack\\Headers FILES ${cpu_lapack_headers})
+    SOURCE_GROUP(backend\\cuda\\cpu_lapack\\Sources FILES ${cpu_lapack_sources})
+ENDIF()
+
 FILE(GLOB backend_headers
     "../*.hpp"
     "../*.h"
@@ -256,6 +283,8 @@ MY_CUDA_ADD_LIBRARY(afcuda SHARED
                 ${cuda_sources}
                 ${jit_sources}
                 ${kernel_headers}
+                ${cpu_lapack_headers}
+                ${cpu_lapack_sources}
                 ${backend_headers}
                 ${backend_sources}
                 ${c_headers}
@@ -277,6 +306,10 @@ IF(FORGE_FOUND)
     TARGET_LINK_LIBRARIES(afcuda PRIVATE ${FORGE_LIBRARIES})
 ENDIF()
 
+IF(CUDA_LAPACK_CPU_FALLBACK)
+  TARGET_LINK_LIBRARIES(afcuda PRIVATE ${LAPACK_LIBRARIES})
+ENDIF()
+
 SET_TARGET_PROPERTIES(afcuda PROPERTIES
     VERSION "${AF_VERSION}"
     SOVERSION "${AF_VERSION_MAJOR}")
diff --git a/src/backend/cuda/cpu_lapack/cpu_lu.cpp b/src/backend/cuda/cpu_lapack/cpu_lu.cpp
new file mode 100644
index 0000000..df7dde6
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_lu.cpp
@@ -0,0 +1,197 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <cpu_lapack/cpu_lu.hpp>
+#include <err_common.hpp>
+
+#if defined(WITH_CPU_LINEAR_ALGEBRA)
+
+#include <af/dim4.hpp>
+#include <handle.hpp>
+#include <iostream>
+#include <cassert>
+#include <err_cuda.hpp>
+
+#include "lapack_helper.hpp"
+
+namespace cuda
+{
+namespace cpu
+{
+// Pointer type for a typed getrf routine; called below as (order, M, N, data, ld, pivots).
+template<typename T>
+using getrf_func_def = int (*)(ORDER_TYPE, int, int,
+                               T*, int,
+                               int*);
+// Declares the primary template FUNC_func<T>(); only the specializations get bodies.
+#define LU_FUNC_DEF( FUNC )                                     \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+// Defines the TYPE specialization of FUNC_func, returning the address of
+// LAPACK_NAME(<PREFIX><FUNC>), e.g. LAPACK_NAME(sgetrf) for (getrf, float, s).
+#define LU_FUNC( FUNC, TYPE, PREFIX )                           \
+template<> FUNC##_func_def<TYPE>     FUNC##_func<TYPE>()        \
+{ return & LAPACK_NAME(PREFIX##FUNC); }
+// getrf lookups for the four element types instantiated in this file.
+LU_FUNC_DEF( getrf )
+LU_FUNC(getrf , float  , s)
+LU_FUNC(getrf , double , d)
+LU_FUNC(getrf , cfloat , c)
+LU_FUNC(getrf , cdouble, z)
+// Splits the packed factorization `i` into `l` (unit diagonal) and `u` on the host.
+template<typename T>
+void lu_split(T *l, T *u, const T *i,
+        const dim4 ldm, const dim4 udm, const dim4 idm,
+        const dim4 lst, const dim4 ust, const dim4 ist)
+{
+    for(dim_t ow = 0; ow < idm[3]; ow++) {        // iterate over the input extents
+        const dim_t lW = ow * lst[3];
+        const dim_t uW = ow * ust[3];
+        const dim_t iW = ow * ist[3];
+
+        for(dim_t oz = 0; oz < idm[2]; oz++) {
+            const dim_t lZW = lW + oz * lst[2];
+            const dim_t uZW = uW + oz * ust[2];
+            const dim_t iZW = iW + oz * ist[2];
+
+            for(dim_t oy = 0; oy < idm[1]; oy++) {
+                const dim_t lYZW = lZW + oy * lst[1];
+                const dim_t uYZW = uZW + oy * ust[1];
+                const dim_t iYZW = iZW + oy * ist[1];
+
+                for(dim_t ox = 0; ox < idm[0]; ox++) {
+                    const dim_t lMem = lYZW + ox;   // dim 0 is addressed with unit stride
+                    const dim_t uMem = uYZW + ox;
+                    const dim_t iMem = iYZW + ox;
+                    if(ox > oy) {                   // dim-0 index beyond dim-1 index
+                        if(oy < ldm[1])             // only where l has this dim-1 index
+                            l[lMem] = i[iMem];
+                        if(ox < udm[0])             // only where u has this dim-0 index
+                            u[uMem] = scalar<T>(0);
+                    } else if (oy > ox) {           // dim-1 index beyond dim-0 index
+                        if(oy < ldm[1])
+                            l[lMem] = scalar<T>(0);
+                        if(ox < udm[0])
+                            u[uMem] = i[iMem];
+                    } else if(ox == oy) {           // diagonal: l gets 1, u keeps the value
+                        if(oy < ldm[1])
+                            l[lMem] = scalar<T>(1.0);
+                        if(ox < udm[0])
+                            u[uMem] = i[iMem];
+                    }
+                }
+            }
+        }
+    }
+}
+// Replaces *pivot (d0 1-based swap targets) with an out_sz-long 0-based index sequence.
+void convertPivot(int **pivot, int out_sz, dim_t d0)
+{
+    int* p = pinnedAlloc<int>(out_sz);
+    for(int i = 0; i < out_sz; i++)
+        p[i] = i;
+    // Replay the first d0 swaps, in order, on the identity sequence built above.
+    for(int j = 0; j < (int)d0; j++) {
+        // 1 indexed in pivot
+        std::swap(p[j], p[(*pivot)[j] - 1]);
+    }
+    // The incoming buffer is released here; the new one is handed back through *pivot.
+    pinnedFree(*pivot);
+    *pivot = p;
+}
+// Out-of-place LU on the host: factorizes a host copy of `in`, fills lower/upper/pivot.
+template<typename T>
+void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
+{
+    dim4 iDims = in.dims();
+    int M = iDims[0];
+    int N = iDims[1];
+    // in_copy is only consulted below for its element count and strides.
+    Array<T> in_copy = copyArray<T>(in);
+
+    //////////////////////////////////////////
+    // LU inplace
+    int *pivotPtr  = pinnedAlloc<int>(min(M, N));
+    T   *inPtr     = pinnedAlloc<T>  (in_copy.elements());
+    copyData(inPtr, in);
+
+    getrf_func<T>()(AF_LAPACK_COL_MAJOR, M, N,
+                    inPtr, in_copy.strides()[1],
+                    pivotPtr);
+    // convertPivot() swaps pivotPtr for a new M-element buffer and frees the old one.
+    convertPivot(&pivotPtr, M, min(M, N));
+
+    pivot = createHostDataArray<int>(af::dim4(M), pivotPtr);
+    //////////////////////////////////////////
+
+    // SPLIT into lower and upper
+    dim4 ldims(M, min(M, N));
+    dim4 udims(min(M, N), N);
+
+    T *lowerPtr = pinnedAlloc<T>(ldims.elements());
+    T *upperPtr = pinnedAlloc<T>(udims.elements());
+    // Densely packed strides for the two host output buffers.
+    dim4 lst(1, ldims[0], ldims[0] * ldims[1], ldims[0] * ldims[1] * ldims[2]);
+    dim4 ust(1, udims[0], udims[0] * udims[1], udims[0] * udims[1] * udims[2]);
+
+    lu_split<T>(lowerPtr, upperPtr, inPtr, ldims, udims, iDims,
+                lst, ust, in_copy.strides());
+
+    lower = createHostDataArray<T>(ldims, lowerPtr);
+    upper = createHostDataArray<T>(udims, upperPtr);
+
+    lower.eval();
+    upper.eval();
+    // Release every buffer obtained from pinnedAlloc above.
+    pinnedFree(lowerPtr);
+    pinnedFree(upperPtr);
+    pinnedFree(pivotPtr);
+    pinnedFree(inPtr);
+}
+// In-place host LU via getrf; returns the pivots (M if converted, else min(M, N)).
+template<typename T>
+Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
+{
+    dim4 iDims = in.dims();
+    int M = iDims[0];
+    int N = iDims[1];
+    int MN = min(M, N);
+    int *pivotPtr  = pinnedAlloc<int>(MN);
+    T   *inPtr     = pinnedAlloc<T>  (in.elements());
+    copyData(inPtr, in);
+
+    getrf_func<T>()(AF_LAPACK_COL_MAJOR, M, N,
+                    inPtr, in.strides()[1],
+                    pivotPtr);
+    // convertPivot() swaps in an M-element buffer; without it pivotPtr still
+    // holds only the MN raw getrf pivots, so the output must not read M of them.
+    if(convert_pivot) convertPivot(&pivotPtr, M, MN);
+    const int pivotLen = convert_pivot ? M : MN;
+    writeHostDataArray<T>(in, inPtr, in.elements() * sizeof(T));
+    Array<int> pivot = createHostDataArray<int>(af::dim4(pivotLen), pivotPtr);
+    pivot.eval();
+
+    pinnedFree(inPtr);
+    pinnedFree(pivotPtr);
+
+    return pivot;
+}
+
+#define INSTANTIATE_LU(T)                                                                           \
+    template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
+    template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
+
+INSTANTIATE_LU(float)
+INSTANTIATE_LU(cfloat)
+INSTANTIATE_LU(double)
+INSTANTIATE_LU(cdouble)
+
+}
+}
+
+#endif
diff --git a/src/backend/cuda/cpu_lapack/cpu_lu.hpp b/src/backend/cuda/cpu_lapack/cpu_lu.hpp
new file mode 100644
index 0000000..39a638f
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_lu.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+#pragma once   // the default argument declared below must not be seen twice per TU
+#include <Array.hpp>
+// Host-side LU entry points declared for the CUDA backend's CPU fallback.
+namespace cuda
+{
+namespace cpu
+{
+    template<typename T>   // fills lower, upper and pivot from the factorization of `in`
+    void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
+    // Factorizes `in` in place and returns the pivot array.
+    template<typename T>
+    Array<int> lu_inplace(Array<T> &in, const bool convert_pivot = true);
+}
+}
diff --git a/src/backend/cuda/cpu_lapack/lapack_helper.hpp b/src/backend/cuda/cpu_lapack/lapack_helper.hpp
new file mode 100644
index 0000000..5826587
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/lapack_helper.hpp
@@ -0,0 +1,35 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#ifndef AFCPU_LAPACK
+#define AFCPU_LAPACK
+// Naming and type shims used by the CPU LAPACK fallback sources in this directory.
+#include <types.hpp>
+// NOTE(review): presumably defined first so the LAPACKE header below adopts them - confirm.
+#define lapack_complex_float cuda::cfloat
+#define lapack_complex_double cuda::cdouble
+#define LAPACK_PREFIX LAPACKE_
+#define ORDER_TYPE int
+#define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR
+#define LAPACK_NAME(fn) LAPACKE_##fn
+// Apple: Accelerate plus lapacke.hpp, order constant forced to 0; else MKL or Netlib LAPACKE.
+#ifdef __APPLE__
+#include <Accelerate/Accelerate.h>
+#include <lapacke.hpp>
+#undef AF_LAPACK_COL_MAJOR
+#define AF_LAPACK_COL_MAJOR 0
+#else
+#ifdef USE_MKL
+#include<mkl_lapacke.h>
+#else // NETLIB LAPACKE
+#include<lapacke.h>
+#endif
+#endif
+
+#endif
diff --git a/src/backend/cuda/lu.cu b/src/backend/cuda/lu.cu
index 85dedf5..2a45d4b 100644
--- a/src/backend/cuda/lu.cu
+++ b/src/backend/cuda/lu.cu
@@ -166,6 +166,36 @@ INSTANTIATE_LU(double)
 INSTANTIATE_LU(cdouble)
 }
 
+#elif defined(WITH_CPU_LINEAR_ALGEBRA)
+////////////////////////////////////////////////////////////////////////////////
+// For versions earlier than CUDA 7, use CPU fallback
+////////////////////////////////////////////////////////////////////////////////
+#include <cpu_lapack/cpu_lu.hpp>
+// The definitions below forward the cuda:: LU entry points to cuda::cpu.
+namespace cuda
+{
+template<typename T>
+void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
+{
+    return cpu::lu(lower, upper, pivot, in);
+}
+
+template<typename T>
+Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
+{
+    return cpu::lu_inplace(in, convert_pivot);
+}
+// Explicit instantiations of both wrappers for each element type listed below.
+#define INSTANTIATE_LU(T)                                                                           \
+    template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
+    template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
+
+INSTANTIATE_LU(float)
+INSTANTIATE_LU(cfloat)
+INSTANTIATE_LU(double)
+INSTANTIATE_LU(cdouble)
+}
+
 #else
 namespace cuda
 {

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git