[arrayfire] 110/248: Added CPU fallback for CUDA LU when CUDA older than 7
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Nov 17 15:54:11 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch dfsg-clean in repository arrayfire.
commit 0ca7aebe5d6670490fd0095306017c42a9aea097
Author: Shehzan Mohammed <shehzan at arrayfire.com>
Date: Thu Oct 8 16:04:12 2015 -0400
Added CPU fallback for CUDA LU when CUDA older than 7
---
src/backend/cuda/CMakeLists.txt | 35 ++++-
src/backend/cuda/cpu_lapack/cpu_lu.cpp | 197 ++++++++++++++++++++++++++
src/backend/cuda/cpu_lapack/cpu_lu.hpp | 22 +++
src/backend/cuda/cpu_lapack/lapack_helper.hpp | 35 +++++
src/backend/cuda/lu.cu | 30 ++++
5 files changed, 318 insertions(+), 1 deletion(-)
diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt
index 696aba7..02e0b1a 100644
--- a/src/backend/cuda/CMakeLists.txt
+++ b/src/backend/cuda/CMakeLists.txt
@@ -69,7 +69,20 @@ ENDIF()
ADD_DEFINITIONS(-DAF_CUDA)
IF(${CUDA_VERSION_MAJOR} LESS 7)
- MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available." )
+ ## Try to use CPU side lapack
+ IF(APPLE)
+ FIND_PACKAGE(LAPACK)
+ ELSE(APPLE) # Linux and Windows
+ FIND_PACKAGE(LAPACKE)
+ ENDIF(APPLE)
+
+ IF(NOT LAPACK_FOUND)
+ MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available.")
+ ELSE(NOT LAPACK_FOUND)
+ MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. But CPU LAPACK libraries are available. Will fallback to using host side code.")
+ SET(CUDA_LAPACK_CPU_FALLBACK ON)
+ ADD_DEFINITIONS(-DWITH_CPU_LINEAR_ALGEBRA)
+ ENDIF()
IF(CMAKE_VERSION VERSION_LESS 3.2)
SET(CUDA_cusolver_LIBRARY)
ENDIF(CMAKE_VERSION VERSION_LESS 3.2)
@@ -97,6 +110,10 @@ INCLUDE_DIRECTORIES(
${CUDA_NVVM_INCLUDE_DIR}
)
+IF(CUDA_LAPACK_CPU_FALLBACK)
+ INCLUDE_DIRECTORIES(${LAPACK_INCLUDE_DIR})
+ENDIF()
+
FILE(GLOB cuda_headers
"*.hpp"
"*.h")
@@ -121,6 +138,16 @@ SOURCE_GROUP(backend\\cuda\\Sources FILES ${cuda_sources})
SOURCE_GROUP(backend\\cuda\\JIT FILES ${jit_sources})
SOURCE_GROUP(backend\\cuda\\kernel\\Headers FILES ${kernel_headers})
+IF(CUDA_LAPACK_CPU_FALLBACK)
+ FILE(GLOB cpu_lapack_sources
+ "cpu_lapack/*.cpp")
+ FILE(GLOB cpu_lapack_headers
+ "cpu_lapack/*.hpp")
+
+ SOURCE_GROUP(backend\\cuda\\cpu_lapack\\Headers FILES ${cpu_lapack_headers})
+ SOURCE_GROUP(backend\\cuda\\cpu_lapack\\Sources FILES ${cpu_lapack_sources})
+ENDIF()
+
FILE(GLOB backend_headers
"../*.hpp"
"../*.h"
@@ -256,6 +283,8 @@ MY_CUDA_ADD_LIBRARY(afcuda SHARED
${cuda_sources}
${jit_sources}
${kernel_headers}
+ ${cpu_lapack_headers}
+ ${cpu_lapack_sources}
${backend_headers}
${backend_sources}
${c_headers}
@@ -277,6 +306,10 @@ IF(FORGE_FOUND)
TARGET_LINK_LIBRARIES(afcuda PRIVATE ${FORGE_LIBRARIES})
ENDIF()
+IF(CUDA_LAPACK_CPU_FALLBACK)
+ TARGET_LINK_LIBRARIES(afcuda PRIVATE ${LAPACK_LIBRARIES})
+ENDIF()
+
SET_TARGET_PROPERTIES(afcuda PROPERTIES
VERSION "${AF_VERSION}"
SOVERSION "${AF_VERSION_MAJOR}")
diff --git a/src/backend/cuda/cpu_lapack/cpu_lu.cpp b/src/backend/cuda/cpu_lapack/cpu_lu.cpp
new file mode 100644
index 0000000..df7dde6
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_lu.cpp
@@ -0,0 +1,197 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <cpu_lapack/cpu_lu.hpp>
+#include <err_common.hpp>
+
+#if defined(WITH_CPU_LINEAR_ALGEBRA)
+
+#include <af/dim4.hpp>
+#include <handle.hpp>
+#include <iostream>
+#include <cassert>
+#include <err_cuda.hpp>
+
+#include "lapack_helper.hpp"
+
+namespace cuda
+{
+namespace cpu
+{
+
+template<typename T>
+using getrf_func_def = int (*)(ORDER_TYPE, int, int,
+ T*, int,
+ int*);
+
+#define LU_FUNC_DEF( FUNC ) \
+template<typename T> FUNC##_func_def<T> FUNC##_func();
+
+
+#define LU_FUNC( FUNC, TYPE, PREFIX ) \
+template<> FUNC##_func_def<TYPE> FUNC##_func<TYPE>() \
+{ return & LAPACK_NAME(PREFIX##FUNC); }
+
+LU_FUNC_DEF( getrf )
+LU_FUNC(getrf , float , s)
+LU_FUNC(getrf , double , d)
+LU_FUNC(getrf , cfloat , c)
+LU_FUNC(getrf , cdouble, z)
+
+template<typename T>
+void lu_split(T *l, T *u, const T *i,
+ const dim4 ldm, const dim4 udm, const dim4 idm,
+ const dim4 lst, const dim4 ust, const dim4 ist)
+{
+ for(dim_t ow = 0; ow < idm[3]; ow++) {
+ const dim_t lW = ow * lst[3];
+ const dim_t uW = ow * ust[3];
+ const dim_t iW = ow * ist[3];
+
+ for(dim_t oz = 0; oz < idm[2]; oz++) {
+ const dim_t lZW = lW + oz * lst[2];
+ const dim_t uZW = uW + oz * ust[2];
+ const dim_t iZW = iW + oz * ist[2];
+
+ for(dim_t oy = 0; oy < idm[1]; oy++) {
+ const dim_t lYZW = lZW + oy * lst[1];
+ const dim_t uYZW = uZW + oy * ust[1];
+ const dim_t iYZW = iZW + oy * ist[1];
+
+ for(dim_t ox = 0; ox < idm[0]; ox++) {
+ const dim_t lMem = lYZW + ox;
+ const dim_t uMem = uYZW + ox;
+ const dim_t iMem = iYZW + ox;
+ if(ox > oy) {
+ if(oy < ldm[1])
+ l[lMem] = i[iMem];
+ if(ox < udm[0])
+ u[uMem] = scalar<T>(0);
+ } else if (oy > ox) {
+ if(oy < ldm[1])
+ l[lMem] = scalar<T>(0);
+ if(ox < udm[0])
+ u[uMem] = i[iMem];
+ } else if(ox == oy) {
+ if(oy < ldm[1])
+ l[lMem] = scalar<T>(1.0);
+ if(ox < udm[0])
+ u[uMem] = i[iMem];
+ }
+ }
+ }
+ }
+ }
+}
+
+void convertPivot(int **pivot, int out_sz, dim_t d0)
+{
+ int* p = pinnedAlloc<int>(out_sz);
+ for(int i = 0; i < out_sz; i++)
+ p[i] = i;
+
+ for(int j = 0; j < (int)d0; j++) {
+ // 1 indexed in pivot
+ std::swap(p[j], p[(*pivot)[j] - 1]);
+ }
+
+ pinnedFree(*pivot);
+ *pivot = p;
+}
+
+template<typename T>
+void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
+{
+ dim4 iDims = in.dims();
+ int M = iDims[0];
+ int N = iDims[1];
+
+ Array<T> in_copy = copyArray<T>(in);
+
+ //////////////////////////////////////////
+ // LU inplace
+ int *pivotPtr = pinnedAlloc<int>(min(M, N));
+ T *inPtr = pinnedAlloc<T> (in_copy.elements());
+ copyData(inPtr, in);
+
+ getrf_func<T>()(AF_LAPACK_COL_MAJOR, M, N,
+ inPtr, in_copy.strides()[1],
+ pivotPtr);
+
+ convertPivot(&pivotPtr, M, min(M, N));
+
+ pivot = createHostDataArray<int>(af::dim4(M), pivotPtr);
+ //////////////////////////////////////////
+
+ // SPLIT into lower and upper
+ dim4 ldims(M, min(M, N));
+ dim4 udims(min(M, N), N);
+
+ T *lowerPtr = pinnedAlloc<T>(ldims.elements());
+ T *upperPtr = pinnedAlloc<T>(udims.elements());
+
+ dim4 lst(1, ldims[0], ldims[0] * ldims[1], ldims[0] * ldims[1] * ldims[2]);
+ dim4 ust(1, udims[0], udims[0] * udims[1], udims[0] * udims[1] * udims[2]);
+
+ lu_split<T>(lowerPtr, upperPtr, inPtr, ldims, udims, iDims,
+ lst, ust, in_copy.strides());
+
+ lower = createHostDataArray<T>(ldims, lowerPtr);
+ upper = createHostDataArray<T>(udims, upperPtr);
+
+ lower.eval();
+ upper.eval();
+
+ pinnedFree(lowerPtr);
+ pinnedFree(upperPtr);
+ pinnedFree(pivotPtr);
+ pinnedFree(inPtr);
+}
+
+template<typename T>
+Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
+{
+ dim4 iDims = in.dims();
+ int M = iDims[0];
+ int N = iDims[1];
+
+ int *pivotPtr = pinnedAlloc<int>(min(M, N));
+ T *inPtr = pinnedAlloc<T> (in.elements());
+ copyData(inPtr, in);
+
+ getrf_func<T>()(AF_LAPACK_COL_MAJOR, M, N,
+ inPtr, in.strides()[1],
+ pivotPtr);
+
+ if(convert_pivot) convertPivot(&pivotPtr, M, min(M, N));
+
+ writeHostDataArray<T>(in, inPtr, in.elements() * sizeof(T));
+ Array<int> pivot = createHostDataArray<int>(af::dim4(M), pivotPtr);
+
+ pivot.eval();
+
+ pinnedFree(inPtr);
+ pinnedFree(pivotPtr);
+
+ return pivot;
+}
+
+#define INSTANTIATE_LU(T) \
+ template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot); \
+ template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
+
+INSTANTIATE_LU(float)
+INSTANTIATE_LU(cfloat)
+INSTANTIATE_LU(double)
+INSTANTIATE_LU(cdouble)
+
+}
+}
+
+#endif
diff --git a/src/backend/cuda/cpu_lapack/cpu_lu.hpp b/src/backend/cuda/cpu_lapack/cpu_lu.hpp
new file mode 100644
index 0000000..39a638f
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/cpu_lu.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cuda
+{
+namespace cpu
+{
+ template<typename T>
+ void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
+
+ template<typename T>
+ Array<int> lu_inplace(Array<T> &in, const bool convert_pivot = true);
+}
+}
diff --git a/src/backend/cuda/cpu_lapack/lapack_helper.hpp b/src/backend/cuda/cpu_lapack/lapack_helper.hpp
new file mode 100644
index 0000000..5826587
--- /dev/null
+++ b/src/backend/cuda/cpu_lapack/lapack_helper.hpp
@@ -0,0 +1,35 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#ifndef AFCPU_LAPACK
+#define AFCPU_LAPACK
+
+#include <types.hpp>
+
+#define lapack_complex_float cuda::cfloat
+#define lapack_complex_double cuda::cdouble
+#define LAPACK_PREFIX LAPACKE_
+#define ORDER_TYPE int
+#define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR
+#define LAPACK_NAME(fn) LAPACKE_##fn
+
+#ifdef __APPLE__
+#include <Accelerate/Accelerate.h>
+#include <lapacke.hpp>
+#undef AF_LAPACK_COL_MAJOR
+#define AF_LAPACK_COL_MAJOR 0
+#else
+#ifdef USE_MKL
+#include<mkl_lapacke.h>
+#else // NETLIB LAPACKE
+#include<lapacke.h>
+#endif
+#endif
+
+#endif
diff --git a/src/backend/cuda/lu.cu b/src/backend/cuda/lu.cu
index 85dedf5..2a45d4b 100644
--- a/src/backend/cuda/lu.cu
+++ b/src/backend/cuda/lu.cu
@@ -166,6 +166,36 @@ INSTANTIATE_LU(double)
INSTANTIATE_LU(cdouble)
}
+#elif defined(WITH_CPU_LINEAR_ALGEBRA)
+////////////////////////////////////////////////////////////////////////////////
+// For versions earlier than CUDA 7, use CPU fallback
+////////////////////////////////////////////////////////////////////////////////
+#include <cpu_lapack/cpu_lu.hpp>
+
+namespace cuda
+{
+template<typename T>
+void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
+{
+ return cpu::lu(lower, upper, pivot, in);
+}
+
+template<typename T>
+Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
+{
+ return cpu::lu_inplace(in, convert_pivot);
+}
+
+#define INSTANTIATE_LU(T) \
+ template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot); \
+ template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
+
+INSTANTIATE_LU(float)
+INSTANTIATE_LU(cfloat)
+INSTANTIATE_LU(double)
+INSTANTIATE_LU(cdouble)
+}
+
#else
namespace cuda
{
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits mailing list