[arrayfire] 295/408: Cleaning up cpu blas / lapack in OpenCL backend

Mon Sep 21 19:12:17 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit 20b5f5183c4e4bb28babf4119e5a84b7f48a5bad
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date:   Wed Aug 19 22:54:36 2015 -0400

    Cleaning up cpu blas / lapack in OpenCL backend
---
 src/api/c/svd.cpp                           |   6 --
 src/backend/cblas.cpp                       |  74 +++++++++++--------
 src/backend/opencl/CMakeLists.txt           |  22 +++++-
 src/backend/opencl/magma/geqrf2.cpp         |  10 +--
 src/backend/opencl/magma/geqrf3.cpp         |  10 +--
 src/backend/opencl/magma/getrf.cpp          |  40 +++++------
 src/backend/opencl/magma/getrs.cpp          |  30 ++++----
 src/backend/opencl/magma/larfb.cpp          |  31 ++++----
 src/backend/opencl/magma/magma_blas.h       |  73 ++++++++-----------
 src/backend/opencl/magma/magma_cpu_blas.h   |  90 +++++++++++++++++++++++
 src/backend/opencl/magma/magma_cpu_lapack.h | 107 +++++++++++++---------------
 src/backend/opencl/magma/magma_helper.cpp   |  22 ++++++
 src/backend/opencl/magma/magma_helper.h     |   3 +
 src/backend/opencl/magma/potrf.cpp          |  40 +++++------
 src/backend/opencl/magma/ungqr.cpp          |   4 +-
 src/backend/opencl/magma/unmqr.cpp          |   6 +-
 src/backend/opencl/magma/unmqr2.cpp         |   4 +-
 src/backend/opencl/solve.cpp                |  14 ++--
 18 files changed, 355 insertions(+), 231 deletions(-)

diff --git a/src/api/c/svd.cpp b/src/api/c/svd.cpp
index 31f9aae..fc465dd 100644
--- a/src/api/c/svd.cpp
+++ b/src/api/c/svd.cpp
@@ -12,17 +12,11 @@
 #include <af/lapack.h>
 
 #include <af/util.h>
-
 #include <af/defines.h>
-
 #include <err_common.hpp>
-
 #include <backend.hpp>
-
 #include <Array.hpp>
-
 #include <handle.hpp>
-
 #include <svd.hpp>
 
 using namespace detail;
diff --git a/src/backend/cblas.cpp b/src/backend/cblas.cpp
index 1b582c5..5400740 100644
--- a/src/backend/cblas.cpp
+++ b/src/backend/cblas.cpp
@@ -23,34 +23,52 @@ static char transChar(CBLAS_TRANSPOSE Trans)
     }
 }
 
-#define GEMM_F77(X, TS, TV, TY)                                                     \
-void cblas_##X##gemm(                                                               \
-       const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA,                       \
-       const CBLAS_TRANSPOSE TransB, const int M, const int N,                      \
-       const int K, const TS alpha, const TV *A,                                    \
-       const int lda, const TV *B, const int ldb,                                   \
-       const TS beta, TV *C, const int ldc)                                         \
-{                                                                                   \
-    char aT = transChar(TransA);                                                    \
-    char bT = transChar(TransB);                                                    \
-    X##gemm_(&aT, &bT, &M, &N, &K,                                                  \
-            (const TY *)ADDR(alpha), (const TY *)A, &lda,                           \
-            (const TY *)B, &ldb,                                                    \
-            (const TY *)ADDR(beta), (TY *)C, &ldc);                                 \
-}                                                                                   \
-void cblas_##X##gemv(                                                               \
-        const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA,                      \
-        const int M, const int N,                                                   \
-        const TS alpha, const TV *A, const int lda,                                 \
-        const TV *X, const int incX, const TS beta,                                 \
-        TV *Y, const int incY)                                                      \
-{                                                                                   \
-    char aT = transChar(TransA);                                                    \
-    X##gemv_(&aT, &M, &N,                                                           \
-            (const TY *)ADDR(alpha), (const TY *)A, &lda,                           \
-            (const TY *)X, &incX,                                                   \
-            (const TY *)ADDR(beta), (TY *)Y, &incY);                                \
-}                                                                                   \
+#define GEMM_F77(X, TS, TV, TY)                                 \
+    void cblas_##X##gemm(                                       \
+        const CBLAS_ORDER Order, const CBLAS_TRANSPOSE TransA,  \
+        const CBLAS_TRANSPOSE TransB, const int M, const int N, \
+        const int K, const TS alpha, const TV *A,               \
+        const int lda, const TV *B, const int ldb,              \
+        const TS beta, TV *C, const int ldc)                    \
+    {                                                           \
+        char aT = transChar(TransA);                            \
+        char bT = transChar(TransB);                            \
+        X##gemm_(&aT, &bT, &M, &N, &K,                          \
+                 (const TY *)ADDR(alpha), (const TY *)A, &lda,  \
+                 (const TY *)B, &ldb,                           \
+                 (const TY *)ADDR(beta), (TY *)C, &ldc);        \
+    }                                                           \
+    void cblas_##X##gemv(                                       \
+        const CBLAS_ORDER order, const CBLAS_TRANSPOSE TransA,  \
+        const int M, const int N,                               \
+        const TS alpha, const TV *A, const int lda,             \
+        const TV *X, const int incX, const TS beta,             \
+        TV *Y, const int incY)                                  \
+    {                                                           \
+        char aT = transChar(TransA);                            \
+        X##gemv_(&aT, &M, &N,                                   \
+                 (const TY *)ADDR(alpha), (const TY *)A, &lda,  \
+                 (const TY *)X, &incX,                          \
+                 (const TY *)ADDR(beta), (TY *)Y, &incY);       \
+    }                                                           \
+    void cblas_##X##axpy(                                       \
+        const int N, const TS alpha,                            \
+        const TV *X, const int incX,                            \
+        TV *Y, const int incY)                                  \
+    {                                                           \
+        X##axpy_(&N,                                            \
+                 (const TY *)ADDR(alpha),                       \
+                 (const TY *)X, &incX,                          \
+                 (TY *)Y, &incY);                               \
+    }                                                           \
+    void cblas_##X##scal(                                       \
+        const int N, const TS alpha,                            \
+        TV *X, const int incX)                                  \
+    {                                                           \
+        X##scal_(&N,                                            \
+                 (const TY *)ADDR(alpha),                       \
+                 (TY *)X, &incX);                               \
+    }                                                           \
 
 #define ADDR(val) &val
 GEMM_F77(s, float, float, float)
diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt
index 3c6bc37..767a292 100644
--- a/src/backend/opencl/CMakeLists.txt
+++ b/src/backend/opencl/CMakeLists.txt
@@ -24,7 +24,22 @@ ENDIF(APPLE)
 IF(NOT LAPACK_FOUND)
     MESSAGE(WARNING "LAPACK not found. Functionality will be disabled")
 ELSE(NOT LAPACK_FOUND)
-    ADD_DEFINITIONS(-DWITH_OPENCL_LINEAR_ALGEBRA)
+  ADD_DEFINITIONS(-DWITH_OPENCL_LINEAR_ALGEBRA)
+
+  IF(NOT USE_OPENCL_MKL)
+    FIND_PACKAGE(CBLAS REQUIRED)
+
+    IF(USE_CPU_F77_BLAS)
+      MESSAGE("Using F77 BLAS")
+      ADD_DEFINITIONS(-DUSE_F77_BLAS)
+    ENDIF()
+
+    IF (NOT CBLAS_LIBRARIES)
+      MESSAGE(SEND_ERROR "CBLAS Library not set")
+    ELSE()
+      MESSAGE(STATUS "Using CBLAS Library: ${CBLAS_LIBRARIES}")
+    ENDIF()
+  ENDIF()
 ENDIF()
 
 IF(NOT UNIX)
@@ -75,6 +90,7 @@ INCLUDE_DIRECTORIES(
     ${CLFFT_INCLUDE_DIRS}
     ${Boost_INCLUDE_DIR}
     ${BoostCompute_INCLUDE_DIRS}
+    ${CBLAS_INCLUDE_DIR}
     ${LAPACK_INCLUDE_DIR}
     )
 
@@ -243,7 +259,9 @@ IF(FORGE_FOUND AND NOT USE_SYSTEM_FORGE)
 ENDIF()
 
 IF(LAPACK_FOUND)
-   TARGET_LINK_LIBRARIES(afopencl   PRIVATE ${LAPACK_LIBRARIES})
+  TARGET_LINK_LIBRARIES(afopencl
+    PRIVATE ${LAPACK_LIBRARIES}
+    PRIVATE ${CBLAS_LIBRARIES})
 ENDIF()
 
 SET_TARGET_PROPERTIES(afopencl PROPERTIES
diff --git a/src/backend/opencl/magma/geqrf2.cpp b/src/backend/opencl/magma/geqrf2.cpp
index 3784eda..4041976 100644
--- a/src/backend/opencl/magma/geqrf2.cpp
+++ b/src/backend/opencl/magma/geqrf2.cpp
@@ -245,8 +245,8 @@ magma_geqrf2_gpu(
                                    0, lwork*sizeof(Ty),
                                    0, NULL, NULL, NULL);
 
-    geqrf_work_func<Ty> cpu_geqrf;
-    larft_func<Ty> cpu_larft;
+    cpu_geqrf_work_func<Ty> cpu_geqrf;
+    cpu_larft_func<Ty> cpu_larft;
 
     nbmin = 2;
     nx    = nb;
@@ -275,11 +275,11 @@ magma_geqrf2_gpu(
             }
 
             magma_queue_sync(queue[0]);
-            *info = cpu_geqrf(LAPACK_COL_MAJOR, rows, ib, work(i), ldwork, tau+i, hwork, lhwork);
+            *info = cpu_geqrf( rows, ib, work(i), ldwork, tau+i, hwork, lhwork);
 
             /* Form the triangular factor of the block reflector
                H = H(i) H(i+1) . . . H(i+ib-1) */
-            cpu_larft(LAPACK_COL_MAJOR,
+            cpu_larft(
                       *MagmaForwardStr, *MagmaColumnwiseStr,
                       rows, ib,
                       work(i), ldwork, tau+i, hwork, ib);
@@ -329,7 +329,7 @@ magma_geqrf2_gpu(
         magma_queue_sync(queue[1]);
 
         lhwork = lwork - rows*ib;
-        *info = cpu_geqrf(LAPACK_COL_MAJOR, rows, ib, work, rows, tau+i, work+ib*rows, lhwork);
+        *info = cpu_geqrf( rows, ib, work, rows, tau+i, work+ib*rows, lhwork);
 
         magma_setmatrix_async<Ty>(rows, ib, work, rows, dA(i, i), ldda, queue[1], NULL);
     }
diff --git a/src/backend/opencl/magma/geqrf3.cpp b/src/backend/opencl/magma/geqrf3.cpp
index ce7a1c9..192bd45 100644
--- a/src/backend/opencl/magma/geqrf3.cpp
+++ b/src/backend/opencl/magma/geqrf3.cpp
@@ -217,8 +217,8 @@ magma_geqrf3_gpu(
     ldwork = m;
     lddwork= n;
 
-    geqrf_work_func<Ty> cpu_geqrf;
-    larft_func<Ty> cpu_larft;
+    cpu_geqrf_work_func<Ty> cpu_geqrf;
+    cpu_larft_func<Ty> cpu_larft;
 
     if ( (nb > 1) && (nb < k) ) {
         /* Use blocked code initially */
@@ -244,11 +244,11 @@ magma_geqrf3_gpu(
             }
 
             magma_event_sync(event[1]);
-            *info = cpu_geqrf(LAPACK_COL_MAJOR, rows, ib, work_ref(i), ldwork, tau+i, hwork, lhwork);
+            *info = cpu_geqrf( rows, ib, work_ref(i), ldwork, tau+i, hwork, lhwork);
 
             /* Form the triangular factor of the block reflector
                H = H(i) H(i+1) . . . H(i+ib-1) */
-            cpu_larft(LAPACK_COL_MAJOR,
+            cpu_larft(
                       *MagmaForwardStr, *MagmaColumnwiseStr,
                       rows, ib,
                       work_ref(i), ldwork,
@@ -296,7 +296,7 @@ magma_geqrf3_gpu(
         magma_getmatrix<Ty>( rows, ib, a_ref(i, i), ldda, work, rows, queue );
 
         lhwork = lwork - rows*ib;
-        *info = cpu_geqrf(LAPACK_COL_MAJOR, rows, ib, work, rows, tau+i, work+ib*rows, lhwork);
+        *info = cpu_geqrf( rows, ib, work, rows, tau+i, work+ib*rows, lhwork);
 
         magma_setmatrix<Ty>( rows, ib, work, rows, a_ref(i, i), ldda, queue );
     }
diff --git a/src/backend/opencl/magma/getrf.cpp b/src/backend/opencl/magma/getrf.cpp
index a79bd7c..b398afd 100644
--- a/src/backend/opencl/magma/getrf.cpp
+++ b/src/backend/opencl/magma/getrf.cpp
@@ -31,22 +31,22 @@
  * * Redistributions  of  source  code  must  retain  the above copyright
  *   notice,  this  list  of  conditions  and  the  following  disclaimer.
  * * Redistributions  in  binary  form must reproduce the above copyright
- *   notice,  this list of conditions and the following disclaimer in the 
+ *   notice,  this list of conditions and the following disclaimer in the
  *   documentation  and/or other materials provided with the distribution.
- * * Neither  the  name of the University of Tennessee, Knoxville nor the 
+ * * Neither  the  name of the University of Tennessee, Knoxville nor the
  *   names of its contributors may be used to endorse or promote products
  *   derived from this software without specific prior written permission.
  *
  * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
- * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
+ * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT 
+ * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
  * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
+ * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+ * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **********************************************************************/
@@ -149,9 +149,9 @@ magma_int_t magma_getrf_gpu(
     if (m == 0 || n == 0)
         return *info;
 
-    gemm_func<Ty> gpu_gemm;
-    trsm_func<Ty> gpu_trsm;
-    getrf_func<Ty> cpu_getrf;
+    gpu_gemm_func<Ty> gpu_gemm;
+    gpu_trsm_func<Ty> gpu_trsm;
+    cpu_getrf_func<Ty> cpu_getrf;
 
     /* Function Body */
     mindim = std::min(m, n);
@@ -165,7 +165,7 @@ magma_int_t magma_getrf_gpu(
             return *info;
         }
         magma_getmatrix<Ty>(m, n, dA(0,0), ldda, work(0), m, queue);
-        cpu_getrf(LAPACK_COL_MAJOR, m, n, work, m, ipiv);
+        cpu_getrf( m, n, work, m, ipiv);
         magma_setmatrix<Ty>(m, n, work(0), m, dA(0,0), ldda, queue);
         magma_free_cpu(work);
     }
@@ -219,7 +219,7 @@ magma_int_t magma_getrf_gpu(
             magma_getmatrix<Ty>(m-j*nb, nb, dAP(0,0), maxm, work(0), ldwork, queue);
 
             if (j > 0 && n > (j + 1) * nb) {
-                gpu_trsm(clblasColumnMajor,
+                gpu_trsm(
                          clblasRight, clblasUpper, clblasNoTrans, clblasUnit,
                          n - (j+1)*nb, nb,
                          c_one,
@@ -228,7 +228,7 @@ magma_int_t magma_getrf_gpu(
                          1, &queue, 0, nullptr, &event);
 
                 if (m > j * nb)  {
-                    gpu_gemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans,
+                    gpu_gemm( clblasNoTrans, clblasNoTrans,
                          n-(j+1)*nb, m-j*nb, nb,
                          c_neg_one,
                          dAT(j-1,j+1), lddat,
@@ -241,7 +241,7 @@ magma_int_t magma_getrf_gpu(
 
             // do the cpu part
             rows = m - j*nb;
-            cpu_getrf(LAPACK_COL_MAJOR, rows, nb, work, ldwork, ipiv+j*nb);
+            cpu_getrf( rows, nb, work, ldwork, ipiv+j*nb);
             if (*info == 0 && iinfo > 0)
                 *info = iinfo + j*nb;
 
@@ -257,7 +257,7 @@ magma_int_t magma_getrf_gpu(
 
             // do the small non-parallel computations (next panel update)
             if (s > (j+1)) {
-                gpu_trsm(clblasColumnMajor,
+                gpu_trsm(
                          clblasRight, clblasUpper, clblasNoTrans, clblasUnit,
                          nb, nb,
                          c_one,
@@ -266,7 +266,7 @@ magma_int_t magma_getrf_gpu(
                          1, &queue, 0, nullptr, &event);
 
 
-                gpu_gemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans,
+                gpu_gemm( clblasNoTrans, clblasNoTrans,
                          nb, m-(j+1)*nb, nb,
                          c_neg_one,
                          dAT(j,   j+1), lddat,
@@ -277,7 +277,7 @@ magma_int_t magma_getrf_gpu(
             }
             else {
                 if (n > s * nb) {
-                    gpu_trsm(clblasColumnMajor,
+                    gpu_trsm(
                              clblasRight, clblasUpper, clblasNoTrans, clblasUnit,
                              n-s*nb, nb,
                              c_one,
@@ -287,7 +287,7 @@ magma_int_t magma_getrf_gpu(
                 }
 
                 if ((n > (j+1) * nb) && (m > (j+1) * nb)) {
-                    gpu_gemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans,
+                    gpu_gemm( clblasNoTrans, clblasNoTrans,
                              n-(j+1)*nb, m-(j+1)*nb, nb,
                              c_neg_one,
                              dAT(j,   j+1), lddat,
@@ -308,7 +308,7 @@ magma_int_t magma_getrf_gpu(
             magma_getmatrix<Ty>(rows, nb0, dAP(0,0), maxm, work(0), ldwork, queue);
 
             // do the cpu part
-            cpu_getrf(LAPACK_COL_MAJOR, rows, nb0, work, ldwork, ipiv+s*nb);
+            cpu_getrf( rows, nb0, work, ldwork, ipiv+s*nb);
             if (*info == 0 && iinfo > 0)
                 *info = iinfo + s*nb;
 
@@ -322,7 +322,7 @@ magma_int_t magma_getrf_gpu(
             magmablas_transpose<Ty>(rows, nb0, dAP(0,0), maxm, dAT(s,s), lddat, queue);
 
             if (n > s * nb + nb0) {
-                gpu_trsm(clblasColumnMajor,
+                gpu_trsm(
                          clblasRight, clblasUpper, clblasNoTrans, clblasUnit,
                          n-s*nb-nb0, nb0,
                          c_one, dAT(s,s),     lddat,
diff --git a/src/backend/opencl/magma/getrs.cpp b/src/backend/opencl/magma/getrs.cpp
index 3b83179..6ad943b 100644
--- a/src/backend/opencl/magma/getrs.cpp
+++ b/src/backend/opencl/magma/getrs.cpp
@@ -159,9 +159,9 @@ magma_getrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs,
     i1 = 1;
     i2 = n;
 
-    laswp_func<Ty> cpu_laswp;
-    trsm_func<Ty> gpu_trsm;
-    trsv_func<Ty> gpu_trsv;
+    cpu_laswp_func<Ty> cpu_laswp;
+    gpu_trsm_func<Ty> gpu_trsm;
+    gpu_trsv_func<Ty> gpu_trsv;
 
     cl_event event = NULL;
 
@@ -180,18 +180,18 @@ magma_getrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs,
 
         /* Solve A * X = B. */
         magma_getmatrix<Ty>( n, nrhs, dB, dB_offset, lddb, work, n, queue );
-        cpu_laswp(LAPACK_COL_MAJOR, nrhs, work, n, i1, i2, ipiv, inc);
+        cpu_laswp( nrhs, work, n, i1, i2, ipiv, inc);
         magma_setmatrix<Ty>( n, nrhs, work, n, dB, dB_offset, lddb, queue );
         if ( nrhs == 1) {
-            gpu_trsv(clblasColumnMajor, clblasLower, clblasNoTrans, clblasUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, 1, &queue, 0, nullptr, &event);
-            gpu_trsv(clblasColumnMajor, clblasUpper, clblasNoTrans, clblasNonUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, 1, &queue, 0, nullptr, &event);
+            gpu_trsv( clblasLower, clblasNoTrans, clblasUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, 1, &queue, 0, nullptr, &event);
+            gpu_trsv( clblasUpper, clblasNoTrans, clblasNonUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, 1, &queue, 0, nullptr, &event);
         } else {
-            gpu_trsm(clblasColumnMajor, clblasLeft, clblasLower, clblasNoTrans, clblasUnit, n, nrhs, c_one, dA, dA_offset, ldda, dB, dB_offset, lddb, 1, &queue, 0, nullptr, &event);
+            gpu_trsm( clblasLeft, clblasLower, clblasNoTrans, clblasUnit, n, nrhs, c_one, dA, dA_offset, ldda, dB, dB_offset, lddb, 1, &queue, 0, nullptr, &event);
 
             if(cond) {
-                gpu_trsm(clblasColumnMajor, clblasLeft, clblasLower, clblasTrans, clblasNonUnit, n, nrhs, c_one, dAT, 0, n, dB, dB_offset, lddb, 1, &queue, 0, nullptr, &event);
+                gpu_trsm( clblasLeft, clblasLower, clblasTrans, clblasNonUnit, n, nrhs, c_one, dAT, 0, n, dB, dB_offset, lddb, 1, &queue, 0, nullptr, &event);
             } else {
-                gpu_trsm(clblasColumnMajor, clblasLeft, clblasUpper, clblasNoTrans, clblasNonUnit, n, nrhs, c_one, dA, dA_offset, ldda, dB, dB_offset, lddb, 1, &queue, 0, nullptr, &event);
+                gpu_trsm( clblasLeft, clblasUpper, clblasNoTrans, clblasNonUnit, n, nrhs, c_one, dA, dA_offset, ldda, dB, dB_offset, lddb, 1, &queue, 0, nullptr, &event);
             }
         }
     } else {
@@ -199,18 +199,18 @@ magma_getrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs,
 
         /* Solve A' * X = B. */
         if ( nrhs == 1) {
-            gpu_trsv(clblasColumnMajor, clblasUpper, cltrans, clblasNonUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, 1, &queue, 0, nullptr, &event);
-            gpu_trsv(clblasColumnMajor, clblasLower, cltrans, clblasUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, 1, &queue, 0, nullptr, &event);
+            gpu_trsv( clblasUpper, cltrans, clblasNonUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, 1, &queue, 0, nullptr, &event);
+            gpu_trsv( clblasLower, cltrans, clblasUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, 1, &queue, 0, nullptr, &event);
         } else {
             if(cond) {
-                gpu_trsm(clblasColumnMajor, clblasLeft, clblasLower, clblasNoTrans, clblasNonUnit, n, nrhs, c_one, dAT, 0, n, dB, dB_offset, lddb, 1, &queue, 0, nullptr, &event);
+                gpu_trsm( clblasLeft, clblasLower, clblasNoTrans, clblasNonUnit, n, nrhs, c_one, dAT, 0, n, dB, dB_offset, lddb, 1, &queue, 0, nullptr, &event);
             } else {
-                gpu_trsm(clblasColumnMajor, clblasLeft, clblasUpper, cltrans, clblasNonUnit, n, nrhs, c_one, dA, dA_offset, ldda, dB, dB_offset, lddb, 1, &queue, 0, nullptr, &event);
+                gpu_trsm( clblasLeft, clblasUpper, cltrans, clblasNonUnit, n, nrhs, c_one, dA, dA_offset, ldda, dB, dB_offset, lddb, 1, &queue, 0, nullptr, &event);
             }
-            gpu_trsm(clblasColumnMajor, clblasLeft, clblasLower, cltrans, clblasUnit, n, nrhs, c_one, dA, dA_offset, ldda, dB, dB_offset, lddb, 1, &queue, 0, nullptr, &event);
+            gpu_trsm( clblasLeft, clblasLower, cltrans, clblasUnit, n, nrhs, c_one, dA, dA_offset, ldda, dB, dB_offset, lddb, 1, &queue, 0, nullptr, &event);
         }
         magma_getmatrix<Ty>( n, nrhs, dB, dB_offset, lddb, work, n, queue );
-        cpu_laswp(LAPACK_COL_MAJOR, nrhs, work, n, i1, i2, ipiv, inc);
+        cpu_laswp( nrhs, work, n, i1, i2, ipiv, inc);
         magma_setmatrix<Ty>( n, nrhs, work, n, dB, dB_offset, lddb, queue );
     }
 
diff --git a/src/backend/opencl/magma/larfb.cpp b/src/backend/opencl/magma/larfb.cpp
index 747e16a..5b188f4 100644
--- a/src/backend/opencl/magma/larfb.cpp
+++ b/src/backend/opencl/magma/larfb.cpp
@@ -33,22 +33,22 @@
  * * Redistributions  of  source  code  must  retain  the above copyright
  *   notice,  this  list  of  conditions  and  the  following  disclaimer.
  * * Redistributions  in  binary  form must reproduce the above copyright
- *   notice,  this list of conditions and the following disclaimer in the 
+ *   notice,  this list of conditions and the following disclaimer in the
  *   documentation  and/or other materials provided with the distribution.
- * * Neither  the  name of the University of Tennessee, Knoxville nor the 
+ * * Neither  the  name of the University of Tennessee, Knoxville nor the
  *   names of its contributors may be used to endorse or promote products
  *   derived from this software without specific prior written permission.
  *
  * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
- * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
+ * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT 
+ * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
  * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
+ * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+ * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **********************************************************************/
@@ -56,7 +56,6 @@
 #include "magma.h"
 #include "magma_blas.h"
 #include "magma_data.h"
-#include "magma_cpu_lapack.h"
 #include "magma_helper.h"
 #include "magma_sync.h"
 
@@ -255,8 +254,8 @@ magma_larfb_gpu(
         transV   = clblasNoTrans;
     }
 
-    gemm_func<Ty> gpu_gemm;
-    trmm_func<Ty> gpu_trmm;
+    gpu_gemm_func<Ty> gpu_gemm;
+    gpu_trmm_func<Ty> gpu_trmm;
 
     cl_event event = NULL;
 
@@ -265,7 +264,7 @@ magma_larfb_gpu(
         // Comments assume H C. When forming H^H C, T gets transposed via transt.
 
         // W = C^H V
-        gpu_gemm(clblasColumnMajor,
+        gpu_gemm(
                  transType, notransV,
                  n, k, m,
                  c_one,
@@ -276,7 +275,7 @@ magma_larfb_gpu(
                  1, &queue, 0, nullptr, &event);
 
         // W = W T^H = C^H V T^H
-        gpu_trmm(clblasColumnMajor,
+        gpu_trmm(
                  clblasRight,
                  uplo, transt, clblasNonUnit,
                  n, k,
@@ -286,7 +285,7 @@ magma_larfb_gpu(
                  1, &queue, 0, nullptr, &event);
 
         // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C
-        gpu_gemm(clblasColumnMajor,
+        gpu_gemm(
                  notransV, transType,
                  m, n, k,
                  c_neg_one,
@@ -301,7 +300,7 @@ magma_larfb_gpu(
         // Comments assume C H. When forming C H^H, T gets transposed via trans.
 
         // W = C V
-        gpu_gemm(clblasColumnMajor,
+        gpu_gemm(
                  clblasNoTrans, notransV,
                  m, k, n,
                  c_one,
@@ -312,7 +311,7 @@ magma_larfb_gpu(
                  1, &queue, 0, nullptr, &event);
 
         // W = W T = C V T
-        gpu_trmm(clblasColumnMajor,
+        gpu_trmm(
                  clblasRight, uplo,
                  cltrans,
                  clblasNonUnit,
@@ -323,7 +322,7 @@ magma_larfb_gpu(
                  1, &queue, 0, nullptr, &event);
 
         // C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H
-        gpu_gemm(clblasColumnMajor,
+        gpu_gemm(
                  clblasNoTrans, transV,
                  m, n, k,
                  c_neg_one,
diff --git a/src/backend/opencl/magma/magma_blas.h b/src/backend/opencl/magma/magma_blas.h
index 8314bb7..44ebc03 100644
--- a/src/backend/opencl/magma/magma_blas.h
+++ b/src/backend/opencl/magma/magma_blas.h
@@ -19,53 +19,38 @@
 using opencl::cfloat;
 using opencl::cdouble;
 
+#define clblasSherk(...) clblasSsyrk(__VA_ARGS__)
+#define clblasDherk(...) clblasDsyrk(__VA_ARGS__)
+
 #define BLAS_FUNC_DEF(NAME)                     \
     template<typename T>                        \
-    struct NAME##_func;
-
-#define BLAS_FUNC(NAME, TYPE, PREFIX)                       \
-    template<>                                              \
-    struct NAME##_func<TYPE>                                \
-    {                                                       \
-        template<typename... Args>                          \
-            void                                            \
-            operator() (Args... args)                       \
-        {                                                   \
-            CLBLAS_CHECK(clblas##PREFIX##NAME(args...));    \
-        }                                                   \
+    struct gpu_##NAME##_func;
+
+#define BLAS_FUNC(NAME, TYPE, PREFIX)                               \
+    template<>                                                      \
+    struct gpu_##NAME##_func<TYPE>                                  \
+    {                                                               \
+        template<typename... Args>                                  \
+            void                                                    \
+            operator() (Args... args)                               \
+        {                                                           \
+            CLBLAS_CHECK(clblas##PREFIX##NAME(clblasColumnMajor,    \
+                                              args...));            \
+        }                                                           \
     };
 
-BLAS_FUNC_DEF(gemm)
-BLAS_FUNC(gemm, float,      S)
-BLAS_FUNC(gemm, double,     D)
-BLAS_FUNC(gemm, cfloat,     C)
-BLAS_FUNC(gemm, cdouble,    Z)
-
-BLAS_FUNC_DEF(trmm)
-BLAS_FUNC(trmm, float,      S)
-BLAS_FUNC(trmm, double,     D)
-BLAS_FUNC(trmm, cfloat,     C)
-BLAS_FUNC(trmm, cdouble,    Z)
-
-BLAS_FUNC_DEF(trsm)
-BLAS_FUNC(trsm, float,      S)
-BLAS_FUNC(trsm, double,     D)
-BLAS_FUNC(trsm, cfloat,     C)
-BLAS_FUNC(trsm, cdouble,    Z)
-
-BLAS_FUNC_DEF(trsv)
-BLAS_FUNC(trsv, float,      S)
-BLAS_FUNC(trsv, double,     D)
-BLAS_FUNC(trsv, cfloat,     C)
-BLAS_FUNC(trsv, cdouble,    Z)
-
-#define clblasSherk(...) clblasSsyrk(__VA_ARGS__)
-#define clblasDherk(...) clblasDsyrk(__VA_ARGS__)
-
-BLAS_FUNC_DEF(herk)
-BLAS_FUNC(herk, float,      S)
-BLAS_FUNC(herk, double,     D)
-BLAS_FUNC(herk, cfloat,     C)
-BLAS_FUNC(herk, cdouble,    Z)
+#define BLAS_FUNC_DECL(NAME)                    \
+    BLAS_FUNC_DEF(NAME)                         \
+    BLAS_FUNC(NAME, float,      S)              \
+    BLAS_FUNC(NAME, double,     D)              \
+    BLAS_FUNC(NAME, cfloat,     C)              \
+    BLAS_FUNC(NAME, cdouble,    Z)              \
+
+BLAS_FUNC_DECL(gemm)
+BLAS_FUNC_DECL(gemv)
+BLAS_FUNC_DECL(trmm)
+BLAS_FUNC_DECL(trsm)
+BLAS_FUNC_DECL(trsv)
+BLAS_FUNC_DECL(herk)
 
 #endif
diff --git a/src/backend/opencl/magma/magma_cpu_blas.h b/src/backend/opencl/magma/magma_cpu_blas.h
new file mode 100644
index 0000000..f5df93d
--- /dev/null
+++ b/src/backend/opencl/magma/magma_cpu_blas.h
@@ -0,0 +1,90 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#ifndef MAGMA_CPU_BLAS
+#define MAGMA_CPU_BLAS
+
+#include "magma_types.h"
+
+#ifdef __APPLE__
+#include <Accelerate/Accelerate.h>
+#else
+#ifdef USE_MKL
+#include <mkl_cblas.h>
+#else
+extern "C" {
+#include <cblas.h>
+}
+#endif
+#endif
+
+// TODO: Ask upstream for a more official way to detect it
+#ifdef OPENBLAS_CONST
+#define IS_OPENBLAS
+#endif
+
+// Make sure we get the correct type signature for OpenBLAS
+// OpenBLAS defines blasint as its index type. Emulate this
+// if we're not dealing with openblas and use it where applicable
+#ifndef IS_OPENBLAS
+typedef int blasint;
+#endif
+
+#define CPU_BLAS_FUNC_DEF(NAME)                 \
+    template<typename T>                        \
+    struct cpu_##NAME##_func;
+
+#define CPU_BLAS_FUNC1(NAME, TYPE, X)                       \
+    template<>                                              \
+    struct cpu_##NAME##_func<TYPE>                          \
+    {                                                       \
+        template<typename... Args>                          \
+            void                                            \
+            operator() (Args... args)                       \
+        { return cblas_##X##NAME(CblasColMajor, args...); } \
+    };
+
+#define CPU_BLAS_FUNC2(NAME, TYPE, X)                       \
+    template<>                                              \
+    struct cpu_##NAME##_func<TYPE>                          \
+    {                                                       \
+        template<typename... Args>                          \
+            void                                            \
+            operator() (Args... args)                       \
+        { return cblas_##X##NAME(args...); }                \
+    };
+
+#define CPU_BLAS_DECL1(NAME)                         \
+    CPU_BLAS_FUNC_DEF(NAME)                         \
+    CPU_BLAS_FUNC1(NAME, float,      s)             \
+    CPU_BLAS_FUNC1(NAME, double,     d)             \
+    CPU_BLAS_FUNC1(NAME, magmaFloatComplex,     c)  \
+    CPU_BLAS_FUNC1(NAME, magmaDoubleComplex,    z)  \
+
+#define CPU_BLAS_DECL2(NAME)                         \
+    CPU_BLAS_FUNC_DEF(NAME)                         \
+    CPU_BLAS_FUNC2(NAME, float,      s)             \
+    CPU_BLAS_FUNC2(NAME, double,     d)             \
+    CPU_BLAS_FUNC2(NAME, magmaFloatComplex,     c)  \
+    CPU_BLAS_FUNC2(NAME, magmaDoubleComplex,    z)  \
+
+CPU_BLAS_DECL1(gemv)
+CPU_BLAS_DECL2(scal)
+CPU_BLAS_DECL2(axpy)
+
+inline float * cblas_ptr(float *in) { return in; }
+inline double * cblas_ptr(double *in) { return in; }
+inline void * cblas_ptr(magmaFloatComplex *in) { return (void *)in; }
+inline void * cblas_ptr(magmaDoubleComplex *in) { return (void *)in; }
+
+inline float cblas_scalar(float *in) { return *in; }
+inline double cblas_scalar(double *in) { return *in; }
+inline void *cblas_scalar(magmaFloatComplex *in) { return (void *)in; }
+inline void *cblas_scalar(magmaDoubleComplex *in) { return (void *)in; }
+#endif
diff --git a/src/backend/opencl/magma/magma_cpu_lapack.h b/src/backend/opencl/magma/magma_cpu_lapack.h
index 4410e1b..d406508 100644
--- a/src/backend/opencl/magma/magma_cpu_lapack.h
+++ b/src/backend/opencl/magma/magma_cpu_lapack.h
@@ -17,6 +17,12 @@
 #define LAPACKE_sungqr_work(...) LAPACKE_sorgqr_work(__VA_ARGS__)
 #define LAPACKE_dungqr_work(...) LAPACKE_dorgqr_work(__VA_ARGS__)
 
+template<typename... Args>
+int LAPACKE_slacgv(Args... args) { return 0; }
+
+template<typename... Args>
+int LAPACKE_dlacgv(Args... args) { return 0; }
+
 #define lapack_complex_float magmaFloatComplex
 #define lapack_complex_double magmaDoubleComplex
 #define LAPACK_PREFIX LAPACKE_
@@ -36,64 +42,53 @@
 
 #define CPU_LAPACK_FUNC_DEF(NAME)               \
     template<typename T>                        \
-    struct NAME##_func;
-
-#define CPU_LAPACK_FUNC(NAME, TYPE, X)              \
-    template<>                                      \
-    struct NAME##_func<TYPE>                        \
-    {                                               \
-        template<typename... Args>                  \
-            int                                     \
-            operator() (Args... args)               \
-        { return LAPACK_NAME(X##NAME)(args...); }   \
+    struct cpu_##NAME##_func;
+
+#define CPU_LAPACK_FUNC1(NAME, TYPE, X)                             \
+    template<>                                                      \
+    struct cpu_##NAME##_func<TYPE>                                  \
+    {                                                               \
+        template<typename... Args>                                  \
+            int                                                     \
+            operator() (Args... args)                               \
+        { return LAPACK_NAME(X##NAME)(LAPACK_COL_MAJOR, args...); } \
     };
 
-CPU_LAPACK_FUNC_DEF(getrf)
-CPU_LAPACK_FUNC(getrf, float,      s)
-CPU_LAPACK_FUNC(getrf, double,     d)
-CPU_LAPACK_FUNC(getrf, magmaFloatComplex,     c)
-CPU_LAPACK_FUNC(getrf, magmaDoubleComplex,    z)
-
-CPU_LAPACK_FUNC_DEF(potrf)
-CPU_LAPACK_FUNC(potrf, float,      s)
-CPU_LAPACK_FUNC(potrf, double,     d)
-CPU_LAPACK_FUNC(potrf, magmaFloatComplex,     c)
-CPU_LAPACK_FUNC(potrf, magmaDoubleComplex,    z)
-
-CPU_LAPACK_FUNC_DEF(trtri)
-CPU_LAPACK_FUNC(trtri, float,      s)
-CPU_LAPACK_FUNC(trtri, double,     d)
-CPU_LAPACK_FUNC(trtri, magmaFloatComplex,     c)
-CPU_LAPACK_FUNC(trtri, magmaDoubleComplex,    z)
-
-CPU_LAPACK_FUNC_DEF(geqrf_work)
-CPU_LAPACK_FUNC(geqrf_work, float,      s)
-CPU_LAPACK_FUNC(geqrf_work, double,     d)
-CPU_LAPACK_FUNC(geqrf_work, magmaFloatComplex,     c)
-CPU_LAPACK_FUNC(geqrf_work, magmaDoubleComplex,    z)
-
-CPU_LAPACK_FUNC_DEF(larft)
-CPU_LAPACK_FUNC(larft, float,      s)
-CPU_LAPACK_FUNC(larft, double,     d)
-CPU_LAPACK_FUNC(larft, magmaFloatComplex,     c)
-CPU_LAPACK_FUNC(larft, magmaDoubleComplex,    z)
-
-CPU_LAPACK_FUNC_DEF(unmqr_work)
-CPU_LAPACK_FUNC(unmqr_work, float,      s)
-CPU_LAPACK_FUNC(unmqr_work, double,     d)
-CPU_LAPACK_FUNC(unmqr_work, magmaFloatComplex,     c)
-CPU_LAPACK_FUNC(unmqr_work, magmaDoubleComplex,    z)
-
-CPU_LAPACK_FUNC_DEF(ungqr_work)
-CPU_LAPACK_FUNC(ungqr_work, float,      s)
-CPU_LAPACK_FUNC(ungqr_work, double,     d)
-CPU_LAPACK_FUNC(ungqr_work, magmaFloatComplex,     c)
-CPU_LAPACK_FUNC(ungqr_work, magmaDoubleComplex,    z)
+#define CPU_LAPACK_FUNC2(NAME, TYPE, X)                             \
+    template<>                                                      \
+    struct cpu_##NAME##_func<TYPE>                                  \
+    {                                                               \
+        template<typename... Args>                                  \
+            int                                                     \
+            operator() (Args... args)                               \
+        { return LAPACK_NAME(X##NAME)(args...); }                   \
+    };
 
-CPU_LAPACK_FUNC_DEF(laswp)
-CPU_LAPACK_FUNC(laswp, float,      s)
-CPU_LAPACK_FUNC(laswp, double,     d)
-CPU_LAPACK_FUNC(laswp, magmaFloatComplex,     c)
-CPU_LAPACK_FUNC(laswp, magmaDoubleComplex,    z)
+#define CPU_LAPACK_DECL1(NAME)                          \
+    CPU_LAPACK_FUNC_DEF(NAME)                           \
+    CPU_LAPACK_FUNC1(NAME, float,      s)               \
+    CPU_LAPACK_FUNC1(NAME, double,     d)               \
+    CPU_LAPACK_FUNC1(NAME, magmaFloatComplex,     c)    \
+    CPU_LAPACK_FUNC1(NAME, magmaDoubleComplex,    z)    \
+
+#define CPU_LAPACK_DECL2(NAME)                          \
+    CPU_LAPACK_FUNC_DEF(NAME)                           \
+    CPU_LAPACK_FUNC2(NAME, float,      s)               \
+    CPU_LAPACK_FUNC2(NAME, double,     d)               \
+    CPU_LAPACK_FUNC2(NAME, magmaFloatComplex,     c)    \
+    CPU_LAPACK_FUNC2(NAME, magmaDoubleComplex,    z)    \
+
+CPU_LAPACK_DECL1(getrf)
+CPU_LAPACK_DECL1(gebrd)
+CPU_LAPACK_DECL1(potrf)
+CPU_LAPACK_DECL1(trtri)
+CPU_LAPACK_DECL1(geqrf_work)
+CPU_LAPACK_DECL1(larft)
+CPU_LAPACK_DECL1(unmqr_work)
+CPU_LAPACK_DECL1(ungqr_work)
+CPU_LAPACK_DECL1(laswp)
+CPU_LAPACK_DECL1(laset)
+CPU_LAPACK_DECL2(lacgv)
+CPU_LAPACK_DECL2(larfg)
 
 #endif
diff --git a/src/backend/opencl/magma/magma_helper.cpp b/src/backend/opencl/magma/magma_helper.cpp
index b38045a..a3a3d9e 100644
--- a/src/backend/opencl/magma/magma_helper.cpp
+++ b/src/backend/opencl/magma/magma_helper.cpp
@@ -43,6 +43,12 @@ template<typename T> T magma_scalar(double val) { return (T)val; }
 template float magma_scalar<float>(double val);
 template double magma_scalar<double>(double val);
 
+template<typename T> double  magma_real(T val) { return (double)val; }
+template double magma_real<float>(float val);
+template double magma_real<double>(double val);
+template<> double magma_real<magmaFloatComplex>(magmaFloatComplex val) { return (double)val.s[0]; }
+template<> double magma_real<magmaDoubleComplex>(magmaDoubleComplex val) { return (double)val.s[0]; }
+
 #define INSTANTIATE_CPLX_SCALAR(T)              \
     template<> T magma_scalar<T>(double val)    \
     {                                           \
@@ -152,3 +158,19 @@ magma_int_t magma_get_geqrf_nb<magmaDoubleComplex>( magma_int_t m )
     else if (m <= 4032) return 64;
     else                return 128;
 }
+
+template<typename T> magma_int_t magma_get_gebrd_nb(int num) { return 256; }
+
+template<typename T> T magma_make(double r, double i) { return (T) r; }
+template float magma_make<float>(double r, double i);
+template double magma_make<double>(double r, double i);
+template<> magmaFloatComplex magma_make<magmaFloatComplex>(double r, double i)
+{
+    magmaFloatComplex tmp = {r, i};
+    return tmp;
+}
+template<> magmaDoubleComplex magma_make<magmaDoubleComplex>(double r, double i)
+{
+    magmaDoubleComplex tmp = {r, i};
+    return tmp;
+}
diff --git a/src/backend/opencl/magma/magma_helper.h b/src/backend/opencl/magma/magma_helper.h
index 32fd065..a63c8d1 100644
--- a/src/backend/opencl/magma/magma_helper.h
+++ b/src/backend/opencl/magma/magma_helper.h
@@ -14,11 +14,14 @@ template<typename T> T magma_zero();
 template<typename T> T magma_one();
 template<typename T> T magma_neg_one();
 template<typename T> T magma_scalar(double val);
+template<typename T> double magma_real(T val);
+template<typename T> T magma_make(double r, double i);
 
 template<typename T> bool magma_is_real();
 
 template<typename T> magma_int_t magma_get_getrf_nb(int num);
 template<typename T> magma_int_t magma_get_potrf_nb(int num);
 template<typename T> magma_int_t magma_get_geqrf_nb(int num);
+template<typename T> magma_int_t magma_get_gebrd_nb(int num);
 
 #endif
diff --git a/src/backend/opencl/magma/potrf.cpp b/src/backend/opencl/magma/potrf.cpp
index f09cd22..ddc4f46 100644
--- a/src/backend/opencl/magma/potrf.cpp
+++ b/src/backend/opencl/magma/potrf.cpp
@@ -31,22 +31,22 @@
  * * Redistributions  of  source  code  must  retain  the above copyright
  *   notice,  this  list  of  conditions  and  the  following  disclaimer.
  * * Redistributions  in  binary  form must reproduce the above copyright
- *   notice,  this list of conditions and the following disclaimer in the 
+ *   notice,  this list of conditions and the following disclaimer in the
  *   documentation  and/or other materials provided with the distribution.
- * * Neither  the  name of the University of Tennessee, Knoxville nor the 
+ * * Neither  the  name of the University of Tennessee, Knoxville nor the
  *   names of its contributors may be used to endorse or promote products
  *   derived from this software without specific prior written permission.
  *
  * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
- * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
+ * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT 
+ * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
  * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
+ * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+ * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **********************************************************************/
@@ -151,10 +151,10 @@ magma_int_t magma_potrf_gpu(
 
     nb = magma_get_potrf_nb<Ty>(n);
 
-    gemm_func<Ty> gpu_gemm;
-    trsm_func<Ty> gpu_trsm;
-    herk_func<Ty> gpu_herk;
-    potrf_func<Ty> cpu_potrf;
+    gpu_gemm_func<Ty> gpu_gemm;
+    gpu_trsm_func<Ty> gpu_trsm;
+    gpu_herk_func<Ty> gpu_herk;
+    cpu_potrf_func<Ty> cpu_potrf;
 
 
     err = magma_malloc_cpu<Ty>( &work, nb*nb);
@@ -170,7 +170,7 @@ magma_int_t magma_potrf_gpu(
         // use unblocked code
         magma_getmatrix<Ty>(n, n, dA, dA_offset, ldda, work, n, queue);
 
-        cpu_potrf(LAPACK_COL_MAJOR,
+        cpu_potrf(
                   uplo == MagmaUpper ? *MagmaUpperStr : *MagmaLowerStr,
                   n, work, n);
 
@@ -185,7 +185,7 @@ magma_int_t magma_potrf_gpu(
                 // apply all previous updates to diagonal block
                 jb = std::min(nb, n-j);
                 if (j > 0) {
-                    gpu_herk(clblasColumnMajor,
+                    gpu_herk(
                              clblasUpper, transType,
                              jb, j,
                              m_one,
@@ -200,7 +200,7 @@ magma_int_t magma_potrf_gpu(
 
                 // apply all previous updates to block row right of diagonal block
                 if (j+jb < n) {
-                    gpu_gemm(clblasColumnMajor,
+                    gpu_gemm(
                              transType, clblasNoTrans,
                              jb, n-j-jb, j,
                              mz_one,
@@ -214,7 +214,7 @@ magma_int_t magma_potrf_gpu(
                 // simultaneous with above zgemm, transfer data, factor
                 // diagonal block on CPU, and test for positive definiteness
                 magma_event_sync(event);
-                *info =cpu_potrf(LAPACK_COL_MAJOR, *MagmaUpperStr, jb, work, jb);
+                *info =cpu_potrf( *MagmaUpperStr, jb, work, jb);
 
                 if (*info != 0) {
                     assert(*info > 0);
@@ -227,7 +227,7 @@ magma_int_t magma_potrf_gpu(
                 // apply diagonal block to block row right of diagonal block
                 if (j+jb < n) {
                     magma_event_sync(event);
-                    gpu_trsm(clblasColumnMajor,
+                    gpu_trsm(
                              clblasLeft, clblasUpper,
                              transType, clblasNonUnit,
                              jb, n-j-jb,
@@ -246,7 +246,7 @@ magma_int_t magma_potrf_gpu(
                 // apply all previous updates to diagonal block
                 jb = std::min(nb, n-j);
                 if (j>0) {
-                    gpu_herk(clblasColumnMajor,
+                    gpu_herk(
                              clblasLower, clblasNoTrans, jb, j,
                              m_one,
                              dA(j, 0), ldda,
@@ -260,7 +260,7 @@ magma_int_t magma_potrf_gpu(
 
                 // apply all previous updates to block column below diagonal block
                 if (j+jb < n) {
-                    gpu_gemm(clblasColumnMajor,
+                    gpu_gemm(
                              clblasNoTrans, transType,
                              n-j-jb, jb, j,
                              mz_one,
@@ -274,7 +274,7 @@ magma_int_t magma_potrf_gpu(
                 // simultaneous with above zgemm, transfer data, factor
                 // diagonal block on CPU, and test for positive definiteness
                 magma_event_sync(event);
-                *info = cpu_potrf(LAPACK_COL_MAJOR,
+                *info = cpu_potrf(
                                   *MagmaLowerStr, jb, work, jb);
                 if (*info != 0) {
                     assert(*info > 0);
@@ -286,7 +286,7 @@ magma_int_t magma_potrf_gpu(
                 // apply diagonal block to block column below diagonal
                 if (j+jb < n) {
                     magma_event_sync(event);
-                    gpu_trsm(clblasColumnMajor,
+                    gpu_trsm(
                              clblasRight, clblasLower, transType, clblasNonUnit,
                              n-j-jb, jb,
                              z_one,
diff --git a/src/backend/opencl/magma/ungqr.cpp b/src/backend/opencl/magma/ungqr.cpp
index c0dd47b..49a120a 100644
--- a/src/backend/opencl/magma/ungqr.cpp
+++ b/src/backend/opencl/magma/ungqr.cpp
@@ -137,7 +137,7 @@ magma_ungqr_gpu(
     cl_mem dW;
     magma_malloc<Ty>(&dW, (((n+31)/32)*32)*nb);
 
-    ungqr_work_func<Ty> cpu_ungqr;
+    cpu_ungqr_work_func<Ty> cpu_ungqr;
 
     // Use unblocked code for the last or only block.
     if (kk < n) {
@@ -147,7 +147,7 @@ magma_ungqr_gpu(
         magma_getmatrix<Ty>(m_kk, k_kk,
                             dA(kk, kk), ldda, panel, m_kk, queue);
 
-        cpu_ungqr(LAPACK_COL_MAJOR,
+        cpu_ungqr(
                   m_kk, n_kk, k_kk,
                   panel, m_kk,
                   &tau[kk], work, lwork);
diff --git a/src/backend/opencl/magma/unmqr.cpp b/src/backend/opencl/magma/unmqr.cpp
index b740a87..ed69e51 100644
--- a/src/backend/opencl/magma/unmqr.cpp
+++ b/src/backend/opencl/magma/unmqr.cpp
@@ -227,7 +227,7 @@ magma_unmqr_gpu(
 
     magma_malloc<Ty>(&dwork, (((n+31)/32)*32)*nb);
 
-    unmqr_work_func<Ty> cpu_unmqr;
+    cpu_unmqr_work_func<Ty> cpu_unmqr;
 
     if ( (left && (! notran)) || ( (!left) && notran ) ) {
         i1 = 0;
@@ -283,7 +283,7 @@ magma_unmqr_gpu(
         magma_getmatrix<Ty>(ma, ib, a_ref(i,  i ), ldda, hA, ma, queue);
         magma_getmatrix<Ty>(mi, ni, c_ref(ic, jc), lddc, hC, mi, queue);
 
-        *info = cpu_unmqr(LAPACK_COL_MAJOR,
+        *info = cpu_unmqr(
                           side == MagmaRight ? 'R' : 'L',
                           notran ? 'N' : (is_real ? 'T' : 'C'),
                           mi, ni, ib,
@@ -351,7 +351,7 @@ magma_unmqr_gpu(
         magma_getmatrix<Ty>(ma, ib, a_ref(i,  i ), ldda, hA, ma, queue);
         magma_getmatrix<Ty>(mi, ni, c_ref(ic, jc), lddc, hC, mi, queue);
 
-        *info = cpu_unmqr(LAPACK_COL_MAJOR,
+        *info = cpu_unmqr(
                           side == MagmaRight ? 'R' : 'L',
                           notran ? 'N' : (is_real ? 'T' : 'C'),
                           mi, ni, ib,
diff --git a/src/backend/opencl/magma/unmqr2.cpp b/src/backend/opencl/magma/unmqr2.cpp
index 6de4caf..4da4143 100644
--- a/src/backend/opencl/magma/unmqr2.cpp
+++ b/src/backend/opencl/magma/unmqr2.cpp
@@ -251,7 +251,7 @@ magma_unmqr2_gpu(
         ic = 1;
     }
 
-    larft_func<Ty> cpu_larft;
+    cpu_larft_func<Ty> cpu_larft;
 
     // set nb-1 super-diagonals to 0, and diagonal to 1.
     // This way we can copy V directly to the GPU,
@@ -265,7 +265,7 @@ magma_unmqr2_gpu(
         /* Form the triangular factor of the block reflector
            H = H(i) H(i+1) . . . H(i+ib-1) */
         i__4 = nq - i + 1;
-        cpu_larft(LAPACK_COL_MAJOR,
+        cpu_larft(
                   *MagmaForwardStr, *MagmaColumnwiseStr,
                   i__4, ib,
                   wA(i,i), ldwa, &tau[i], T, ib);
diff --git a/src/backend/opencl/solve.cpp b/src/backend/opencl/solve.cpp
index 34a357e..fa101e8 100644
--- a/src/backend/opencl/solve.cpp
+++ b/src/backend/opencl/solve.cpp
@@ -89,7 +89,7 @@ Array<T> leastSquares(const Array<T> &a, const Array<T> &b)
     int MN = std::min(M, N);
 
     Array<T> B = createEmptyArray<T>(dim4());
-    trsm_func<T> gpu_trsm;
+    gpu_trsm_func<T> gpu_trsm;
 
     cl_event event;
     cl_command_queue queue = getQueue()();
@@ -137,7 +137,7 @@ Array<T> leastSquares(const Array<T> &a, const Array<T> &b)
                               (*dA)(), A.getOffset(), A.strides()[1], 1,
                               (*dT)(), tmp.getOffset() + MN * NB, NB, 0, queue);
 
-        gpu_trsm(clblasColumnMajor,
+        gpu_trsm(
                  clblasLeft, clblasUpper,
                  clblasConjTrans, clblasNonUnit,
                  B.dims()[0], B.dims()[1],
@@ -225,14 +225,14 @@ Array<T> leastSquares(const Array<T> &a, const Array<T> &b)
         {
             Array<T> AT = transpose<T>(A, true);
             cl::Buffer* AT_buf = AT.get();
-            gpu_trsm(clblasColumnMajor,
+            gpu_trsm(
                      clblasLeft, clblasLower, clblasConjTrans, clblasNonUnit,
                      N, NRHS, scalar<T>(1),
                      (*AT_buf)(), AT.getOffset(), AT.strides()[1],
                      (*B_buf)(), B.getOffset(), B.strides()[1],
                      1, &queue, 0, nullptr, &event);
         } else {
-            gpu_trsm(clblasColumnMajor,
+            gpu_trsm(
                      clblasLeft, clblasUpper, clblasNoTrans, clblasNonUnit,
                      N, NRHS, scalar<T>(1),
                      (*A_buf)(), A.getOffset(), A.strides()[1],
@@ -248,7 +248,7 @@ Array<T> leastSquares(const Array<T> &a, const Array<T> &b)
 template<typename T>
 Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop options)
 {
-    trsm_func<T> gpu_trsm;
+    gpu_trsm_func<T> gpu_trsm;
 
     Array<T> B = copyArray<T>(b);
 
@@ -267,7 +267,7 @@ Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop o
         Array<T> AT = transpose<T>(A, true);
 
         cl::Buffer* AT_buf = AT.get();
-        gpu_trsm(clblasColumnMajor,
+        gpu_trsm(
                  clblasLeft,
                  clblasLower,
                  clblasConjTrans,
@@ -277,7 +277,7 @@ Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop o
                  (*B_buf)(), B.getOffset(), B.strides()[1],
                  1, &queue, 0, nullptr, &event);
     } else {
-        gpu_trsm(clblasColumnMajor,
+        gpu_trsm(
                  clblasLeft,
                  options & AF_MAT_LOWER ? clblasLower : clblasUpper,
                  clblasNoTrans,

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git