[arrayfire] 304/408: Adding functions from clMagma necessary for OpenCL SVD:
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:18 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit fde6380e9cc5681e434dbcb1f0e0320fdadfa5b7
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date: Thu Aug 20 10:09:00 2015 -0400
Adding functions from clMagma necessary for OpenCL SVD:
- gebrd_hybrid
- labrd_gpu
---
src/backend/opencl/magma/gebrd.cpp | 367 +++++++++++++++
src/backend/opencl/magma/labrd.cpp | 663 ++++++++++++++++++++++++++++
src/backend/opencl/magma/magma.h | 21 +
src/backend/opencl/magma/magma_cpu_lapack.h | 63 ++-
src/backend/opencl/magma/magma_helper.cpp | 4 +-
src/backend/opencl/magma/magma_helper.h | 2 +-
6 files changed, 1100 insertions(+), 20 deletions(-)
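
For illustration, a rough sketch of how magma_gebrd_hybrid is meant to be driven (hedged: this sketch is not part of the commit; queue creation, cl_mem allocation, and error handling are elided, and the workspace query follows the LWORK = -1 protocol documented inside the routine itself):

    // Hypothetical driver sketch, not from this commit. Assumes a valid
    // magma_queue_t `queue` and a cl_mem buffer `da` of at least ldda*n
    // elements; <vector> and <algorithm> are assumed included.
    template<typename Ty>
    magma_int_t bidiagonalize_sketch(magma_int_t m, magma_int_t n,
                                     Ty *a, magma_int_t lda,
                                     cl_mem da, magma_int_t ldda,
                                     magma_queue_t queue)
    {
        typedef typename af::dtype_traits<Ty>::base_type Tr;
        magma_int_t minmn = std::min(m, n), info = 0;
        std::vector<Tr> d(minmn), e(std::max(minmn - 1, (magma_int_t)1));
        std::vector<Ty> tauq(minmn), taup(minmn);

        Ty wkopt;  // workspace query: lwork == -1 only sizes the work array
        magma_gebrd_hybrid<Ty>(m, n, a, lda, da, 0, ldda,
                               d.data(), e.data(), tauq.data(), taup.data(),
                               &wkopt, -1, queue, &info, true);

        magma_int_t lwork = (magma_int_t)magma_real<Ty>(wkopt);
        std::vector<Ty> work(lwork);
        magma_gebrd_hybrid<Ty>(m, n, a, lda, da, 0, ldda,
                               d.data(), e.data(), tauq.data(), taup.data(),
                               work.data(), lwork, queue, &info, true);
        // An SVD driver would then hand d/e to a bidiagonal solver
        // (e.g. LAPACK bdsqr); that step is outside this commit.
        return info;
    }
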
diff --git a/src/backend/opencl/magma/gebrd.cpp b/src/backend/opencl/magma/gebrd.cpp
new file mode 100644
index 0000000..e4df977
--- /dev/null
+++ b/src/backend/opencl/magma/gebrd.cpp
@@ -0,0 +1,367 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+/***********************************************************************
+ * Based on MAGMA library http://icl.cs.utk.edu/magma/
+ * Below is the original copyright.
+ *
+ * -- MAGMA (version 0.1) --
+ * Univ. of Tennessee, Knoxville
+ * Univ. of California, Berkeley
+ * Univ. of Colorado, Denver
+ * @date
+ *
+ * @precisions normal z -> s d c
+ *
+ * -- Innovative Computing Laboratory
+ * -- Electrical Engineering and Computer Science Department
+ * -- University of Tennessee
+ * -- (C) Copyright 2009-2013
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the University of Tennessee, Knoxville nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **********************************************************************/
+
+#include "magma.h"
+#include "magma_blas.h"
+#include "magma_data.h"
+#include "magma_cpu_lapack.h"
+#include "magma_cpu_blas.h"
+#include "magma_helper.h"
+#include "magma_sync.h"
+#include <traits.hpp>
+#include <types.hpp>
+
+#include <algorithm>
+
+// produces pointer and offset as two args to magmaBLAS routines
+#define dA(i,j) da, ((da_offset) + (i) + (j)*ldda)
+// produces pointer as single arg to BLAS routines
+#define A(i,j) &a[ (i) + (j)*lda ]
+
+template<typename Ty>
+magma_int_t
+magma_gebrd_hybrid(
+ magma_int_t m, magma_int_t n,
+ Ty *a, magma_int_t lda,
+ cl_mem da, size_t da_offset, magma_int_t ldda,
+ void *_d, void *_e,
+ Ty *tauq, Ty *taup,
+ Ty *work, magma_int_t lwork,
+ magma_queue_t queue,
+ magma_int_t *info,
+ bool copy)
+{
+/* -- MAGMA (version 1.1) --
+ Univ. of Tennessee, Knoxville
+ Univ. of California, Berkeley
+ Univ. of Colorado, Denver
+ @date
+
+ Purpose
+ =======
+ ZGEBRD reduces a general complex M-by-N matrix A to upper or lower
+ bidiagonal form B by an orthogonal transformation: Q**H * A * P = B.
+
+ If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+ Arguments
+ =========
+ M (input) INTEGER
+ The number of rows in the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns in the matrix A. N >= 0.
+
+ A (input/output) COMPLEX_16 array, dimension (LDA,N)
+ On entry, the M-by-N general matrix to be reduced.
+ On exit,
+ if m >= n, the diagonal and the first superdiagonal are
+ overwritten with the upper bidiagonal matrix B; the
+ elements below the diagonal, with the array TAUQ, represent
+ the orthogonal matrix Q as a product of elementary
+ reflectors, and the elements above the first superdiagonal,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors;
+ if m < n, the diagonal and the first subdiagonal are
+ overwritten with the lower bidiagonal matrix B; the
+ elements below the first subdiagonal, with the array TAUQ,
+ represent the orthogonal matrix Q as a product of
+ elementary reflectors, and the elements above the diagonal,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) double precision array, dimension (min(M,N))
+ The diagonal elements of the bidiagonal matrix B:
+ D(i) = A(i,i).
+
+ E (output) double precision array, dimension (min(M,N)-1)
+ The off-diagonal elements of the bidiagonal matrix B:
+ if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
+ if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.
+
+ TAUQ (output) COMPLEX_16 array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix Q. See Further Details.
+
+ TAUP (output) COMPLEX_16 array, dimension (min(M,N))
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix P. See Further Details.
+
+ WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
+ On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
+
+ LWORK (input) INTEGER
+ The length of the array WORK. LWORK >= (M+N)*NB, where NB
+ is the optimal blocksize.
+
+ If LWORK = -1, then a workspace query is assumed; the routine
+ only calculates the optimal size of the WORK array, returns
+ this value as the first entry of the WORK array, and no error
+ message related to LWORK is issued by XERBLA.
+
+ INFO (output) INTEGER
+ = 0: successful exit
+ < 0: if INFO = -i, the i-th argument had an illegal value.
+
+ Further Details
+ ===============
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ If m >= n,
+ Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1)
+ Each H(i) and G(i) has the form:
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+ where tauq and taup are complex scalars, and v and u are complex vectors;
+ v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i);
+ u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n);
+ tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n,
+ Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m)
+ Each H(i) and G(i) has the form:
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+ where tauq and taup are complex scalars, and v and u are complex vectors;
+ v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i);
+ u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n);
+ tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The contents of A on exit are illustrated by the following examples:
+
+    m = 6 and n = 5 (m > n):           m = 5 and n = 6 (m < n):
+
+      (  d   e   u1  u1  u1 )           (  d   u1  u1  u1  u1  u1 )
+      (  v1  d   e   u2  u2 )           (  e   d   u2  u2  u2  u2 )
+      (  v1  v2  d   e   u3 )           (  v1  e   d   u3  u3  u3 )
+      (  v1  v2  v3  d   e  )           (  v1  v2  e   d   u4  u4 )
+      (  v1  v2  v3  v4  d  )           (  v1  v2  v3  e   d   u5 )
+      (  v1  v2  v3  v4  v5 )
+
+ where d and e denote diagonal and off-diagonal elements of B, vi
+ denotes an element of the vector defining H(i), and ui an element of
+ the vector defining G(i).
+ ===================================================================== */
+
+ typedef typename af::dtype_traits<Ty>::base_type Tr;
+
+ Tr *d = (Tr *)_d;
+ Tr *e = (Tr *)_e;
+
+
+ Ty c_neg_one = magma_neg_one<Ty>();
+ Ty c_one = magma_one<Ty>();
+ cl_mem dwork;
+
+ magma_int_t ncol, nrow, jmax, nb;
+
+ magma_int_t i, j, nx;
+ //magma_int_t iinfo;
+
+ magma_int_t minmn;
+ magma_int_t ldwrkx, ldwrky, lwkopt;
+ magma_int_t lquery;
+
+ nb = magma_get_gebrd_nb<Ty>(n);
+
+ lwkopt = (m + n) * nb;
+ work[0] = magma_make<Ty>(lwkopt, 0.);
+ lquery = (lwork == -1);
+
+ /* Check arguments */
+ *info = 0;
+ if (m < 0) {
+ *info = -1;
+ } else if (n < 0) {
+ *info = -2;
+ } else if (lda < std::max(1,m)) {
+ *info = -4;
+ } else if (lwork < lwkopt && (! lquery)) {
+ *info = -10;
+ }
+ if (*info < 0) {
+ //magma_xerbla(__func__, -(*info));
+ return *info;
+ }
+ else if (lquery)
+ return *info;
+
+ /* Quick return if possible */
+ minmn = std::min(m,n);
+ if (minmn == 0) {
+ work[0] = c_one;
+ return *info;
+ }
+
+ if (MAGMA_SUCCESS != magma_malloc<Ty>(&dwork, (m + n)*nb)) {
+ *info = MAGMA_ERR_DEVICE_ALLOC;
+ return *info;
+ }
+ size_t dwork_offset = 0;
+
+ cl_event event = 0;
+
+ ldwrkx = m;
+ ldwrky = n;
+
+ /* Set the block/unblock crossover point NX. */
+ nx = nb;
+ assert(nx <= nb);
+
+ /* Copy the matrix to the GPU */
+ if (copy && minmn - nx >= 1) {
+ magma_setmatrix<Ty>(m, n, a, lda, da, da_offset, ldda, queue);
+ }
+
+ gpu_gemm_func<Ty> gpu_blas_gemm;
+ cpu_gebrd_work_func<Ty> cpu_lapack_gebrd_work;
+
+ for (i=0; i< (minmn - nx); i += nb) {
+ /* Reduce rows and columns i:i+nb-1 to bidiagonal form and return
+ the matrices X and Y which are needed to update the unreduced
+ part of the matrix */
+ nrow = m - i;
+ ncol = n - i;
+
+ /* Get the current panel (no need for the 1st iteration) */
+ if (i > 0) {
+ magma_getmatrix<Ty>(nrow, nb, dA(i, i), ldda, A(i, i), lda, queue);
+ magma_getmatrix<Ty>(nb, ncol - nb,
+ dA(i, i+nb), ldda,
+ A(i, i+nb), lda, queue);
+ }
+
+ magma_labrd_gpu<Ty>(nrow, ncol, nb,
+ A(i, i), lda, dA(i, i), ldda,
+ d+i, e+i, tauq+i, taup+i,
+ work, ldwrkx, dwork, dwork_offset, ldwrkx, // x, dx
+ work+(ldwrkx*nb), ldwrky, dwork, dwork_offset+(ldwrkx*nb), ldwrky, // y, dy
+ queue);
+
+ /* Update the trailing submatrix A(i+nb:m,i+nb:n), using an update
+ of the form A := A - V*Y' - X*U' */
+ nrow = m - i - nb;
+ ncol = n - i - nb;
+
+ // Send X and Y back to the GPU
+ magma_setmatrix<Ty>(nrow, nb, work+nb, ldwrkx, dwork, dwork_offset+nb, ldwrkx, queue);
+ magma_setmatrix<Ty>(ncol, nb,
+ work + (ldwrkx+1)*nb, ldwrky,
+ dwork, dwork_offset + (ldwrkx+1)*nb, ldwrky, queue);
+
+ gpu_blas_gemm(clblasNoTrans, clblasConjTrans,
+ nrow, ncol, nb,
+ c_neg_one, dA(i+nb, i ), ldda,
+ dwork, dwork_offset+(ldwrkx+1)*nb, ldwrky,
+ c_one, dA(i+nb, i+nb), ldda,
+ 1, &queue, 0, nullptr, &event);
+
+ gpu_blas_gemm(clblasNoTrans, clblasNoTrans,
+ nrow, ncol, nb,
+ c_neg_one, dwork, dwork_offset+nb, ldwrkx,
+ dA(i, i+nb), ldda,
+ c_one, dA(i+nb, i+nb), ldda,
+ 1, &queue, 0, nullptr, &event);
+
+ /* Copy diagonal and off-diagonal elements of B back into A */
+ if (m >= n) {
+ jmax = i + nb;
+ for (j = i; j < jmax; ++j) {
+ *A(j, j ) = magma_make<Ty>(d[j], 0.);
+ *A(j, j+1) = magma_make<Ty>(e[j], 0.);
+ }
+ } else {
+ jmax = i + nb;
+ for (j = i; j < jmax; ++j) {
+ *A(j, j) = magma_make<Ty>(d[j], 0.);
+ *A(j+1, j) = magma_make<Ty>(e[j], 0.);
+ }
+ }
+ }
+
+ /* Use unblocked code to reduce the remainder of the matrix */
+ nrow = m - i;
+ ncol = n - i;
+
+ if (0 < minmn - nx) {
+ magma_getmatrix<Ty>(nrow, ncol, dA(i, i), ldda, A(i, i), lda, queue);
+ }
+
+ *info = cpu_lapack_gebrd_work(nrow, ncol,
+ A(i, i), lda, d+i, e+i,
+ tauq+i, taup+i, work, lwork);
+ work[0] = magma_make<Ty>(lwkopt, 0.);
+
+ magma_free(dwork);
+ return *info;
+} /* magma_gebrd_hybrid */
+
+#define INSTANTIATE(Ty) \
+ template magma_int_t \
+ magma_gebrd_hybrid<Ty>( \
+ magma_int_t m, magma_int_t n, \
+ Ty *a, magma_int_t lda, \
+ cl_mem da, size_t da_offset, magma_int_t ldda, \
+ void *_d, void *_e, \
+ Ty *tauq, Ty *taup, \
+ Ty *work, magma_int_t lwork, \
+ magma_queue_t queue, \
+ magma_int_t *info, bool copy); \
+
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(magmaFloatComplex)
+INSTANTIATE(magmaDoubleComplex)
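
The "Further Details" section above represents Q and P as products of elementary reflectors H(i) = I - tauq * v * v'. As a minimal, self-contained illustration of that form (a real-valued, hypothetical helper, independent of the MAGMA sources):

    #include <cstddef>
    #include <vector>

    // Apply H = I - tau * v * v^T to x in place: x <- x - tau * (v . x) * v.
    void apply_reflector(const std::vector<double> &v, double tau,
                         std::vector<double> &x)
    {
        double dot = 0.0;
        for (std::size_t k = 0; k < v.size(); ++k) dot += v[k] * x[k];
        for (std::size_t k = 0; k < v.size(); ++k) x[k] -= tau * dot * v[k];
    }

With v(1:i-1) = 0 and v(i) = 1 as described above, applying H(i) leaves the first i-1 entries of x untouched, which is what lets the reflectors be stored compactly in the zeroed-out part of A.
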
diff --git a/src/backend/opencl/magma/labrd.cpp b/src/backend/opencl/magma/labrd.cpp
new file mode 100644
index 0000000..ee7f120
--- /dev/null
+++ b/src/backend/opencl/magma/labrd.cpp
@@ -0,0 +1,663 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+/***********************************************************************
+ * Based on MAGMA library http://icl.cs.utk.edu/magma/
+ * Below is the original copyright.
+ *
+ * -- MAGMA (version 0.1) --
+ * Univ. of Tennessee, Knoxville
+ * Univ. of California, Berkeley
+ * Univ. of Colorado, Denver
+ * @date
+ *
+ * @precisions normal z -> s d c
+ *
+ * -- Innovative Computing Laboratory
+ * -- Electrical Engineering and Computer Science Department
+ * -- University of Tennessee
+ * -- (C) Copyright 2009-2013
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the University of Tennessee, Knoxville nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **********************************************************************/
+
+
+#include "magma.h"
+#include "magma_blas.h"
+#include "magma_data.h"
+#include "magma_cpu_lapack.h"
+#include "magma_cpu_blas.h"
+#include "magma_helper.h"
+#include "magma_sync.h"
+#include <traits.hpp>
+#include <types.hpp>
+
+#include <algorithm>
+
+template<typename Ty> magma_int_t
+magma_labrd_gpu(
+ magma_int_t m, magma_int_t n, magma_int_t nb,
+ Ty *a, magma_int_t lda,
+ cl_mem da, size_t da_offset, magma_int_t ldda,
+ void *_d, void *_e, Ty *tauq, Ty *taup,
+ Ty *x, magma_int_t ldx,
+ cl_mem dx, size_t dx_offset, magma_int_t lddx,
+ Ty *y, magma_int_t ldy,
+ cl_mem dy, size_t dy_offset, magma_int_t lddy,
+ magma_queue_t queue)
+{
+/* -- MAGMA (version 1.1) --
+ Univ. of Tennessee, Knoxville
+ Univ. of California, Berkeley
+ Univ. of Colorado, Denver
+ @date
+
+ Purpose
+ =======
+ ZLABRD reduces the first NB rows and columns of a complex general
+ m by n matrix A to upper or lower bidiagonal form by an orthogonal
+ transformation Q' * A * P, and returns the matrices X and Y which
+ are needed to apply the transformation to the unreduced part of A.
+
+ If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+ bidiagonal form.
+
+ This is an auxiliary routine called by ZGEBRD.
+
+ Arguments
+ =========
+ M (input) INTEGER
+ The number of rows in the matrix A.
+
+ N (input) INTEGER
+ The number of columns in the matrix A.
+
+ NB (input) INTEGER
+ The number of leading rows and columns of A to be reduced.
+
+ A (input/output) COMPLEX_16 array, dimension (LDA,N)
+ On entry, the m by n general matrix to be reduced.
+ On exit, the first NB rows and columns of the matrix are
+ overwritten; the rest of the array is unchanged.
+ If m >= n, elements on and below the diagonal in the first NB
+ columns, with the array TAUQ, represent the orthogonal
+ matrix Q as a product of elementary reflectors; and
+ elements above the diagonal in the first NB rows, with the
+ array TAUP, represent the orthogonal matrix P as a product
+ of elementary reflectors.
+ If m < n, elements below the diagonal in the first NB
+ columns, with the array TAUQ, represent the orthogonal
+ matrix Q as a product of elementary reflectors, and
+ elements on and above the diagonal in the first NB rows,
+ with the array TAUP, represent the orthogonal matrix P as
+ a product of elementary reflectors.
+ See Further Details.
+
+ LDA (input) INTEGER
+ The leading dimension of the array A. LDA >= max(1,M).
+
+ D (output) double precision array, dimension (NB)
+ The diagonal elements of the first NB rows and columns of
+ the reduced matrix. D(i) = A(i,i).
+
+ E (output) double precision array, dimension (NB)
+ The off-diagonal elements of the first NB rows and columns of
+ the reduced matrix.
+
+ TAUQ (output) COMPLEX_16 array, dimension (NB)
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix Q. See Further Details.
+
+ TAUP (output) COMPLEX_16 array, dimension (NB)
+ The scalar factors of the elementary reflectors which
+ represent the orthogonal matrix P. See Further Details.
+
+ X (output) COMPLEX_16 array, dimension (LDX,NB)
+ The m-by-nb matrix X required to update the unreduced part
+ of A.
+
+ LDX (input) INTEGER
+ The leading dimension of the array X. LDX >= M.
+
+ Y (output) COMPLEX_16 array, dimension (LDY,NB)
+ The n-by-nb matrix Y required to update the unreduced part
+ of A.
+
+ LDY (input) INTEGER
+ The leading dimension of the array Y. LDY >= N.
+
+ Further Details
+ ===============
+ The matrices Q and P are represented as products of elementary
+ reflectors:
+
+ Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb)
+
+ Each H(i) and G(i) has the form:
+
+ H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
+
+ where tauq and taup are complex scalars, and v and u are complex vectors.
+
+ If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
+ A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
+ A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
+ A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
+ A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+ The elements of the vectors v and u together form the m-by-nb matrix
+ V and the nb-by-n matrix U' which are needed, with X and Y, to apply
+ the transformation to the unreduced part of the matrix, using a block
+ update of the form: A := A - V*Y' - X*U'.
+
+ The contents of A on exit are illustrated by the following examples
+ with nb = 2:
+
+    m = 6 and n = 5 (m > n):           m = 5 and n = 6 (m < n):
+
+      (  1   1   u1  u1  u1 )           (  1   u1  u1  u1  u1  u1 )
+      (  v1  1   1   u2  u2 )           (  1   1   u2  u2  u2  u2 )
+      (  v1  v2  a   a   a  )           (  v1  1   a   a   a   a  )
+      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
+      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
+      (  v1  v2  a   a   a  )
+
+ where a denotes an element of the original matrix which is unchanged,
+ vi denotes an element of the vector defining H(i), and ui an element
+ of the vector defining G(i).
+ ===================================================================== */
+
+ typedef typename af::dtype_traits<Ty>::base_type Tr;
+
+ const bool is_cplx = opencl::is_complex<Ty>::value;
+
+ Tr *d = (Tr *)_d;
+ Tr *e = (Tr *)_e;
+
+ Ty c_neg_one = magma_neg_one<Ty>();
+ Ty c_one = magma_one<Ty>();
+ Ty c_zero = magma_zero<Ty>();
+ magma_int_t c__1 = 1;
+
+ magma_int_t a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__2, i__3;
+ magma_int_t i__;
+ Ty alpha;
+
+ a_dim1 = lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --d;
+ --e;
+ --tauq;
+ --taup;
+
+ x_dim1 = ldx;
+ x_offset = 1 + x_dim1;
+ x -= x_offset;
+ dx_offset -= 1 + lddx;
+
+ y_dim1 = ldy;
+ y_offset = 1 + y_dim1;
+ y -= y_offset;
+ dy_offset -= 1 + lddy;
+
+ /* Quick return if possible */
+ if (m <= 0 || n <= 0) {
+ return 0;
+ }
+
+ Ty *f;
+ magma_malloc_cpu<Ty>(&f, std::max(n,m));
+ assert(f != NULL); // TODO return error, or allocate outside zlabrd
+
+ magma_event_t event = NULL;
+
+ gpu_gemv_func<Ty> gpu_blas_gemv;
+ cpu_gemv_func<Ty> cpu_blas_gemv;
+ cpu_scal_func<Ty> cpu_blas_scal;
+ cpu_axpy_func<Ty> cpu_blas_axpy;
+ cpu_larfg_func<Ty> cpu_lapack_larfg;
+ cpu_lacgv_func<Ty> cpu_lapack_lacgv;
+
+ CBLAS_TRANSPOSE CblasTransParam = is_cplx ? CblasConjTrans : CblasTrans;
+
+ if (m >= n) {
+ /* Reduce to upper bidiagonal form */
+ for (i__ = 1; i__ <= nb; ++i__) {
+ /* Update A(i:m,i) */
+ i__2 = m - i__ + 1;
+ i__3 = i__ - 1;
+
+ if (is_cplx) {
+ cpu_lapack_lacgv(i__3, &y[i__+y_dim1], ldy);
+ }
+
+ cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one), &a[i__ + a_dim1], lda,
+ &y[i__+y_dim1], ldy, cblas_scalar(&c_one), &a[i__ + i__ * a_dim1], c__1);
+
+ if (is_cplx) {
+ cpu_lapack_lacgv(i__3, &y[i__+y_dim1], ldy);
+ }
+
+ cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one), &x[i__ + x_dim1], ldx,
+ &a[i__*a_dim1+1], c__1, cblas_scalar(&c_one), &a[i__+i__*a_dim1], c__1);
+
+ /* Generate reflection Q(i) to annihilate A(i+1:m,i) */
+ alpha = a[i__ + i__ * a_dim1];
+ i__2 = m - i__ + 1;
+ i__3 = i__ + 1;
+ cpu_lapack_larfg(i__2, &alpha,
+ &a[std::min(i__3,m) + i__ * a_dim1], c__1, &tauq[i__]);
+ d[i__] = magma_real<Ty>(alpha);
+ if (i__ < n) {
+ a[i__ + i__ * a_dim1] = c_one;
+
+ /* Compute Y(i+1:n,i) */
+ i__2 = m - i__ + 1;
+ i__3 = n - i__;
+
+ // 1. Send the block reflector A(i+1:m,i) to the GPU ------
+ magma_setvector<Ty>(i__2,
+ a + i__ + i__ * a_dim1, 1,
+ da, da_offset + (i__-1)+(i__-1)* (ldda), 1,
+ queue);
+ // 2. Multiply ---------------------------------------------
+ gpu_blas_gemv(clblasConjTrans, i__2, i__3, c_one,
+ da, da_offset + (i__-1) + ((i__-1) + 1) * (ldda), ldda,
+ da, da_offset + (i__-1) + (i__-1) * (ldda), c__1, c_zero,
+ dy, dy_offset + i__ + 1 + i__ * y_dim1, c__1,
+ 1, &queue, 0, nullptr, &event);
+
+ // 3. Put the result back ----------------------------------
+ magma_getmatrix_async<Ty>(i__3, 1,
+ dy, dy_offset + i__+1+i__*y_dim1, y_dim1,
+ y+i__+1+i__*y_dim1, y_dim1,
+ queue, &event);
+ i__2 = m - i__ + 1;
+ i__3 = i__ - 1;
+ cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_one), &a[i__ + a_dim1],
+ lda, &a[i__ + i__ * a_dim1], c__1, cblas_scalar(&c_zero),
+ &y[i__ * y_dim1 + 1], c__1);
+
+ i__2 = n - i__;
+ i__3 = i__ - 1;
+ cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one), &y[i__ + 1 +y_dim1], ldy,
+ &y[i__ * y_dim1 + 1], c__1,
+ cblas_scalar(&c_zero), f, c__1);
+ i__2 = m - i__ + 1;
+ i__3 = i__ - 1;
+ cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_one), &x[i__ + x_dim1],
+ ldx, &a[i__ + i__ * a_dim1], c__1, cblas_scalar(&c_zero),
+ &y[i__ * y_dim1 + 1], c__1);
+
+ // 4. Synch to make sure the result is back ----------------
+ magma_event_sync(event);
+
+ if (i__3 != 0){
+ i__2 = n - i__;
+ cpu_blas_axpy(i__2, cblas_scalar(&c_one), f,c__1, &y[i__+1+i__*y_dim1],c__1);
+ }
+
+ i__2 = i__ - 1;
+ i__3 = n - i__;
+ cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_neg_one),
+ &a[(i__ + 1) * a_dim1 + 1], lda, &y[i__ * y_dim1 + 1], c__1, cblas_scalar(&c_one),
+ &y[i__ + 1 + i__ * y_dim1], c__1);
+ i__2 = n - i__;
+ cpu_blas_scal(i__2, cblas_scalar(&tauq[i__]), &y[i__ + 1 + i__ * y_dim1], c__1);
+
+ /* Update A(i,i+1:n) */
+ i__2 = n - i__;
+ if (is_cplx) {
+ cpu_lapack_lacgv(i__2, &a[i__+(i__+1)*a_dim1], lda);
+ cpu_lapack_lacgv(i__, &a[i__+a_dim1], lda);
+ }
+
+ cpu_blas_gemv(CblasNoTrans, i__2, i__, cblas_scalar(&c_neg_one),
+ &y[i__ + 1 + y_dim1], ldy, &a[i__ + a_dim1], lda,
+ cblas_scalar(&c_one), &a[i__ + (i__ + 1) * a_dim1], lda);
+ i__2 = i__ - 1;
+ i__3 = n - i__;
+
+ if (is_cplx) {
+ cpu_lapack_lacgv(i__, &a[i__+a_dim1], lda);
+ cpu_lapack_lacgv(i__2, &x[i__+x_dim1], ldx);
+ }
+
+ cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_neg_one), &a[(i__ + 1) *
+ a_dim1 + 1], lda, &x[i__ + x_dim1], ldx, cblas_scalar(&c_one), &a[
+ i__ + (i__ + 1) * a_dim1], lda);
+ if (is_cplx) {
+ cpu_lapack_lacgv(i__2, &x[i__+x_dim1], ldx);
+ }
+
+ /* Generate reflection P(i) to annihilate A(i,i+2:n) */
+ i__2 = n - i__;
+ /* Computing MIN */
+ i__3 = i__ + 2;
+ alpha = a[i__ + (i__ + 1) * a_dim1];
+ cpu_lapack_larfg(i__2, &alpha, &a[i__ + std::min(
+ i__3,n) * a_dim1], lda, &taup[i__]);
+ e[i__] = magma_real<Ty>(alpha);
+ a[i__ + (i__ + 1) * a_dim1] = c_one;
+
+ /* Compute X(i+1:m,i) */
+ i__2 = m - i__;
+ i__3 = n - i__;
+ // 1. Send the block reflector A(i,i+1:n) to the GPU ------
+ magma_setvector<Ty>(i__3,
+ a + i__ + (i__ +1)* a_dim1, lda,
+ da, da_offset + (i__-1)+((i__-1)+1)*(ldda), ldda,
+ queue);
+ // 2. Multiply ---------------------------------------------
+ //magma_zcopy(i__3, da+(i__-1)+((i__-1)+1)*(ldda), ldda,
+ // dy + 1 + lddy, 1);
+ gpu_blas_gemv(clblasNoTrans, i__2, i__3, c_one,
+ da, da_offset + (i__-1)+1+ ((i__-1)+1) * (ldda), ldda,
+ da, da_offset + (i__-1) + ((i__-1)+1) * (ldda), ldda,
+ //dy + 1 + lddy, 1,
+ c_zero, dx, dx_offset + i__ + 1 + i__ * x_dim1, c__1,
+ 1, &queue, 0, nullptr, &event);
+
+ // 3. Put the result back ----------------------------------
+ magma_getmatrix_async<Ty>(i__2, 1,
+ dx, dx_offset + i__+1+i__*x_dim1, x_dim1,
+ x+i__+1+i__*x_dim1, x_dim1,
+ queue, &event);
+
+ i__2 = n - i__;
+ cpu_blas_gemv(CblasTransParam, i__2, i__, cblas_scalar(&c_one), &y[i__ + 1 + y_dim1],
+ ldy, &a[i__ + (i__ + 1) * a_dim1], lda, cblas_scalar(&c_zero), &x[
+ i__ * x_dim1 + 1], c__1);
+
+ i__2 = m - i__;
+ cpu_blas_gemv(CblasNoTrans, i__2, i__, cblas_scalar(&c_neg_one), &a[i__ + 1 + a_dim1], lda,
+ &x[i__ * x_dim1 + 1], c__1, cblas_scalar(&c_zero), f, c__1);
+ i__2 = i__ - 1;
+ i__3 = n - i__;
+ cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_one), &a[(i__ + 1) * a_dim1 + 1],
+ lda, &a[i__ + (i__ + 1) * a_dim1], lda,
+ cblas_scalar(&c_zero), &x[i__ * x_dim1 + 1], c__1);
+
+ // 4. Synch to make sure the result is back ----------------
+ magma_event_sync(event);
+
+ if (i__!=0){
+ i__2 = m - i__;
+ cpu_blas_axpy(i__2, cblas_scalar(&c_one), f,c__1, &x[i__+1+i__*x_dim1],c__1);
+ }
+
+
+ i__2 = m - i__;
+ i__3 = i__ - 1;
+ cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one), &x[i__ + 1 +
+ x_dim1], ldx, &x[i__ * x_dim1 + 1], c__1, cblas_scalar(&c_one), &x[
+ i__ + 1 + i__ * x_dim1], c__1);
+ i__2 = m - i__;
+ cpu_blas_scal(i__2, cblas_scalar(&taup[i__]), &x[i__ + 1 + i__ * x_dim1], c__1);
+
+ if (is_cplx) {
+ i__2 = n - i__;
+ cpu_lapack_lacgv(i__2, &a[i__+(i__+1)*a_dim1], lda);
+ // 4. Send the block reflector A(i+1:m,i) to the GPU after ZLACGV()
+ magma_setvector<Ty>(i__2,
+ a + i__ + (i__ +1)* a_dim1, lda,
+ da, da_offset + (i__-1)+((i__-1)+1)*(ldda), ldda,
+ queue);
+ }
+ }
+ }
+ }
+ else {
+ /* Reduce to lower bidiagonal form */
+ for (i__ = 1; i__ <= nb; ++i__) {
+
+ /* Update A(i,i:n) */
+ i__2 = n - i__ + 1;
+ i__3 = i__ - 1;
+ if (is_cplx) {
+ cpu_lapack_lacgv(i__2, &a[i__ + i__ * a_dim1], lda);
+ cpu_lapack_lacgv(i__3, &a[i__ + a_dim1], lda);
+ }
+ cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one), &y[i__ + y_dim1], ldy,
+ &a[i__ + a_dim1], lda, cblas_scalar(&c_one), &a[i__ + i__ * a_dim1], lda);
+ i__2 = i__ - 1;
+ if (is_cplx) {
+ cpu_lapack_lacgv(i__3, &a[i__ + a_dim1], lda);
+ cpu_lapack_lacgv(i__3, &x[i__ + x_dim1], ldx);
+ }
+ i__3 = n - i__ + 1;
+ cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_neg_one), &a[i__ * a_dim1 + 1],
+ lda, &x[i__ + x_dim1], ldx, cblas_scalar(&c_one), &a[i__ + i__ * a_dim1], lda);
+ if (is_cplx) {
+ cpu_lapack_lacgv(i__2, &x[i__ + x_dim1], ldx);
+ }
+
+ /* Generate reflection P(i) to annihilate A(i,i+1:n) */
+ i__2 = n - i__ + 1;
+ /* Computing MIN */
+ i__3 = i__ + 1;
+ alpha = a[i__ + i__ * a_dim1];
+ cpu_lapack_larfg(i__2, &alpha,
+ &a[i__ + std::min(i__3,n) * a_dim1], lda, &taup[i__]);
+ d[i__] = magma_real<Ty>(alpha);
+ if (i__ < m) {
+ a[i__ + i__ * a_dim1] = c_one;
+
+ /* Compute X(i+1:m,i) */
+ i__2 = m - i__;
+ i__3 = n - i__ + 1;
+
+ // 1. Send the block reflector A(i,i+1:n) to the GPU ------
+ magma_setvector<Ty>(i__3,
+ a + i__ + i__ * a_dim1, lda,
+ da, da_offset + (i__-1)+(i__-1)* (ldda), ldda,
+ queue);
+
+ // 2. Multiply ---------------------------------------------
+ //magma_zcopy(i__3, da+(i__-1)+(i__-1)*(ldda), ldda,
+ // dy + 1 + lddy, 1);
+ gpu_blas_gemv(clblasNoTrans, i__2, i__3, c_one,
+ da, da_offset + (i__-1)+1 + (i__-1) * ldda, ldda,
+ da, da_offset + (i__-1) + (i__-1) * ldda, ldda,
+ // dy + 1 + lddy, 1,
+ c_zero,
+ dx, dx_offset + i__ + 1 + i__ * x_dim1, c__1,
+ 1, &queue, 0, nullptr, &event);
+
+
+ // 3. Put the result back ----------------------------------
+ magma_getmatrix_async<Ty>(i__2, 1,
+ dx, dx_offset + i__+1+i__*x_dim1, x_dim1,
+ x+i__+1+i__*x_dim1, x_dim1,
+ queue, &event);
+
+ i__2 = n - i__ + 1;
+ i__3 = i__ - 1;
+ cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_one), &y[i__ + y_dim1],
+ ldy, &a[i__ + i__ * a_dim1], lda, cblas_scalar(&c_zero),
+ &x[i__ * x_dim1 + 1], c__1);
+ i__2 = m - i__;
+ i__3 = i__ - 1;
+ cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one),
+ &a[i__ + 1 + a_dim1], lda, &x[i__ * x_dim1 + 1], c__1, cblas_scalar(&c_zero),
+ f, c__1);
+
+ i__2 = i__ - 1;
+ i__3 = n - i__ + 1;
+ cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_one),
+ &a[i__ * a_dim1 + 1], lda, &a[i__ + i__ * a_dim1], lda, cblas_scalar(&c_zero),
+ &x[i__ * x_dim1 + 1], c__1);
+
+ // 4. Synch to make sure the result is back ----------------
+ magma_event_sync(event);
+ if (i__2 != 0){
+ i__3 = m - i__;
+ cpu_blas_axpy(i__3, cblas_scalar(&c_one), f,c__1, &x[i__+1+i__*x_dim1],c__1);
+ }
+
+ i__2 = m - i__;
+ i__3 = i__ - 1;
+ cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one),
+ &x[i__ + 1 + x_dim1], ldx, &x[i__ * x_dim1 + 1], c__1, cblas_scalar(&c_one),
+ &x[i__ + 1 + i__ * x_dim1], c__1);
+ i__2 = m - i__;
+ cpu_blas_scal(i__2, cblas_scalar(&taup[i__]), &x[i__ + 1 + i__ * x_dim1], c__1);
+ i__2 = n - i__ + 1;
+
+ if (is_cplx) {
+ cpu_lapack_lacgv(i__2, &a[i__ + i__ * a_dim1], lda);
+ magma_setvector<Ty>(i__2,
+ a + i__ + (i__ )* a_dim1, lda,
+ da, da_offset + (i__-1)+ (i__-1)*(ldda), ldda,
+ queue);
+ }
+
+ /* Update A(i+1:m,i) */
+ i__2 = m - i__;
+ i__3 = i__ - 1;
+
+ if (is_cplx) {
+ cpu_lapack_lacgv(i__3, &y[i__ + y_dim1], ldy);
+ }
+
+ cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one),
+ &a[i__ + 1 + a_dim1], lda, &y[i__ + y_dim1], ldy, cblas_scalar(&c_one),
+ &a[i__ + 1 + i__ * a_dim1], c__1);
+ i__2 = m - i__;
+ if (is_cplx) {
+ cpu_lapack_lacgv(i__3, &y[i__ + y_dim1], ldy);
+ }
+ cpu_blas_gemv(CblasNoTrans, i__2, i__, cblas_scalar(&c_neg_one),
+ &x[i__ + 1 + x_dim1], ldx, &a[i__ * a_dim1 + 1], c__1, cblas_scalar(&c_one),
+ &a[i__ + 1 + i__ * a_dim1], c__1);
+
+ /* Generate reflection Q(i) to annihilate A(i+2:m,i) */
+ i__2 = m - i__;
+ i__3 = i__ + 2;
+ alpha = a[i__ + 1 + i__ * a_dim1];
+ cpu_lapack_larfg(i__2, &alpha,
+ &a[std::min(i__3,m) + i__ * a_dim1], c__1, &tauq[i__]);
+ e[i__] = magma_real<Ty>(alpha);
+ a[i__ + 1 + i__ * a_dim1] = c_one;
+
+ /* Compute Y(i+1:n,i) */
+ i__2 = m - i__;
+ i__3 = n - i__;
+
+ // 1. Send the block reflector A(i+1:m,i) to the GPU ------
+ magma_setvector<Ty>(i__2,
+ a + i__ +1+ i__ * a_dim1, 1,
+ da, da_offset + (i__-1)+1+ (i__-1)*(ldda), 1,
+ queue);
+ // 2. Multiply ---------------------------------------------
+ gpu_blas_gemv(clblasConjTrans, i__2, i__3, c_one,
+ da, da_offset + (i__-1)+1+ ((i__-1)+1) * ldda, ldda,
+ da, da_offset + (i__-1)+1+ (i__-1) * ldda, c__1,
+ c_zero, dy, dy_offset + i__ + 1 + i__ * y_dim1, c__1,
+ 1, &queue, 0, nullptr, &event);
+
+ // 3. Put the result back ----------------------------------
+ magma_getmatrix_async<Ty>(i__3, 1,
+ dy, dy_offset + i__+1+i__*y_dim1, y_dim1,
+ y+i__+1+i__*y_dim1, y_dim1,
+ queue, &event);
+
+ i__2 = m - i__;
+ i__3 = i__ - 1;
+ cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_one), &a[i__ + 1 + a_dim1],
+ lda, &a[i__ + 1 + i__ * a_dim1], c__1, cblas_scalar(&c_zero),
+ &y[ i__ * y_dim1 + 1], c__1);
+ i__2 = n - i__;
+ i__3 = i__ - 1;
+ cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one),
+ &y[i__ + 1 + y_dim1], ldy, &y[i__ * y_dim1 + 1], c__1,
+ cblas_scalar(&c_zero), f, c__1);
+
+ i__2 = m - i__;
+ cpu_blas_gemv(CblasTransParam, i__2, i__, cblas_scalar(&c_one), &x[i__ + 1 + x_dim1],
+ ldx, &a[i__ + 1 + i__ * a_dim1], c__1, cblas_scalar(&c_zero),
+ &y[i__ * y_dim1 + 1], c__1);
+
+ // 4. Synch to make sure the result is back ----------------
+ magma_event_sync(event);
+ if (i__3 != 0){
+ i__2 = n - i__;
+ cpu_blas_axpy(i__2, cblas_scalar(&c_one), f,c__1, &y[i__+1+i__*y_dim1],c__1);
+ }
+
+ i__2 = n - i__;
+ cpu_blas_gemv(CblasTransParam, i__, i__2, cblas_scalar(&c_neg_one),
+ &a[(i__ + 1) * a_dim1 + 1], lda, &y[i__ * y_dim1 + 1],
+ c__1, cblas_scalar(&c_one), &y[i__ + 1 + i__ * y_dim1], c__1);
+ i__2 = n - i__;
+ cpu_blas_scal(i__2, cblas_scalar(&tauq[i__]), &y[i__ + 1 + i__ * y_dim1], c__1);
+ }
+ else {
+ if (is_cplx) {
+ i__2 = n - i__ + 1;
+ cpu_lapack_lacgv(i__2, &a[i__ + i__ * a_dim1], lda);
+ magma_setvector<Ty>(i__2,
+ a + i__ + (i__ )* a_dim1, lda,
+ da, da_offset + (i__-1)+ (i__-1)*(ldda), ldda,
+ queue);
+ }
+ }
+ }
+ }
+
+ magma_queue_sync(queue);
+ magma_free_cpu(f);
+
+ return MAGMA_SUCCESS;
+}
+
+#define INSTANTIATE(Ty) \
+ template magma_int_t \
+ magma_labrd_gpu<Ty>( \
+ magma_int_t m, magma_int_t n, magma_int_t nb, \
+ Ty *a, magma_int_t lda, \
+ cl_mem da, size_t da_offset, magma_int_t ldda, \
+ void *_d, void *_e, Ty *tauq, Ty *taup, \
+ Ty *x, magma_int_t ldx, \
+ cl_mem dx, size_t dx_offset, magma_int_t lddx, \
+ Ty *y, magma_int_t ldy, \
+ cl_mem dy, size_t dy_offset, magma_int_t lddy, \
+ magma_queue_t queue);
+
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(magmaFloatComplex)
+INSTANTIATE(magmaDoubleComplex)
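
One idiom in magma_labrd_gpu worth calling out: the statements a -= a_offset, --d, --e, and the matching dx_offset/dy_offset adjustments shift every pointer back so the Fortran-translated body can keep its 1-based indexing. A tiny standalone example of the trick (hypothetical, not from the patch):

    #include <cassert>

    int main()
    {
        double buf[6] = {10, 11, 12, 13, 14, 15}; // column-major 3x2, lda = 3
        int lda = 3;
        double *a = buf;
        a -= 1 + lda;               // shift so a[i + j*lda] is 1-based in i, j
        assert(a[1 + 1*lda] == 10); // A(1,1)
        assert(a[2 + 1*lda] == 11); // A(2,1)
        assert(a[1 + 2*lda] == 13); // A(1,2)
        return 0;
    }
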
diff --git a/src/backend/opencl/magma/magma.h b/src/backend/opencl/magma/magma.h
index 5584c20..7797775 100644
--- a/src/backend/opencl/magma/magma.h
+++ b/src/backend/opencl/magma/magma.h
@@ -93,4 +93,25 @@ magma_getrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs,
magma_queue_t queue,
magma_int_t *info);
+template<typename Ty> magma_int_t
+magma_labrd_gpu(magma_int_t m, magma_int_t n, magma_int_t nb,
+ Ty *a, magma_int_t lda,
+ cl_mem da, size_t da_offset, magma_int_t ldda,
+ void *_d, void *_e, Ty *tauq, Ty *taup,
+ Ty *x, magma_int_t ldx,
+ cl_mem dx, size_t dx_offset, magma_int_t lddx,
+ Ty *y, magma_int_t ldy,
+ cl_mem dy, size_t dy_offset, magma_int_t lddy,
+ magma_queue_t queue);
+
+template<typename Ty> magma_int_t
+magma_gebrd_hybrid(magma_int_t m, magma_int_t n,
+ Ty *a, magma_int_t lda,
+ cl_mem da, size_t da_offset, magma_int_t ldda,
+ void *_d, void *_e,
+ Ty *tauq, Ty *taup,
+ Ty *work, magma_int_t lwork,
+ magma_queue_t queue,
+ magma_int_t *info, bool copy);
+
#endif
diff --git a/src/backend/opencl/magma/magma_cpu_lapack.h b/src/backend/opencl/magma/magma_cpu_lapack.h
index d406508..c051306 100644
--- a/src/backend/opencl/magma/magma_cpu_lapack.h
+++ b/src/backend/opencl/magma/magma_cpu_lapack.h
@@ -16,6 +16,8 @@
#define LAPACKE_dunmqr_work(...) LAPACKE_dormqr_work(__VA_ARGS__)
#define LAPACKE_sungqr_work(...) LAPACKE_sorgqr_work(__VA_ARGS__)
#define LAPACKE_dungqr_work(...) LAPACKE_dorgqr_work(__VA_ARGS__)
+#define LAPACKE_sungbr_work(...) LAPACKE_sorgbr_work(__VA_ARGS__)
+#define LAPACKE_dungbr_work(...) LAPACKE_dorgbr_work(__VA_ARGS__)
template<typename... Args>
int LAPACKE_slacgv(Args... args) { return 0; }
@@ -44,24 +46,42 @@ int LAPACKE_dlacgv(Args... args) { return 0; }
template<typename T> \
struct cpu_##NAME##_func;
-#define CPU_LAPACK_FUNC1(NAME, TYPE, X) \
- template<> \
- struct cpu_##NAME##_func<TYPE> \
- { \
- template<typename... Args> \
- int \
- operator() (Args... args) \
- { return LAPACK_NAME(X##NAME)(LAPACK_COL_MAJOR, args...); } \
+#define CPU_LAPACK_FUNC1(NAME, TYPE, X) \
+ template<> \
+ struct cpu_##NAME##_func<TYPE> \
+ { \
+ template<typename... Args> \
+ int \
+ operator() (Args... args) \
+ { \
+ int err = LAPACK_NAME(X##NAME)(LAPACK_COL_MAJOR, args...); \
+ if (err != 0) AF_ERROR("Error in "#NAME, AF_ERR_INTERNAL); \
+ return err; \
+ } \
};
-#define CPU_LAPACK_FUNC2(NAME, TYPE, X) \
- template<> \
- struct cpu_##NAME##_func<TYPE> \
- { \
- template<typename... Args> \
- int \
- operator() (Args... args) \
- { return LAPACK_NAME(X##NAME)(args...); } \
+#define CPU_LAPACK_FUNC2(NAME, TYPE, X) \
+ template<> \
+ struct cpu_##NAME##_func<TYPE> \
+ { \
+ template<typename... Args> \
+ int \
+ operator() (Args... args) \
+ { \
+ int err = LAPACK_NAME(X##NAME)(args...); \
+ if (err != 0) AF_ERROR("Error in "#NAME, AF_ERR_INTERNAL); \
+ return err; \
+ } \
+ };
+
+#define CPU_LAPACK_FUNC3(NAME, TYPE, X) \
+ template<> \
+ struct cpu_##NAME##_func<TYPE> \
+ { \
+ template<typename... Args> \
+ double \
+ operator() (Args... args) \
+ { return LAPACK_NAME(X##NAME)(args...); } \
};
#define CPU_LAPACK_DECL1(NAME) \
@@ -78,17 +98,28 @@ int LAPACKE_dlacgv(Args... args) { return 0; }
CPU_LAPACK_FUNC2(NAME, magmaFloatComplex, c) \
CPU_LAPACK_FUNC2(NAME, magmaDoubleComplex, z) \
+#define CPU_LAPACK_DECL3(NAME) \
+ CPU_LAPACK_FUNC_DEF(NAME) \
+ CPU_LAPACK_FUNC3(NAME, float, s) \
+ CPU_LAPACK_FUNC3(NAME, double, d) \
+
CPU_LAPACK_DECL1(getrf)
CPU_LAPACK_DECL1(gebrd)
+CPU_LAPACK_DECL1(gebrd_work)
CPU_LAPACK_DECL1(potrf)
CPU_LAPACK_DECL1(trtri)
CPU_LAPACK_DECL1(geqrf_work)
CPU_LAPACK_DECL1(larft)
CPU_LAPACK_DECL1(unmqr_work)
CPU_LAPACK_DECL1(ungqr_work)
+CPU_LAPACK_DECL1(ungbr_work)
+CPU_LAPACK_DECL1(bdsqr_work)
CPU_LAPACK_DECL1(laswp)
CPU_LAPACK_DECL1(laset)
+
CPU_LAPACK_DECL2(lacgv)
CPU_LAPACK_DECL2(larfg)
+CPU_LAPACK_DECL1(lacpy)
+CPU_LAPACK_DECL3(lamch)
#endif
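
To make the wrapper macros above concrete: CPU_LAPACK_FUNC1(gebrd_work, float, s) expands to roughly the functor below (a paraphrased sketch; it assumes LAPACK_NAME(sgebrd_work) resolves to the LAPACKE_sgebrd_work symbol):

    template<>
    struct cpu_gebrd_work_func<float>
    {
        template<typename... Args>
        int operator()(Args... args)
        {
            // LAPACK_COL_MAJOR is prepended for the FUNC1 family only;
            // FUNC2 wrappers pass the arguments straight through.
            int err = LAPACKE_sgebrd_work(LAPACK_COL_MAJOR, args...);
            if (err != 0) AF_ERROR("Error in gebrd_work", AF_ERR_INTERNAL);
            return err;
        }
    };
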
diff --git a/src/backend/opencl/magma/magma_helper.cpp b/src/backend/opencl/magma/magma_helper.cpp
index a3a3d9e..584a412 100644
--- a/src/backend/opencl/magma/magma_helper.cpp
+++ b/src/backend/opencl/magma/magma_helper.cpp
@@ -159,14 +159,12 @@ magma_int_t magma_get_geqrf_nb<magmaDoubleComplex>( magma_int_t m )
else return 128;
}
-template<typename T> magma_int_t magma_get_gebrd_nb(int num) { return 256; }
-
template<typename T> T magma_make(double r, double i) { return (T) r; }
template float magma_make<float>(double r, double i);
template double magma_make<double>(double r, double i);
template<> magmaFloatComplex magma_make<magmaFloatComplex>(double r, double i)
{
- magmaFloatComplex tmp = {r, i};
+ magmaFloatComplex tmp = {(float)r, (float)i};
return tmp;
}
template<> magmaDoubleComplex magma_make<magmaDoubleComplex>(double r, double i)
diff --git a/src/backend/opencl/magma/magma_helper.h b/src/backend/opencl/magma/magma_helper.h
index a63c8d1..6e81cb5 100644
--- a/src/backend/opencl/magma/magma_helper.h
+++ b/src/backend/opencl/magma/magma_helper.h
@@ -22,6 +22,6 @@ template<typename T> bool magma_is_real();
template<typename T> magma_int_t magma_get_getrf_nb(int num);
template<typename T> magma_int_t magma_get_potrf_nb(int num);
template<typename T> magma_int_t magma_get_geqrf_nb(int num);
-template<typename T> magma_int_t magma_get_gebrd_nb(int num);
+template<typename T> magma_int_t magma_get_gebrd_nb(int num) { return 256; }
#endif
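
The magma_helper hunks move the body of magma_get_gebrd_nb<T> from the .cpp into the header, presumably because a function template must be visible wherever it is instantiated: with only a declaration in the header, new call sites such as gebrd.cpp would compile but fail at link time unless the .cpp explicitly instantiated each type. A generic sketch of that failure mode (hypothetical names):

    // sizes.hpp
    template<typename T> int block_size(int n);            // declaration only
    // sizes.cpp
    template<typename T> int block_size(int n) { return 256; }
    // caller.cpp
    int nb = block_size<float>(512);   // compiles, but fails to link unless
                                       // sizes.cpp explicitly instantiates
                                       // block_size<float>
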
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git