[arrayfire] 304/408: Adding functions from clMagma necessary for OpenCL SVD:

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:12:18 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit fde6380e9cc5681e434dbcb1f0e0320fdadfa5b7
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date:   Thu Aug 20 10:09:00 2015 -0400

    Adding functions from clMagma necessary for OpenCL SVD:
    
    - gebrd_hybrid
    - labrd_gpu
---
 src/backend/opencl/magma/gebrd.cpp          | 367 +++++++++++++++
 src/backend/opencl/magma/labrd.cpp          | 663 ++++++++++++++++++++++++++++
 src/backend/opencl/magma/magma.h            |  21 +
 src/backend/opencl/magma/magma_cpu_lapack.h |  63 ++-
 src/backend/opencl/magma/magma_helper.cpp   |   4 +-
 src/backend/opencl/magma/magma_helper.h     |   2 +-
 6 files changed, 1100 insertions(+), 20 deletions(-)
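
For context, a minimal usage sketch of the new hybrid driver follows. The
helper gebrd_example is hypothetical (not part of this commit); it assumes a
valid magma_queue_t, a cl_mem device buffer of at least ldda*n floats, and
that the MAGMA headers from this diff are on the include path.

    #include <algorithm>
    #include <vector>

    magma_int_t gebrd_example(magma_int_t m, magma_int_t n,
                              float *a, magma_int_t lda,
                              cl_mem dA, magma_int_t ldda,
                              magma_queue_t queue)
    {
        magma_int_t mn = std::min(m, n), info = 0;
        std::vector<float> d(mn), e(mn > 0 ? mn - 1 : 0);
        std::vector<float> tauq(mn), taup(mn);

        // Workspace query: lwork == -1 only reports the optimal size in
        // work[0] and returns without factorizing.
        float query = 0;
        magma_gebrd_hybrid<float>(m, n, a, lda, dA, 0, ldda,
                                  d.data(), e.data(), tauq.data(), taup.data(),
                                  &query, -1, queue, &info, true);
        if (info != 0) return info;

        // Actual reduction; copy == true uploads a to dA first.
        std::vector<float> work((size_t)query);
        magma_gebrd_hybrid<float>(m, n, a, lda, dA, 0, ldda,
                                  d.data(), e.data(), tauq.data(), taup.data(),
                                  work.data(), (magma_int_t)work.size(),
                                  queue, &info, true);
        return info;
    }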

diff --git a/src/backend/opencl/magma/gebrd.cpp b/src/backend/opencl/magma/gebrd.cpp
new file mode 100644
index 0000000..e4df977
--- /dev/null
+++ b/src/backend/opencl/magma/gebrd.cpp
@@ -0,0 +1,367 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+/***********************************************************************
+ * Based on MAGMA library http://icl.cs.utk.edu/magma/
+ * Below is the original copyright.
+ *
+ *   -- MAGMA (version 0.1) --
+ *      Univ. of Tennessee, Knoxville
+ *      Univ. of California, Berkeley
+ *      Univ. of Colorado, Denver
+ *      @date
+ *
+ *      @precisions normal z -> s d c
+ *
+ * -- Innovative Computing Laboratory
+ * -- Electrical Engineering and Computer Science Department
+ * -- University of Tennessee
+ * -- (C) Copyright 2009-2013
+ *
+ * Redistribution  and  use  in  source and binary forms, with or without
+ * modification,  are  permitted  provided  that the following conditions
+ * are met:
+ *
+ * * Redistributions  of  source  code  must  retain  the above copyright
+ *   notice,  this  list  of  conditions  and  the  following  disclaimer.
+ * * Redistributions  in  binary  form must reproduce the above copyright
+ *   notice,  this list of conditions and the following disclaimer in the
+ *   documentation  and/or other materials provided with the distribution.
+ * * Neither  the  name of the University of Tennessee, Knoxville nor the
+ *   names of its contributors may be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+ * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **********************************************************************/
+
+#include "magma.h"
+#include "magma_blas.h"
+#include "magma_data.h"
+#include "magma_cpu_lapack.h"
+#include "magma_cpu_blas.h"
+#include "magma_helper.h"
+#include "magma_sync.h"
+#include <traits.hpp>
+#include <types.hpp>
+
+#include <algorithm>
+
+// produces pointer and offset as two args to magmaBLAS routines
+#define dA(i,j)  da, ((da_offset) + (i) + (j)*ldda)
+// produces pointer as single arg to BLAS routines
+#define A(i,j)  &a[ (i) + (j)*lda ]
+
+template<typename Ty>
+magma_int_t
+magma_gebrd_hybrid(
+    magma_int_t m, magma_int_t n,
+    Ty *a, magma_int_t lda,
+    cl_mem da, size_t da_offset, magma_int_t ldda,
+    void *_d, void *_e,
+    Ty *tauq, Ty *taup,
+    Ty *work, magma_int_t lwork,
+    magma_queue_t queue,
+    magma_int_t *info,
+    bool copy)
+{
+/*  -- MAGMA (version 1.1) --
+    Univ. of Tennessee, Knoxville
+    Univ. of California, Berkeley
+    Univ. of Colorado, Denver
+    @date
+
+    Purpose
+    =======
+    ZGEBRD reduces a general complex M-by-N matrix A to upper or lower
+    bidiagonal form B by an orthogonal transformation: Q**H * A * P = B.
+
+    If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+    Arguments
+    =========
+    M       (input) INTEGER
+    The number of rows in the matrix A.  M >= 0.
+
+    N       (input) INTEGER
+    The number of columns in the matrix A.  N >= 0.
+
+    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
+    On entry, the M-by-N general matrix to be reduced.
+    On exit,
+    if m >= n, the diagonal and the first superdiagonal are
+    overwritten with the upper bidiagonal matrix B; the
+    elements below the diagonal, with the array TAUQ, represent
+    the orthogonal matrix Q as a product of elementary
+    reflectors, and the elements above the first superdiagonal,
+    with the array TAUP, represent the orthogonal matrix P as
+    a product of elementary reflectors;
+    if m < n, the diagonal and the first subdiagonal are
+    overwritten with the lower bidiagonal matrix B; the
+    elements below the first subdiagonal, with the array TAUQ,
+    represent the orthogonal matrix Q as a product of
+    elementary reflectors, and the elements above the diagonal,
+    with the array TAUP, represent the orthogonal matrix P as
+    a product of elementary reflectors.
+    See Further Details.
+
+    LDA     (input) INTEGER
+    The leading dimension of the array A.  LDA >= max(1,M).
+
+    D       (output) double precision array, dimension (min(M,N))
+    The diagonal elements of the bidiagonal matrix B:
+    D(i) = A(i,i).
+
+    E       (output) double precision array, dimension (min(M,N)-1)
+    The off-diagonal elements of the bidiagonal matrix B:
+    if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
+    if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.
+
+    TAUQ    (output) COMPLEX_16 array dimension (min(M,N))
+    The scalar factors of the elementary reflectors which
+    represent the orthogonal matrix Q. See Further Details.
+
+    TAUP    (output) COMPLEX_16 array, dimension (min(M,N))
+    The scalar factors of the elementary reflectors which
+    represent the orthogonal matrix P. See Further Details.
+
+    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
+    On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
+
+    LWORK   (input) INTEGER
+    The length of the array WORK. LWORK >= (M+N)*NB, where NB
+    is the optimal blocksize.
+
+    If LWORK = -1, then a workspace query is assumed; the routine
+    only calculates the optimal size of the WORK array, returns
+    this value as the first entry of the WORK array, and no error
+    message related to LWORK is issued by XERBLA.
+
+    INFO    (output) INTEGER
+    = 0:  successful exit
+    < 0:  if INFO = -i, the i-th argument had an illegal value.
+
+    Further Details
+    ===============
+    The matrices Q and P are represented as products of elementary
+    reflectors:
+
+    If m >= n,
+    Q = H(1) H(2) . . . H(n)  and  P = G(1) G(2) . . . G(n-1)
+    Each H(i) and G(i) has the form:
+    H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'
+    where tauq and taup are complex scalars, and v and u are complex vectors;
+    v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i);
+    u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n);
+    tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+    If m < n,
+    Q = H(1) H(2) . . . H(m-1)  and  P = G(1) G(2) . . . G(m)
+    Each H(i) and G(i) has the form:
+    H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'
+    where tauq and taup are complex scalars, and v and u are complex vectors;
+    v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i);
+    u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n);
+    tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+    The contents of A on exit are illustrated by the following examples:
+
+    m = 6 and n = 5 (m > n):          m = 5 and n = 6 (m < n):
+
+    ( d   e   u1  u1  u1)           ( d   u1  u1  u1  u1  u1)
+    ( v1  d   e   u2  u2)           ( e   d   u2  u2  u2  u2)
+    ( v1  v2  d   e   u3)           ( v1  e   d   u3  u3  u3)
+    ( v1  v2  v3  d   e )           ( v1  v2  e   d   u4  u4)
+    ( v1  v2  v3  v4  d )           ( v1  v2  v3  e   d   u5)
+    ( v1  v2  v3  v4  v5)
+
+    where d and e denote diagonal and off-diagonal elements of B, vi
+    denotes an element of the vector defining H(i), and ui an element of
+    the vector defining G(i).
+    =====================================================================    */
+
+    typedef typename af::dtype_traits<Ty>::base_type Tr;
+
+    Tr *d = (Tr *)_d;
+    Tr *e = (Tr *)_e;
+
+
+    Ty c_neg_one = magma_neg_one<Ty>();
+    Ty c_one     = magma_one<Ty>();
+    cl_mem dwork;
+
+    magma_int_t ncol, nrow, jmax, nb;
+
+    magma_int_t i, j, nx;
+    //magma_int_t iinfo;
+
+    magma_int_t minmn;
+    magma_int_t ldwrkx, ldwrky, lwkopt;
+    magma_int_t lquery;
+
+    nb   = magma_get_gebrd_nb<Ty>(n);
+
+    lwkopt = (m + n) * nb;
+    work[0] = magma_make<Ty>(lwkopt, 0.);
+    lquery = (lwork == -1);
+
+    /* Check arguments */
+    *info = 0;
+    if (m < 0) {
+        *info = -1;
+    } else if (n < 0) {
+        *info = -2;
+    } else if (lda < std::max(1,m)) {
+        *info = -4;
+    } else if (lwork < lwkopt && (! lquery)) {
+        *info = -10;
+    }
+    if (*info < 0) {
+        //magma_xerbla(__func__, -(*info));
+        return *info;
+    }
+    else if (lquery)
+        return *info;
+
+    /* Quick return if possible */
+    minmn = std::min(m,n);
+    if (minmn == 0) {
+        work[0] = c_one;
+        return *info;
+    }
+
+    if (MAGMA_SUCCESS != magma_malloc<Ty>(&dwork, (m + n)*nb)) {
+        *info = MAGMA_ERR_DEVICE_ALLOC;
+        return *info;
+    }
+    size_t dwork_offset = 0;
+
+    cl_event event = 0;
+
+    ldwrkx = m;
+    ldwrky = n;
+
+    /* Set the block/unblock crossover point NX. */
+    nx = nb;
+    assert(nx <= nb);
+
+    /* Copy the matrix to the GPU */
+    if (copy && minmn - nx >= 1) {
+        magma_setmatrix<Ty>(m, n, a, lda, da, da_offset, ldda, queue);
+    }
+
+    gpu_gemm_func<Ty> gpu_blas_gemm;
+    cpu_gebrd_work_func<Ty> cpu_lapack_gebrd_work;
+
+    for (i=0; i< (minmn - nx); i += nb) {
+        /*  Reduce rows and columns i:i+nb-1 to bidiagonal form and return
+            the matrices X and Y which are needed to update the unreduced
+            part of the matrix */
+        nrow = m - i;
+        ncol = n - i;
+
+        /*   Get the current panel (no need for the 1st iteration) */
+        if (i > 0) {
+            magma_getmatrix<Ty>(nrow, nb, dA(i, i), ldda, A(i, i), lda, queue);
+            magma_getmatrix<Ty>(nb, ncol - nb,
+                                dA(i, i+nb), ldda,
+                                A(i, i+nb), lda, queue);
+        }
+
+        magma_labrd_gpu<Ty>(nrow, ncol, nb,
+                            A(i, i),          lda,    dA(i, i),          ldda,
+                            d+i, e+i, tauq+i, taup+i,
+                            work,             ldwrkx, dwork, dwork_offset,             ldwrkx,  // x, dx
+                            work+(ldwrkx*nb), ldwrky, dwork, dwork_offset+(ldwrkx*nb), ldwrky,  // y, dy
+                            queue);
+
+        /*  Update the trailing submatrix A(i+nb:m,i+nb:n), using an update
+            of the form  A := A - V*Y' - X*U' */
+        nrow = m - i - nb;
+        ncol = n - i - nb;
+
+        // Send X and Y back to the GPU
+        magma_setmatrix<Ty>(nrow, nb, work+nb, ldwrkx, dwork, dwork_offset+nb, ldwrkx, queue);
+        magma_setmatrix<Ty>(ncol, nb,
+                            work  +               (ldwrkx+1)*nb, ldwrky,
+                            dwork, dwork_offset + (ldwrkx+1)*nb, ldwrky, queue);
+
+        gpu_blas_gemm(clblasNoTrans, clblasConjTrans,
+                      nrow, ncol, nb,
+                      c_neg_one, dA(i+nb, i  ),      ldda,
+                      dwork, dwork_offset+(ldwrkx+1)*nb, ldwrky,
+                      c_one,     dA(i+nb, i+nb), ldda,
+                      1, &queue, 0, nullptr, &event);
+
+        gpu_blas_gemm(clblasNoTrans, clblasNoTrans,
+                      nrow, ncol, nb,
+                      c_neg_one, dwork, dwork_offset+nb, ldwrkx,
+                      dA(i,    i+nb), ldda,
+                      c_one,     dA(i+nb, i+nb), ldda,
+                      1, &queue, 0, nullptr, &event);
+
+        /* Copy diagonal and off-diagonal elements of B back into A */
+        if (m >= n) {
+            jmax = i + nb;
+            for (j = i; j < jmax; ++j) {
+                *A(j, j ) = magma_make<Ty>(d[j], 0.);
+                *A(j, j+1) = magma_make<Ty>(e[j], 0.);
+            }
+        } else {
+            jmax = i + nb;
+            for (j = i; j < jmax; ++j) {
+                *A(j,   j) = magma_make<Ty>(d[j], 0.);
+                *A(j+1, j) = magma_make<Ty>(e[j], 0.);
+            }
+        }
+    }
+
+    /* Use unblocked code to reduce the remainder of the matrix */
+    nrow = m - i;
+    ncol = n - i;
+
+    if (0 < minmn - nx) {
+        magma_getmatrix<Ty>(nrow, ncol, dA(i, i), ldda, A(i, i), lda, queue);
+    }
+
+    *info = cpu_lapack_gebrd_work(nrow, ncol,
+                                  A(i, i), lda, d+i, e+i,
+                                  tauq+i, taup+i, work, lwork);
+    work[0] = magma_make<Ty>(lwkopt, 0.);
+
+    magma_free(dwork);
+    return *info;
+} /* magma_gebrd_hybrid */
+
+#define INSTANTIATE(Ty)                                 \
+    template magma_int_t                                \
+    magma_gebrd_hybrid<Ty>(                             \
+        magma_int_t m, magma_int_t n,                   \
+        Ty *a, magma_int_t lda,                         \
+        cl_mem da, size_t da_offset, magma_int_t ldda,  \
+        void *_d, void *_e,                             \
+        Ty *tauq, Ty *taup,                             \
+        Ty *work, magma_int_t lwork,                    \
+        magma_queue_t queue,                            \
+        magma_int_t *info, bool copy);                  \
+
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(magmaFloatComplex)
+INSTANTIATE(magmaDoubleComplex)
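
The Further Details blocks above represent Q and P as products of elementary
reflectors H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'. As a
reference for what a single reflector does, here is an illustrative scalar
version for the real case (apply_reflector is a made-up name; the library
generates reflectors through the cpu_larfg_func wrapper, and this fragment
only spells out the algebra):

    // y := H * x with H = I - tau * v * v': form w = v' * x,
    // then subtract tau * w * v from x in place.
    void apply_reflector(int n, const float *v, float tau, float *x)
    {
        float w = 0.f;
        for (int k = 0; k < n; ++k) w += v[k] * x[k];        // w = v' * x
        for (int k = 0; k < n; ++k) x[k] -= tau * w * v[k];  // x -= tau*w*v
    }
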
diff --git a/src/backend/opencl/magma/labrd.cpp b/src/backend/opencl/magma/labrd.cpp
new file mode 100644
index 0000000..ee7f120
--- /dev/null
+++ b/src/backend/opencl/magma/labrd.cpp
@@ -0,0 +1,663 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+/***********************************************************************
+ * Based on MAGMA library http://icl.cs.utk.edu/magma/
+ * Below is the original copyright.
+ *
+ *   -- MAGMA (version 0.1) --
+ *      Univ. of Tennessee, Knoxville
+ *      Univ. of California, Berkeley
+ *      Univ. of Colorado, Denver
+ *      @date
+ *
+ *      @precisions normal z -> s d c
+ *
+ * -- Innovative Computing Laboratory
+ * -- Electrical Engineering and Computer Science Department
+ * -- University of Tennessee
+ * -- (C) Copyright 2009-2013
+ *
+ * Redistribution  and  use  in  source and binary forms, with or without
+ * modification,  are  permitted  provided  that the following conditions
+ * are met:
+ *
+ * * Redistributions  of  source  code  must  retain  the above copyright
+ *   notice,  this  list  of  conditions  and  the  following  disclaimer.
+ * * Redistributions  in  binary  form must reproduce the above copyright
+ *   notice,  this list of conditions and the following disclaimer in the
+ *   documentation  and/or other materials provided with the distribution.
+ * * Neither  the  name of the University of Tennessee, Knoxville nor the
+ *   names of its contributors may be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+ * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **********************************************************************/
+
+
+#include "magma.h"
+#include "magma_blas.h"
+#include "magma_data.h"
+#include "magma_cpu_lapack.h"
+#include "magma_cpu_blas.h"
+#include "magma_helper.h"
+#include "magma_sync.h"
+#include <traits.hpp>
+#include <types.hpp>
+
+#include <algorithm>
+
+template<typename Ty>  magma_int_t
+magma_labrd_gpu(
+    magma_int_t m, magma_int_t n, magma_int_t nb,
+    Ty *a, magma_int_t lda,
+    cl_mem da, size_t da_offset, magma_int_t ldda,
+    void *_d, void *_e, Ty *tauq, Ty *taup,
+    Ty *x, magma_int_t ldx,
+    cl_mem dx, size_t dx_offset, magma_int_t lddx,
+    Ty *y, magma_int_t ldy,
+    cl_mem dy, size_t dy_offset, magma_int_t lddy,
+    magma_queue_t queue)
+{
+/*  -- MAGMA (version 1.1) --
+    Univ. of Tennessee, Knoxville
+    Univ. of California, Berkeley
+    Univ. of Colorado, Denver
+    @date
+
+    Purpose
+    =======
+    ZLABRD reduces the first NB rows and columns of a complex general
+    m by n matrix A to upper or lower bidiagonal form by an orthogonal
+    transformation Q' * A * P, and returns the matrices X and Y which
+    are needed to apply the transformation to the unreduced part of A.
+
+    If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+    bidiagonal form.
+
+    This is an auxiliary routine called by ZGEBRD.
+
+    Arguments
+    =========
+    M       (input) INTEGER
+    The number of rows in the matrix A.
+
+    N       (input) INTEGER
+    The number of columns in the matrix A.
+
+    NB      (input) INTEGER
+    The number of leading rows and columns of A to be reduced.
+
+    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
+    On entry, the m by n general matrix to be reduced.
+    On exit, the first NB rows and columns of the matrix are
+    overwritten; the rest of the array is unchanged.
+    If m >= n, elements on and below the diagonal in the first NB
+    columns, with the array TAUQ, represent the orthogonal
+    matrix Q as a product of elementary reflectors; and
+    elements above the diagonal in the first NB rows, with the
+    array TAUP, represent the orthogonal matrix P as a product
+    of elementary reflectors.
+    If m < n, elements below the diagonal in the first NB
+    columns, with the array TAUQ, represent the orthogonal
+    matrix Q as a product of elementary reflectors, and
+    elements on and above the diagonal in the first NB rows,
+    with the array TAUP, represent the orthogonal matrix P as
+    a product of elementary reflectors.
+    See Further Details.
+
+    LDA     (input) INTEGER
+    The leading dimension of the array A.  LDA >= max(1,M).
+
+    D       (output) double precision array, dimension (NB)
+    The diagonal elements of the first NB rows and columns of
+    the reduced matrix.  D(i) = A(i,i).
+
+    E       (output) double precision array, dimension (NB)
+    The off-diagonal elements of the first NB rows and columns of
+    the reduced matrix.
+
+    TAUQ    (output) COMPLEX_16 array dimension (NB)
+    The scalar factors of the elementary reflectors which
+    represent the orthogonal matrix Q. See Further Details.
+
+    TAUP    (output) COMPLEX_16 array, dimension (NB)
+    The scalar factors of the elementary reflectors which
+    represent the orthogonal matrix P. See Further Details.
+
+    X       (output) COMPLEX_16 array, dimension (LDX,NB)
+    The m-by-nb matrix X required to update the unreduced part
+    of A.
+
+    LDX     (input) INTEGER
+    The leading dimension of the array X. LDX >= M.
+
+    Y       (output) COMPLEX_16 array, dimension (LDY,NB)
+    The n-by-nb matrix Y required to update the unreduced part
+    of A.
+
+    LDY     (input) INTEGER
+    The leading dimension of the array Y. LDY >= N.
+
+    Further Details
+    ===============
+    The matrices Q and P are represented as products of elementary
+    reflectors:
+
+    Q = H(1) H(2) . . . H(nb)  and  P = G(1) G(2) . . . G(nb)
+
+    Each H(i) and G(i) has the form:
+
+    H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'
+
+    where tauq and taup are complex scalars, and v and u are complex vectors.
+
+    If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
+    A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
+    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+    If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
+    A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
+    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).
+
+    The elements of the vectors v and u together form the m-by-nb matrix
+    V and the nb-by-n matrix U' which are needed, with X and Y, to apply
+    the transformation to the unreduced part of the matrix, using a block
+    update of the form:  A := A - V*Y' - X*U'.
+
+    The contents of A on exit are illustrated by the following examples
+    with nb = 2:
+
+    m = 6 and n = 5 (m > n):          m = 5 and n = 6 (m < n):
+
+    ( 1   1   u1  u1  u1)           ( 1   u1  u1  u1  u1  u1)
+    ( v1  1   1   u2  u2)           ( 1   1   u2  u2  u2  u2)
+    ( v1  v2  a   a   a )           ( v1  1   a   a   a   a )
+    ( v1  v2  a   a   a )           ( v1  v2  a   a   a   a )
+    ( v1  v2  a   a   a )           ( v1  v2  a   a   a   a )
+    ( v1  v2  a   a   a )
+
+    where a denotes an element of the original matrix which is unchanged,
+    vi denotes an element of the vector defining H(i), and ui an element
+    of the vector defining G(i).
+    =====================================================================    */
+
+    typedef typename af::dtype_traits<Ty>::base_type Tr;
+
+    const bool is_cplx = opencl::is_complex<Ty>::value;
+
+    Tr *d = (Tr *)_d;
+    Tr *e = (Tr *)_e;
+
+    Ty c_neg_one = magma_neg_one<Ty>();
+    Ty c_one = magma_one<Ty>();
+    Ty c_zero = magma_zero<Ty>();
+    magma_int_t c__1 = 1;
+
+    magma_int_t a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__2, i__3;
+    magma_int_t i__;
+    Ty alpha;
+
+    a_dim1 = lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --d;
+    --e;
+    --tauq;
+    --taup;
+
+    x_dim1 = ldx;
+    x_offset = 1 + x_dim1;
+    x -= x_offset;
+    dx_offset -= 1 + lddx;
+
+    y_dim1 = ldy;
+    y_offset = 1 + y_dim1;
+    y -= y_offset;
+    dy_offset -= 1 + lddy;
+
+    /* Quick return if possible */
+    if (m <= 0 || n <= 0) {
+        return 0;
+    }
+
+    Ty *f;
+    magma_malloc_cpu<Ty>(&f, std::max(n,m));
+    assert(f != NULL);  // TODO return error, or allocate outside zlabrd
+
+    magma_event_t event = NULL;
+
+    gpu_gemv_func<Ty> gpu_blas_gemv;
+    cpu_gemv_func<Ty> cpu_blas_gemv;
+    cpu_scal_func<Ty> cpu_blas_scal;
+    cpu_axpy_func<Ty> cpu_blas_axpy;
+    cpu_larfg_func<Ty> cpu_lapack_larfg;
+    cpu_lacgv_func<Ty> cpu_lapack_lacgv;
+
+    CBLAS_TRANSPOSE CblasTransParam = is_cplx ? CblasConjTrans : CblasTrans;
+
+    if (m >= n) {
+        /* Reduce to upper bidiagonal form */
+        for (i__ = 1; i__ <= nb; ++i__) {
+            /*  Update A(i:m,i) */
+            i__2 = m - i__ + 1;
+            i__3 = i__ - 1;
+
+            if (is_cplx) {
+                cpu_lapack_lacgv(i__3, &y[i__+y_dim1], ldy);
+            }
+
+            cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one), &a[i__ + a_dim1], lda,
+                          &y[i__+y_dim1], ldy, cblas_scalar(&c_one), &a[i__ + i__ * a_dim1], c__1);
+
+            if (is_cplx) {
+                cpu_lapack_lacgv(i__3, &y[i__+y_dim1], ldy);
+            }
+
+            cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one), &x[i__ + x_dim1], ldx,
+                          &a[i__*a_dim1+1], c__1, cblas_scalar(&c_one), &a[i__+i__*a_dim1], c__1);
+
+            /* Generate reflection Q(i) to annihilate A(i+1:m,i) */
+            alpha = a[i__ + i__ * a_dim1];
+            i__2 = m - i__ + 1;
+            i__3 = i__ + 1;
+            cpu_lapack_larfg(i__2, &alpha,
+                             &a[std::min(i__3,m) + i__ * a_dim1], c__1, &tauq[i__]);
+            d[i__] = magma_real<Ty>(alpha);
+            if (i__ < n) {
+                a[i__ + i__ * a_dim1] = c_one;
+
+                /* Compute Y(i+1:n,i) */
+                i__2 = m - i__ + 1;
+                i__3 = n - i__;
+
+                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
+                magma_setvector<Ty>(i__2,
+                                    a + i__   + i__   * a_dim1, 1,
+                                    da, da_offset + (i__-1)+(i__-1)* (ldda), 1,
+                                    queue);
+                // 2. Multiply ---------------------------------------------
+                gpu_blas_gemv(clblasConjTrans, i__2, i__3, c_one,
+                              da, da_offset + (i__-1) + ((i__-1) + 1) * (ldda), ldda,
+                              da, da_offset + (i__-1) + (i__-1) * (ldda), c__1, c_zero,
+                              dy, dy_offset + i__ + 1 + i__ * y_dim1, c__1,
+                              1, &queue, 0, nullptr, &event);
+
+                // 3. Put the result back ----------------------------------
+                magma_getmatrix_async<Ty>(i__3, 1,
+                                          dy, dy_offset + i__+1+i__*y_dim1, y_dim1,
+                                          y+i__+1+i__*y_dim1, y_dim1,
+                                          queue, &event);
+                i__2 = m - i__ + 1;
+                i__3 = i__ - 1;
+                cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_one), &a[i__ + a_dim1],
+                              lda, &a[i__ + i__ * a_dim1], c__1, cblas_scalar(&c_zero),
+                              &y[i__ * y_dim1 + 1], c__1);
+
+                i__2 = n - i__;
+                i__3 = i__ - 1;
+                cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one), &y[i__ + 1 +y_dim1], ldy,
+                              &y[i__ * y_dim1 + 1], c__1,
+                              cblas_scalar(&c_zero), f, c__1);
+                i__2 = m - i__ + 1;
+                i__3 = i__ - 1;
+                cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_one), &x[i__ + x_dim1],
+                              ldx, &a[i__ + i__ * a_dim1], c__1, cblas_scalar(&c_zero),
+                              &y[i__ * y_dim1 + 1], c__1);
+
+                // 4. Synch to make sure the result is back ----------------
+                magma_event_sync(event);
+
+                if (i__3 != 0) {
+                    i__2 = n - i__;
+                    cpu_blas_axpy(i__2, cblas_scalar(&c_one), f, c__1, &y[i__+1+i__*y_dim1], c__1);
+                }
+
+                i__2 = i__ - 1;
+                i__3 = n - i__;
+                cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_neg_one),
+                              &a[(i__ + 1) * a_dim1 + 1], lda, &y[i__ * y_dim1 + 1], c__1, cblas_scalar(&c_one),
+                              &y[i__ + 1 + i__ * y_dim1], c__1);
+                i__2 = n - i__;
+                cpu_blas_scal(i__2, cblas_scalar(&tauq[i__]), &y[i__ + 1 + i__ * y_dim1], c__1);
+
+                /* Update A(i,i+1:n) */
+                i__2 = n - i__;
+                if (is_cplx) {
+                    cpu_lapack_lacgv(i__2, &a[i__+(i__+1)*a_dim1], lda);
+                    cpu_lapack_lacgv(i__,  &a[i__+a_dim1], lda);
+                }
+
+                cpu_blas_gemv(CblasNoTrans, i__2, i__, cblas_scalar(&c_neg_one),
+                              &y[i__ + 1 + y_dim1], ldy, &a[i__ + a_dim1], lda,
+                              cblas_scalar(&c_one), &a[i__ + (i__ + 1) * a_dim1], lda);
+                i__2 = i__ - 1;
+                i__3 = n - i__;
+
+                if (is_cplx) {
+                    cpu_lapack_lacgv(i__,  &a[i__+a_dim1], lda);
+                    cpu_lapack_lacgv(i__2, &x[i__+x_dim1], ldx);
+                }
+
+                cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_neg_one),
+                              &a[(i__ + 1) * a_dim1 + 1], lda, &x[i__ + x_dim1], ldx,
+                              cblas_scalar(&c_one), &a[i__ + (i__ + 1) * a_dim1], lda);
+                if (is_cplx) {
+                    cpu_lapack_lacgv(i__2, &x[i__+x_dim1], ldx);
+                }
+
+                /* Generate reflection P(i) to annihilate A(i,i+2:n) */
+                i__2 = n - i__;
+                /* Computing MIN */
+                i__3 = i__ + 2;
+                alpha = a[i__ + (i__ + 1) * a_dim1];
+                cpu_lapack_larfg(i__2, &alpha, &a[i__ + std::min(
+                                         i__3,n) * a_dim1], lda, &taup[i__]);
+                e[i__] = magma_real<Ty>(alpha);
+                a[i__ + (i__ + 1) * a_dim1] = c_one;
+
+                /* Compute X(i+1:m,i) */
+                i__2 = m - i__;
+                i__3 = n - i__;
+                // 1. Send the block reflector  A(i,i+1:n) to the GPU ------
+                magma_setvector<Ty>(i__3,
+                                    a + i__   + (i__   +1)* a_dim1, lda,
+                                    da, da_offset + (i__-1)+((i__-1)+1)*(ldda), ldda,
+                                    queue);
+                // 2. Multiply ---------------------------------------------
+                //magma_zcopy(i__3, da+(i__-1)+((i__-1)+1)*(ldda), ldda,
+                //            dy + 1 + lddy, 1);
+                gpu_blas_gemv(clblasNoTrans, i__2, i__3, c_one,
+                              da, da_offset + (i__-1)+1+ ((i__-1)+1) * (ldda), ldda,
+                              da, da_offset + (i__-1) +  ((i__-1)+1) * (ldda), ldda,
+                              //dy + 1 + lddy, 1,
+                              c_zero, dx, dx_offset + i__ + 1 + i__ * x_dim1, c__1,
+                              1, &queue, 0, nullptr, &event);
+
+                // 3. Put the result back ----------------------------------
+                magma_getmatrix_async<Ty>(i__2, 1,
+                                          dx, dx_offset + i__+1+i__*x_dim1, x_dim1,
+                                          x+i__+1+i__*x_dim1, x_dim1,
+                                          queue, &event);
+
+                i__2 = n - i__;
+                cpu_blas_gemv(CblasTransParam, i__2, i__, cblas_scalar(&c_one),
+                              &y[i__ + 1 + y_dim1], ldy, &a[i__ + (i__ + 1) * a_dim1], lda,
+                              cblas_scalar(&c_zero), &x[i__ * x_dim1 + 1], c__1);
+
+                i__2 = m - i__;
+                cpu_blas_gemv(CblasNoTrans, i__2, i__, cblas_scalar(&c_neg_one), &a[i__ + 1 + a_dim1], lda,
+                              &x[i__ * x_dim1 + 1], c__1, cblas_scalar(&c_zero), f, c__1);
+                i__2 = i__ - 1;
+                i__3 = n - i__;
+                cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_one), &a[(i__ + 1) * a_dim1 + 1],
+                              lda, &a[i__ + (i__ + 1) * a_dim1], lda,
+                              cblas_scalar(&c_zero), &x[i__ * x_dim1 + 1], c__1);
+
+                // 4. Synch to make sure the result is back ----------------
+                magma_event_sync(event);
+
+                if (i__ != 0) {
+                    i__2 = m - i__;
+                    cpu_blas_axpy(i__2, cblas_scalar(&c_one), f, c__1, &x[i__+1+i__*x_dim1], c__1);
+                }
+
+
+                i__2 = m - i__;
+                i__3 = i__ - 1;
+                cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one),
+                              &x[i__ + 1 + x_dim1], ldx, &x[i__ * x_dim1 + 1], c__1,
+                              cblas_scalar(&c_one), &x[i__ + 1 + i__ * x_dim1], c__1);
+                i__2 = m - i__;
+                cpu_blas_scal(i__2, cblas_scalar(&taup[i__]), &x[i__ + 1 + i__ * x_dim1], c__1);
+
+                if (is_cplx) {
+                    i__2 = n - i__;
+                    cpu_lapack_lacgv(i__2,  &a[i__+(i__+1)*a_dim1], lda);
+                    // 4. Send the block reflector  A(i+1:m,i) to the GPU after ZLACGV()
+                    magma_setvector<Ty>(i__2,
+                                        a + i__   + (i__   +1)* a_dim1, lda,
+                                        da, da_offset + (i__-1)+((i__-1)+1)*(ldda), ldda,
+                                        queue);
+                }
+            }
+        }
+    }
+    else {
+        /* Reduce to lower bidiagonal form */
+        for (i__ = 1; i__ <= nb; ++i__) {
+
+            /* Update A(i,i:n) */
+            i__2 = n - i__ + 1;
+            i__3 = i__ - 1;
+            if (is_cplx) {
+                cpu_lapack_lacgv(i__2, &a[i__ + i__ * a_dim1], lda);
+                cpu_lapack_lacgv(i__3, &a[i__ + a_dim1], lda);
+            }
+            cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one), &y[i__ + y_dim1], ldy,
+                          &a[i__ + a_dim1], lda, cblas_scalar(&c_one), &a[i__ + i__ * a_dim1], lda);
+            i__2 = i__ - 1;
+            if (is_cplx) {
+                cpu_lapack_lacgv(i__3, &a[i__ + a_dim1], lda);
+                cpu_lapack_lacgv(i__3, &x[i__ + x_dim1], ldx);
+            }
+            i__3 = n - i__ + 1;
+            cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_neg_one), &a[i__ * a_dim1 + 1],
+                          lda, &x[i__ + x_dim1], ldx, cblas_scalar(&c_one), &a[i__ + i__ * a_dim1], lda);
+            if (is_cplx) {
+                cpu_lapack_lacgv(i__2, &x[i__ + x_dim1], ldx);
+            }
+
+            /* Generate reflection P(i) to annihilate A(i,i+1:n) */
+            i__2 = n - i__ + 1;
+            /* Computing MIN */
+            i__3 = i__ + 1;
+            alpha = a[i__ + i__ * a_dim1];
+            cpu_lapack_larfg(i__2, &alpha,
+                             &a[i__ + std::min(i__3,n) * a_dim1], lda, &taup[i__]);
+            d[i__] = magma_real<Ty>(alpha);
+            if (i__ < m) {
+                a[i__ + i__ * a_dim1] = c_one;
+
+                /* Compute X(i+1:m,i) */
+                i__2 = m - i__;
+                i__3 = n - i__ + 1;
+
+                // 1. Send the block reflector  A(i,i+1:n) to the GPU ------
+                magma_setvector<Ty>(i__3,
+                                    a + i__   + i__   * a_dim1, lda,
+                                    da, da_offset + (i__-1)+(i__-1)* (ldda), ldda,
+                                    queue);
+
+                // 2. Multiply ---------------------------------------------
+                //magma_zcopy(i__3, da+(i__-1)+(i__-1)*(ldda), ldda,
+                //            dy + 1 + lddy, 1);
+                gpu_blas_gemv(clblasNoTrans, i__2, i__3, c_one,
+                              da, da_offset + (i__-1)+1 + (i__-1) * ldda, ldda,
+                              da, da_offset + (i__-1)   + (i__-1) * ldda, ldda,
+                              // dy + 1 + lddy, 1,
+                              c_zero,
+                              dx, dx_offset + i__ + 1 + i__ * x_dim1, c__1,
+                              1, &queue, 0, nullptr, &event);
+
+
+                // 3. Put the result back ----------------------------------
+                magma_getmatrix_async<Ty>(i__2, 1,
+                                          dx, dx_offset + i__+1+i__*x_dim1, x_dim1,
+                                          x+i__+1+i__*x_dim1, x_dim1,
+                                          queue, &event);
+
+                i__2 = n - i__ + 1;
+                i__3 = i__ - 1;
+                cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_one), &y[i__ + y_dim1],
+                              ldy, &a[i__ + i__ * a_dim1], lda, cblas_scalar(&c_zero),
+                              &x[i__ * x_dim1 + 1], c__1);
+                i__2 = m - i__;
+                i__3 = i__ - 1;
+                cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one),
+                              &a[i__ + 1 + a_dim1], lda, &x[i__ * x_dim1 + 1], c__1, cblas_scalar(&c_zero),
+                              f, c__1);
+
+                i__2 = i__ - 1;
+                i__3 = n - i__ + 1;
+                cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_one),
+                              &a[i__ * a_dim1 + 1], lda, &a[i__ + i__ * a_dim1], lda, cblas_scalar(&c_zero),
+                              &x[i__ * x_dim1 + 1], c__1);
+
+                // 4. Synch to make sure the result is back ----------------
+                magma_event_sync(event);
+                if (i__2 != 0) {
+                    i__3 = m - i__;
+                    cpu_blas_axpy(i__3, cblas_scalar(&c_one), f, c__1, &x[i__+1+i__*x_dim1], c__1);
+                }
+
+                i__2 = m - i__;
+                i__3 = i__ - 1;
+                cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one),
+                              &x[i__ + 1 + x_dim1], ldx, &x[i__ * x_dim1 + 1], c__1, cblas_scalar(&c_one),
+                              &x[i__ + 1 + i__ * x_dim1], c__1);
+                i__2 = m - i__;
+                cpu_blas_scal(i__2, cblas_scalar(&taup[i__]), &x[i__ + 1 + i__ * x_dim1], c__1);
+                i__2 = n - i__ + 1;
+
+                if (is_cplx) {
+                    cpu_lapack_lacgv(i__2, &a[i__ + i__ * a_dim1], lda);
+                    magma_setvector<Ty>(i__2,
+                                        a + i__   + (i__ )* a_dim1, lda,
+                                        da, da_offset + (i__-1)+ (i__-1)*(ldda), ldda,
+                                        queue);
+                }
+
+                /* Update A(i+1:m,i) */
+                i__2 = m - i__;
+                i__3 = i__ - 1;
+
+                if (is_cplx) {
+                    cpu_lapack_lacgv(i__3, &y[i__ + y_dim1], ldy);
+                }
+
+                cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one),
+                              &a[i__ + 1 + a_dim1], lda, &y[i__ + y_dim1], ldy, cblas_scalar(&c_one),
+                              &a[i__ + 1 + i__ * a_dim1], c__1);
+                i__2 = m - i__;
+                if (is_cplx) {
+                    cpu_lapack_lacgv(i__3, &y[i__ + y_dim1], ldy);
+                }
+                cpu_blas_gemv(CblasNoTrans, i__2, i__, cblas_scalar(&c_neg_one),
+                              &x[i__ + 1 + x_dim1], ldx, &a[i__ * a_dim1 + 1], c__1, cblas_scalar(&c_one),
+                              &a[i__ + 1 + i__ * a_dim1], c__1);
+
+                /* Generate reflection Q(i) to annihilate A(i+2:m,i) */
+                i__2 = m - i__;
+                i__3 = i__ + 2;
+                alpha = a[i__ + 1 + i__ * a_dim1];
+                cpu_lapack_larfg(i__2, &alpha,
+                                 &a[std::min(i__3,m) + i__ * a_dim1], c__1, &tauq[i__]);
+                e[i__] = magma_real<Ty>(alpha);
+                a[i__ + 1 + i__ * a_dim1] = c_one;
+
+                /* Compute Y(i+1:n,i) */
+                i__2 = m - i__;
+                i__3 = n - i__;
+
+                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
+                magma_setvector<Ty>(i__2,
+                                    a + i__   +1+  i__   * a_dim1, 1,
+                                    da, da_offset + (i__-1)+1+ (i__-1)*(ldda),  1,
+                                    queue);
+                // 2. Multiply ---------------------------------------------
+                gpu_blas_gemv(clblasConjTrans, i__2, i__3, c_one,
+                              da, da_offset + (i__-1)+1+ ((i__-1)+1) * ldda, ldda,
+                              da, da_offset + (i__-1)+1+  (i__-1)    * ldda, c__1,
+                              c_zero, dy, dy_offset + i__ + 1 + i__ * y_dim1, c__1,
+                              1, &queue, 0, nullptr, &event);
+
+                // 3. Put the result back ----------------------------------
+                magma_getmatrix_async<Ty>(i__3, 1,
+                                          dy, dy_offset + i__+1+i__*y_dim1, y_dim1,
+                                          y+i__+1+i__*y_dim1, y_dim1,
+                                          queue, &event);
+
+                i__2 = m - i__;
+                i__3 = i__ - 1;
+                cpu_blas_gemv(CblasTransParam, i__2, i__3, cblas_scalar(&c_one), &a[i__ + 1 + a_dim1],
+                              lda, &a[i__ + 1 + i__ * a_dim1], c__1, cblas_scalar(&c_zero),
+                              &y[i__ * y_dim1 + 1], c__1);
+                i__2 = n - i__;
+                i__3 = i__ - 1;
+                cpu_blas_gemv(CblasNoTrans, i__2, i__3, cblas_scalar(&c_neg_one),
+                              &y[i__ + 1 + y_dim1], ldy, &y[i__ * y_dim1 + 1], c__1,
+                              cblas_scalar(&c_zero), f, c__1);
+
+                i__2 = m - i__;
+                cpu_blas_gemv(CblasTransParam, i__2, i__, cblas_scalar(&c_one), &x[i__ + 1 + x_dim1],
+                              ldx, &a[i__ + 1 + i__ * a_dim1], c__1, cblas_scalar(&c_zero),
+                              &y[i__ * y_dim1 + 1], c__1);
+
+                // 4. Synch to make sure the result is back ----------------
+                magma_event_sync(event);
+                if (i__3 != 0) {
+                    i__2 = n - i__;
+                    cpu_blas_axpy(i__2, cblas_scalar(&c_one), f, c__1, &y[i__+1+i__*y_dim1], c__1);
+                }
+
+                i__2 = n - i__;
+                cpu_blas_gemv(CblasTransParam, i__, i__2, cblas_scalar(&c_neg_one),
+                              &a[(i__ + 1) * a_dim1 + 1], lda, &y[i__ * y_dim1 + 1],
+                              c__1, cblas_scalar(&c_one), &y[i__ + 1 + i__ * y_dim1], c__1);
+                i__2 = n - i__;
+                cpu_blas_scal(i__2, cblas_scalar(&tauq[i__]), &y[i__ + 1 + i__ * y_dim1], c__1);
+            }
+            else {
+                if (is_cplx) {
+                    i__2 = n - i__ + 1;
+                    cpu_lapack_lacgv(i__2, &a[i__ + i__ * a_dim1], lda);
+                    magma_setvector<Ty>(i__2,
+                                        a + i__   + (i__ )* a_dim1, lda,
+                                        da, da_offset + (i__-1)+ (i__-1)*(ldda), ldda,
+                                        queue);
+                }
+            }
+        }
+    }
+
+    magma_queue_sync(queue);
+    magma_free_cpu(f);
+
+    return MAGMA_SUCCESS;
+}
+
+#define INSTANTIATE(Ty)                                 \
+    template  magma_int_t                               \
+    magma_labrd_gpu<Ty>(                                \
+        magma_int_t m, magma_int_t n, magma_int_t nb,   \
+        Ty *a, magma_int_t lda,                         \
+        cl_mem da, size_t da_offset, magma_int_t ldda,  \
+        void *_d, void *_e, Ty *tauq, Ty *taup,         \
+        Ty *x, magma_int_t ldx,                         \
+        cl_mem dx, size_t dx_offset, magma_int_t lddx,  \
+        Ty *y, magma_int_t ldy,                         \
+        cl_mem dy, size_t dy_offset, magma_int_t lddy,  \
+        magma_queue_t queue);
+
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(magmaFloatComplex)
+INSTANTIATE(magmaDoubleComplex)
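
labrd_gpu returns the m-by-nb matrix X and the n-by-nb matrix Y that
gebrd_hybrid then feeds to the two clBLAS GEMMs applying the block update
A := A - V*Y' - X*U' to the trailing submatrix. A naive host-side reference
for that update (real types, column-major, sketch only; trailing_update is
a hypothetical name, not part of the commit):

    void trailing_update(int m, int n, int nb,
                         const float *V, int ldv,  // m x nb
                         const float *Y, int ldy,  // n x nb
                         const float *X, int ldx,  // m x nb
                         const float *U, int ldu,  // n x nb
                         float *A, int lda)        // m x n, updated in place
    {
        for (int j = 0; j < n; ++j)
            for (int i = 0; i < m; ++i) {
                float s = 0.f;
                for (int k = 0; k < nb; ++k)
                    s += V[i + k*ldv] * Y[j + k*ldy]   // (V * Y')(i,j)
                       + X[i + k*ldx] * U[j + k*ldu];  // (X * U')(i,j)
                A[i + j*lda] -= s;
            }
    }
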
diff --git a/src/backend/opencl/magma/magma.h b/src/backend/opencl/magma/magma.h
index 5584c20..7797775 100644
--- a/src/backend/opencl/magma/magma.h
+++ b/src/backend/opencl/magma/magma.h
@@ -93,4 +93,25 @@ magma_getrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs,
                 magma_queue_t queue,
                 magma_int_t *info);
 
+template<typename Ty>  magma_int_t
+magma_labrd_gpu(magma_int_t m, magma_int_t n, magma_int_t nb,
+                Ty *a, magma_int_t lda,
+                cl_mem da, size_t da_offset, magma_int_t ldda,
+                void *_d, void *_e, Ty *tauq, Ty *taup,
+                Ty *x, magma_int_t ldx,
+                cl_mem dx, size_t dx_offset, magma_int_t lddx,
+                Ty *y, magma_int_t ldy,
+                cl_mem dy, size_t dy_offset, magma_int_t lddy,
+                magma_queue_t queue);
+
+template<typename Ty> magma_int_t
+magma_gebrd_hybrid(magma_int_t m, magma_int_t n,
+                   Ty *a, magma_int_t lda,
+                   cl_mem da, size_t da_offset, magma_int_t ldda,
+                   void *_d, void *_e,
+                   Ty *tauq, Ty *taup,
+                   Ty *work, magma_int_t lwork,
+                   magma_queue_t queue,
+                   magma_int_t *info, bool copy);
+
 #endif
diff --git a/src/backend/opencl/magma/magma_cpu_lapack.h b/src/backend/opencl/magma/magma_cpu_lapack.h
index d406508..c051306 100644
--- a/src/backend/opencl/magma/magma_cpu_lapack.h
+++ b/src/backend/opencl/magma/magma_cpu_lapack.h
@@ -16,6 +16,8 @@
 #define LAPACKE_dunmqr_work(...) LAPACKE_dormqr_work(__VA_ARGS__)
 #define LAPACKE_sungqr_work(...) LAPACKE_sorgqr_work(__VA_ARGS__)
 #define LAPACKE_dungqr_work(...) LAPACKE_dorgqr_work(__VA_ARGS__)
+#define LAPACKE_sungbr_work(...) LAPACKE_sorgbr_work(__VA_ARGS__)
+#define LAPACKE_dungbr_work(...) LAPACKE_dorgbr_work(__VA_ARGS__)
 
 template<typename... Args>
 int LAPACKE_slacgv(Args... args) { return 0; }
@@ -44,24 +46,42 @@ int LAPACKE_dlacgv(Args... args) { return 0; }
     template<typename T>                        \
     struct cpu_##NAME##_func;
 
-#define CPU_LAPACK_FUNC1(NAME, TYPE, X)                             \
-    template<>                                                      \
-    struct cpu_##NAME##_func<TYPE>                                  \
-    {                                                               \
-        template<typename... Args>                                  \
-            int                                                     \
-            operator() (Args... args)                               \
-        { return LAPACK_NAME(X##NAME)(LAPACK_COL_MAJOR, args...); } \
+#define CPU_LAPACK_FUNC1(NAME, TYPE, X)                                 \
+    template<>                                                          \
+    struct cpu_##NAME##_func<TYPE>                                      \
+    {                                                                   \
+        template<typename... Args>                                      \
+            int                                                         \
+            operator() (Args... args)                                   \
+        {                                                               \
+            int err = LAPACK_NAME(X##NAME)(LAPACK_COL_MAJOR, args...);  \
+            if (err != 0) AF_ERROR("Error in "#NAME, AF_ERR_INTERNAL);  \
+            return err;                                                 \
+        }                                                               \
     };
 
-#define CPU_LAPACK_FUNC2(NAME, TYPE, X)                             \
-    template<>                                                      \
-    struct cpu_##NAME##_func<TYPE>                                  \
-    {                                                               \
-        template<typename... Args>                                  \
-            int                                                     \
-            operator() (Args... args)                               \
-        { return LAPACK_NAME(X##NAME)(args...); }                   \
+#define CPU_LAPACK_FUNC2(NAME, TYPE, X)                                 \
+    template<>                                                          \
+    struct cpu_##NAME##_func<TYPE>                                      \
+    {                                                                   \
+        template<typename... Args>                                      \
+            int                                                         \
+            operator() (Args... args)                                   \
+        {                                                               \
+            int err = LAPACK_NAME(X##NAME)(args...);                    \
+            if (err != 0) AF_ERROR("Error in "#NAME, AF_ERR_INTERNAL);  \
+            return err;                                                 \
+        }                                                               \
+    };
+
+#define CPU_LAPACK_FUNC3(NAME, TYPE, X)             \
+    template<>                                      \
+    struct cpu_##NAME##_func<TYPE>                  \
+    {                                               \
+        template<typename... Args>                  \
+            double                                  \
+            operator() (Args... args)               \
+        { return LAPACK_NAME(X##NAME)(args...); }   \
     };
 
 #define CPU_LAPACK_DECL1(NAME)                          \
@@ -78,17 +98,28 @@ int LAPACKE_dlacgv(Args... args) { return 0; }
     CPU_LAPACK_FUNC2(NAME, magmaFloatComplex,     c)    \
     CPU_LAPACK_FUNC2(NAME, magmaDoubleComplex,    z)    \
 
+#define CPU_LAPACK_DECL3(NAME)                  \
+    CPU_LAPACK_FUNC_DEF(NAME)                   \
+    CPU_LAPACK_FUNC3(NAME, float,      s)       \
+    CPU_LAPACK_FUNC3(NAME, double,     d)       \
+
 CPU_LAPACK_DECL1(getrf)
 CPU_LAPACK_DECL1(gebrd)
+CPU_LAPACK_DECL1(gebrd_work)
 CPU_LAPACK_DECL1(potrf)
 CPU_LAPACK_DECL1(trtri)
 CPU_LAPACK_DECL1(geqrf_work)
 CPU_LAPACK_DECL1(larft)
 CPU_LAPACK_DECL1(unmqr_work)
 CPU_LAPACK_DECL1(ungqr_work)
+CPU_LAPACK_DECL1(ungbr_work)
+CPU_LAPACK_DECL1(bdsqr_work)
 CPU_LAPACK_DECL1(laswp)
 CPU_LAPACK_DECL1(laset)
+
 CPU_LAPACK_DECL2(lacgv)
 CPU_LAPACK_DECL2(larfg)
+CPU_LAPACK_DECL1(lacpy)
+CPU_LAPACK_DECL3(lamch)
 
 #endif
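
The functor wrappers above give every precision a uniform call syntax. A
hypothetical fragment showing the dispatch (larfg_demo is a made-up name;
this assumes, as the header suggests, that LAPACK_NAME(slarfg) resolves to
the LAPACKE slarfg work-array entry point):

    // Generate one Householder reflector for the 5-vector (alpha, v):
    // on return, alpha holds beta and tau the reflector scalar.
    void larfg_demo()
    {
        cpu_larfg_func<float> larfg;
        float alpha = 2.0f, tau = 0.0f;
        float v[4] = {1.f, 1.f, 1.f, 1.f};
        larfg(5, &alpha, v, 1, &tau);
    }
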
diff --git a/src/backend/opencl/magma/magma_helper.cpp b/src/backend/opencl/magma/magma_helper.cpp
index a3a3d9e..584a412 100644
--- a/src/backend/opencl/magma/magma_helper.cpp
+++ b/src/backend/opencl/magma/magma_helper.cpp
@@ -159,14 +159,12 @@ magma_int_t magma_get_geqrf_nb<magmaDoubleComplex>( magma_int_t m )
     else                return 128;
 }
 
-template<typename T> magma_int_t magma_get_gebrd_nb(int num) { return 256; }
-
 template<typename T> T magma_make(double r, double i) { return (T) r; }
 template float magma_make<float>(double r, double i);
 template double magma_make<double>(double r, double i);
 template<> magmaFloatComplex magma_make<magmaFloatComplex>(double r, double i)
 {
-    magmaFloatComplex tmp = {r, i};
+    magmaFloatComplex tmp = {(float)r, (float)i};
     return tmp;
 }
 template<> magmaDoubleComplex magma_make<magmaDoubleComplex>(double r, double i)
diff --git a/src/backend/opencl/magma/magma_helper.h b/src/backend/opencl/magma/magma_helper.h
index a63c8d1..6e81cb5 100644
--- a/src/backend/opencl/magma/magma_helper.h
+++ b/src/backend/opencl/magma/magma_helper.h
@@ -22,6 +22,6 @@ template<typename T> bool magma_is_real();
 template<typename T> magma_int_t magma_get_getrf_nb(int num);
 template<typename T> magma_int_t magma_get_potrf_nb(int num);
 template<typename T> magma_int_t magma_get_geqrf_nb(int num);
-template<typename T> magma_int_t magma_get_gebrd_nb(int num);
+template<typename T> magma_int_t magma_get_gebrd_nb(int num) { return 256; }
 
 #endif
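
The magma_make change in magma_helper.cpp adds explicit casts because brace
initialization rejects implicit double-to-float narrowing in C++11. A
standalone illustration (cfloat and make_cfloat are made-up names, not
ArrayFire types):

    struct cfloat { float x, y; };

    cfloat make_cfloat(double r, double i)
    {
        // cfloat tmp = {r, i};             // ill-formed: narrowing
        cfloat tmp = {(float)r, (float)i};  // explicit casts, as in the diff
        return tmp;
    }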

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git