[clblas] 51/61: add codepath for dtrsm when M and N are mod192

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Fri Jul 24 22:49:48 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clblas.

commit f7c65361706cacfc89b051dbd7533df5a937892a
Author: Timmy <timmy.liu at amd.com>
Date:   Wed Jun 24 11:17:12 2015 -0500

    add codepath for dtrsm when M and N are mod192
---
 src/library/CMakeLists.txt                        |    5 +-
 src/library/bingen.cmake                          |    1 +
 src/library/blas/functor/gpu_dtrsm192.cc          |  596 ++++++++++++
 src/library/blas/functor/hawaii.cc                |   20 +
 src/library/blas/functor/include/gpu_dtrsm192.h   |   28 +
 src/library/blas/gens/clTemplates/dtrsm_gpu192.cl | 1031 +++++++++++++++++++++
 6 files changed, 1680 insertions(+), 1 deletion(-)

diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index e45a2aa..bcc482b 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -69,6 +69,7 @@ set(SRC_BLAS
 	blas/functor/bonaire.cc
     blas/functor/gcn_dgemm.cc
     blas/functor/gpu_dtrsm.cc
+    blas/functor/gpu_dtrsm192.cc
     blas/functor/functor_fill.cc
 	blas/functor/hawaii_dgemmChannelConflict.cc
 	blas/functor/hawaii_dgemmSplitKernel.cc
@@ -101,6 +102,7 @@ set(SRC_BLAS_HEADERS
 	blas/functor/include/bonaire.h
     blas/functor/include/gcn_dgemm.h
     blas/functor/include/gpu_dtrsm.h
+    blas/functor/include/gpu_dtrsm192.h
     blas/functor/include/BinaryBuild.h
 	blas/functor/include/hawaii_dgemmChannelConflict.h
 	blas/functor/include/hawaii_dgemmSplitKernel.h
@@ -229,7 +231,7 @@ set (SRC_CL_TEMPLATES
 	dgemm_hawaiiSplitKernel.cl
 	sgemm_hawaiiSplitKernel.cl
     dtrsm_gpu.cl
-
+    dtrsm_gpu192.cl
 	dgemm_gcn_SmallMatrices.cl
 	sgemm_gcn_SmallMatrices.cl
 	sgemm_gcn.cl
@@ -239,6 +241,7 @@ set (SRC_CL_TEMPLATES
 set(SRC_CL_TEMPLATES_GEN
     dgemm_hawai.clHawaii_64.bin.cl
     dtrsm_gpu.clHawaii_64.bin.cl
+	dtrsm_gpu192.clHawaii_64.bin.cl
 	dgemm_hawaiiChannelConfilct.clHawaii_64.bin.cl
 	dgemm_hawaiiSplitKernel.clHawaii_64.bin.cl
 	sgemm_hawaiiSplitKernel.clHawaii_64.bin.cl	
diff --git a/src/library/bingen.cmake b/src/library/bingen.cmake
index 7b13557..4511f45 100644
--- a/src/library/bingen.cmake
+++ b/src/library/bingen.cmake
@@ -16,6 +16,7 @@ ${CLTEMPLATE_PATH}/sgemm_gcn.cl
 ${CLTEMPLATE_PATH}/zgemm_gcn.cl
 ${CLTEMPLATE_PATH}/sgemm_gcn_SmallMatrices.cl
 ${CLTEMPLATE_PATH}/sgemm_hawaiiSplit64_32.cl
+${CLTEMPLATE_PATH}/dtrsm_gpu192.cl
 )
 
 
diff --git a/src/library/blas/functor/gpu_dtrsm192.cc b/src/library/blas/functor/gpu_dtrsm192.cc
new file mode 100644
index 0000000..f7a0006
--- /dev/null
+++ b/src/library/blas/functor/gpu_dtrsm192.cc
@@ -0,0 +1,596 @@
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+#include "functor.h"
+#include "binary_lookup.h"
+#include <iostream>
+
+#include "functor_xtrsm.h"
+#include "gpu_dtrsm192.h"
+//#include "tahiti.h"
+#include "BinaryBuild.h"
+
+#if BUILD_KERNEL_FROM_STRING
+#include "dtrsm_gpu192.clT"
+#else
+
+#include "dtrsm_gpu192.clHawaii_64.bin.clT"
+
+#endif
+
+
+// Set VERB to 1 to enable additional debug printing.
+#define VERB 0
+
+
+//TODO
+//clReleaseKernel(kernel) ;
+
+#define BLOCK_SIZE 12 // inner blocking size, <=32
+#define NB 192        // outer blocking size, >BLOCK_SIZE
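+// The inversion of the NB-wide diagonal blocks proceeds by doubling:
+// 12x12 blocks are inverted directly, then combined 12 -> 24 -> 48 -> 96 -> 192
+// by the TRIPLE_DGEMM_UPDATE kernels.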
+
+
+//
+// The static cache used to store all instances of clblasDtrsm192FunctorGpu
+//
+typedef clblasFunctorCache<clblasDtrsm192FunctorGpu, bool> Cache ;
+static Cache cache  ;
+
+
+clblasDtrsm192FunctorGpu::clblasDtrsm192FunctorGpu(Args & args,  cl_int & err, const char* DevName, cl_uint _64BitsUse)  :
+  m_program(0)
+{
+  
+  cl_device_id device;
+  cl_context context;
+
+  cl_command_queue queue = args.queue;
+  err = getDeviceAndContext(queue, device, context);
+
+
+
+
+  if( err != CL_SUCCESS )
+  {
+      return;
+  }
+
+  if (VERB) printf(" ===> GET KERNEL %s\n", "clblasDtrsm192FunctorGpu") ;
+
+  BinaryLookup bl(context, device, "clblasDtrsm192FunctorGpu");
+
+  if ( !bl.found() ) // may create empty file or may wait until file is ready  
+  {
+    // directly build from a char* 
+#if BUILD_KERNEL_FROM_STRING
+    err = bl.buildFromSource(dtrsm_gpu192_kernels);
+#else 
+    if(!strcmp(DevName, "Tahiti"))
+    {
+    }
+
+    else if(!strcmp(DevName, "Hawaii"))
+    {
+#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL
+      if(_64BitsUse==64)
+        err = bl.buildFromBinary(dtrsm_gpu192_kernels_64_bin_Hawaii, sizeof(dtrsm_gpu192_kernels_64_bin_Hawaii), NULL);
+      else
+      {
+        std::cout<<"clBLAS is not supported on 32-bit builds"<< std::endl;
+        assert(0); // hard failure: no 32-bit binary kernels exist
+      }
+#endif
+    }
+
+#endif
+ 
+    if ( err != CL_SUCCESS )
+    {  
+      if (VERB) printf(" ===> BUILD PROBLEM\n") ;
+
+      return;
+    }
+  }
+
+  this->m_program = bl.getProgram();
+}
+
+
+
+
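+// Run one triple-update kernel; on error, return it from the enclosing function.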
+#define CALL_KERNEL_TRIPLE_UPDATE(kernel_name, prg, queue, A, offA, d_dinvA, i, lda, M, event)   \
+do{                                                                                              \
+  err = call_kernel_triple_update192(kernel_name, prg, queue, A, offA, d_dinvA, i, lda, M, event);  \
+  if(err != CL_SUCCESS) {                                                                        \
+    return err;                                                                                  \
+  }                                                                                              \
+} while(0)
+
+
+
+cl_int call_kernel_triple_update192(const char* kernel_name, 
+                                 const cl_program prg,
+                                 const cl_command_queue queue,
+                                 cl_mem A,
+                                 unsigned int offA,
+                                 cl_mem d_dinvA,
+                                 int i,
+                                 unsigned int lda,
+                                 int M, 
+                                 cl_event *event)
+{
+  cl_int err = 0;
+
+  unsigned int m = M; 
+
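+  // npages = ceil(M / (2*i)): the number of 2i-wide diagonal "pages" combined in this pass.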
+  int npages = M/(i*2)+(M%(i*2)!=0);
+
+
+  size_t globalLocal  [2];
+  size_t globalThreads[2];
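+  // Each doubling step tiles every page with work-groups: i=12 uses a single
+  // 12x1 group per page, i=24 a single 24x2 group, while i=48 and i=96 split
+  // each gemm across 2x2 and 4x4 groups of size 24x2.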
+  switch (i)
+  {
+  case 12:
+	  globalLocal[0] = 12;
+	  globalLocal[1] = 1;
+	  globalThreads[0] = npages * 12;
+	  globalThreads[1] = 1;
+	  break;
+  case 24:
+	  globalLocal[0] = 24;
+	  globalLocal[1] = 2;
+	  globalThreads[0] = npages * 24;
+	  globalThreads[1] = 2;
+	  break;
+  case 48:
+	  globalLocal[0] = 24;
+	  globalLocal[1] = 2;
+	  globalThreads[0] = npages * 48;
+	  globalThreads[1] = 4;
+	  break;
+  case 96:
+	  globalLocal[0] = 24;
+	  globalLocal[1] = 2;
+	  globalThreads[0] = npages * 96;
+	  globalThreads[1] = 8;
+	  break;
+  default:
+	  break;
+  }
+
+  cl_kernel kernel = clCreateKernel(prg, kernel_name, &err);
+  if (err != CL_SUCCESS) {
+    //printf( "create kernel %s failed with %d\n", kernel_name, err );
+    return err;
+  }
+
+  clSetKernelArg(kernel, 0, sizeof(cl_mem), &A);
+  clSetKernelArg(kernel, 1, sizeof(unsigned int), &offA);
+  clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_dinvA);
+  clSetKernelArg(kernel, 3, sizeof(int), &i);
+  clSetKernelArg(kernel, 4, sizeof(unsigned int), &lda);
+  clSetKernelArg(kernel, 5, sizeof(int), &npages);
+  clSetKernelArg(kernel, 6, sizeof(unsigned int), &m);
+
+  err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
+			       globalThreads, globalLocal , 
+			       0, NULL, event);
+  
+ 
+  if (err != CL_SUCCESS) {
+    clReleaseKernel(kernel);
+    //printf( "execution of kernel %s failed with %d\n", kernel_name, err );
+    return err;
+  }
+
+  err = clReleaseKernel(kernel);
+  return err;
+
+}
+
+
+
+
+//extern "C"
+cl_int diag_dtrtri192 (cl_program prg,
+                    cl_command_queue queue,		  
+                    int M, 
+                    clblasUplo uplo, 
+                    clblasDiag diag, 
+                    cl_mem A, 
+                    size_t offA,
+                    cl_mem d_dinvA, 
+                    size_t lda,
+                    cl_event *event )
+{
+
+  cl_int err = 0;
+
+  /*
+    This routine is used in dtrsm
+  */
+
+  //For side==right, M is actually N here
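+  // Round M up to the next multiple of BLOCK_SIZE: one work-group of BLOCK_SIZE threads per diagonal block.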
+  int nthreads = (M/BLOCK_SIZE + (M % BLOCK_SIZE != 0)) * BLOCK_SIZE;
+  unsigned int m = M;
+ 
+  if (uplo == clblasLower) {
+
+	  //lower is not supported yet
+  }
+  else {
+
+    cl_kernel diag_dtrtri_kernel_upper = clCreateKernel(prg, "DIAG_DTRTRI_KERNEL_UPPER", &err);
+    if (err != CL_SUCCESS) {
+      //printf( "create kernel -diag_dtrtri_kernel_upper- failed with %d\n", err );
+      return err;
+    }
+
+    int isDiagUnit = (diag == clblasUnit);
+    clSetKernelArg(diag_dtrtri_kernel_upper, 0, sizeof(int), &isDiagUnit);
+    clSetKernelArg(diag_dtrtri_kernel_upper, 1, sizeof(cl_mem), &A);
+    clSetKernelArg(diag_dtrtri_kernel_upper, 2, sizeof(unsigned int), &offA);
+    clSetKernelArg(diag_dtrtri_kernel_upper, 3, sizeof(cl_mem), &d_dinvA);
+    clSetKernelArg(diag_dtrtri_kernel_upper, 4, sizeof(unsigned int), &lda);
+    clSetKernelArg(diag_dtrtri_kernel_upper, 5, sizeof(unsigned int), &m);
+
+    size_t globalThreads[1] = { (size_t)nthreads };
+    size_t globalLocal  [1] = { BLOCK_SIZE };
+ 
+    err = clEnqueueNDRangeKernel(queue, diag_dtrtri_kernel_upper, 1, NULL,
+				 globalThreads, globalLocal , 
+				 0, NULL, event);
+
+    if (err != CL_SUCCESS) {
+      //printf( "kernel -diag_dtrtri_kernel_upper- failed with %d\n", err );
+      return err;
+    }
+ 
+    err = clReleaseKernel(diag_dtrtri_kernel_upper);
+    if (err != CL_SUCCESS) {
+      return err;
+    }
+
+    // expand the inverted diagonal blocks from BLOCK_SIZE up to NB by doubling
+    for( int i=BLOCK_SIZE; i < NB; i*=2 ) {
+
+      switch (i) {
+      case 12:
+	CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_12_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+	break;
+
+      case 24:
+	CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+	CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+	break;
+
+      case 48:
+	CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_48_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+	CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_48_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+	break;
+
+      case 96:
+	CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_96_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+	CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_96_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+	break;
+
+      default:
+
+	break;
+      }
+      
+      if (i*2 >= M) break;
+    }
+    
+  }
+
+  return err;
+
+}
+
+
+
+
+#define check_error(cmd)                   \
+do{                                        \
+  cl_int xxxerr = cmd ;                    \
+  if (xxxerr != CL_SUCCESS) {              \
+    if(InvA != 0)                          \
+      clReleaseMemObject(InvA);            \
+    if(X != 0)                             \
+      clReleaseMemObject(X);               \
+    return xxxerr;                         \
+  }                                        \
+} while(0)
+
+
+static cl_int clearBuffer192( cl_command_queue  queue ,
+                           cl_mem  buffer ,
+                           size_t  buffer_size )
+{
+  
+  cl_int err = 0;
+  // Note: clEnqueueFillBuffer requires OpenCL 1.2.
+  double zero = 0.0 ; 
+  err = clEnqueueFillBuffer(queue, 
+                            buffer, 
+                            &zero, 
+                            sizeof(double), 
+                            0,  // offset 
+                            buffer_size, 
+                            0,
+                            NULL, 
+                            NULL  // completion event not needed
+                            ) ;
+
+  return err;
+
+}
+
+
+
+#define nb 192        // outer blocking size, >BLOCK_SIZE
+#define min(x,y) ((x)<(y)?(x):(y))
+
+
+
+cl_int cl_dtrsm192( cl_program prg,
+                 cl_command_queue  queue , 
+                 clblasSide side, 
+                 clblasUplo uplo, 
+                 clblasTranspose transA, 
+                 clblasDiag diag, 
+                 int M, 
+                 int N,
+                 double alpha,
+                 cl_mem A, size_t offA, size_t ldA,
+                 cl_mem B, size_t offB, size_t ldB,
+                 cl_event *event
+                 ) 
+{
+  cl_int err = 0;
+
+  int i;
+  cl_context context;
+  err = getQueueContext(queue, &context);
+  if(err != CL_SUCCESS) return err;
+
+  /* quick return on wrong size */
+  if (M <= 0 || N <= 0)
+    return clblasInvalidDim;
+    
+  double neg_one = -1.0 ;
+  double one     =  1.0 ;
+  double zero    =  0.0 ; 
+
+
+
+  // Compute the number of blocks of the specified 'size' to fully cover 'n' 
+  // Simply speaking, this is n/size rounded up.
+#define BLOCKS(n,size) ( ( (n) / (size) ) + ( (n) % (size) != 0 ) )
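+// e.g. BLOCKS(192,192) == 1 and BLOCKS(200,192) == 2.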
+
+#define CLEANUP   
+
+#define END_DGEMM_ARGS 1,&queue,0,NULL,event
+
+  // Helper to pass the 3 arguments describing a (sub)-matrix to clblasDgemm 
+#define _(M,i,j)       M    , (off##M + ((i)+(j)*ld##M) ) , ld##M 
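+  // e.g. _(B,0,i) expands to:  B , (offB + ((0)+(i)*ldB)) , ldB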
+
+  cl_mem InvA = 0;
+  cl_mem X    = 0;
+  // X of size mxn  will contain the result 
+  size_t ldX = M ; 
+  size_t offX = 0; //must be 0: needed by the _(X,i,j) macro
+  size_t size_X = N*ldX * sizeof(double);
+  X = clCreateBuffer(context, CL_MEM_READ_WRITE, size_X, NULL, &err);
+  check_error(err) ;         
+  err = clearBuffer192( queue, X, size_X ) ;
+  check_error(err) ; 
+
+
+  if (side == clblasLeft) 
+    {
+      // side=L
+      /* invert the diagonals
+       * Allocate device memory for the inverted diagonal blocks, size=m*nb
+       */
+      // Not supported yet with 192 block size
+
+    }
+  else 
+    {
+
+      //
+      // Helper for C = alpha * B * A + beta * C        
+      //
+      // In the calls below
+      //  - the 2nd matrix is either A or InvA, transposed according to transA
+      //  - the 1st and 3rd matrices are B or X
+      //
+#define DGEMM_RIGHT(m,n,k, alpha,  B, A, beta, C )   \
+    do { \
+      err = clblasDgemm(clblasColumnMajor, clblasNoTrans, transA , m, n, k, alpha, B, A, beta, C , 1, &queue, 0, NULL, event ) ; \
+      check_error(err) ; \
+    } while(0) 
+
+
+      // side=R
+      /* invert the diagonals
+       * Allocate device memory for the inverted diagonal blocks,
+       * size = nb * nb * ceil(N/nb) doubles
+       */
+      size_t ldInvA = nb ; 
+      size_t offInvA = 0; //must be 0: needed by the _(X,i,j) macro
+      size_t size_InvA = ldInvA * BLOCKS(N,nb) * nb *sizeof(double); 
+      InvA = clCreateBuffer(context, CL_MEM_READ_WRITE, size_InvA, NULL, &err);
+      check_error(err) ;         
+      err = clearBuffer192( queue, InvA, size_InvA ) ;
+      check_error(err) ; 
+
+      err = diag_dtrtri192 (prg, queue, N, uplo, diag, A, offA, InvA, ldA, event);
+      check_error(err) ; 
+
+      if (transA == clblasNoTrans) 
+        {
+          /* the non-transpose case */
+          if (uplo == clblasLower) 
+            {
+              /* the lower case */
+              /* handle the first block separately with alpha */
+              // lower is not implemented yet
+
+
+            }
+          else 
+            {
+              /* the upper case */
+              /* handle the first block separately with alpha */
+              int nn = min(nb, (int)N);
+              //DGEMM_RIGHT(  M, nn, nn, alpha, _(B,0,0), _(InvA,0,0), zero, _(X,0,0) );
+			  err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, nn, nn, alpha, B, offB, ldB, InvA, offInvA, ldInvA, zero, X, offX, ldX, 1, &queue, 0, NULL, event);
+			  check_error(err);
+
+              if (nb < N)
+                {
+                  
+                  //DGEMM_RIGHT(  M, N-nb, nb, neg_one, _(X,0,0), _(A,0,nb), alpha, _(B,0,nb)  );
+				  err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N - nb, nb, neg_one, X, offX, ldX, A, offA+ldA*nb, ldA, alpha, B, offB+nb*ldB, ldB, 1, &queue, 0, NULL, event);
+				  check_error(err);
+                  
+                  /* the rest blocks */
+                  for( i=nb; i < N; i += nb ) 
+				  {
+                    nn = min(nb, (int)N-i);
+                    //DGEMM_RIGHT(  M, nn, nn, one, _(B,0,i), _(InvA,0,i), zero, _(X,0,i) );
+					err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, nn, nn, one, B, offB+i*ldB, ldB, InvA, offInvA+i*nb, ldInvA, zero, X, offX+i*ldX, ldX, 1, &queue, 0, NULL, event);
+					check_error(err);
+                    
+                    if (i+nb >= N)
+                      break;
+                    
+                    //DGEMM_RIGHT(  M, N-i-nb, nb, neg_one, _(X,0,i),   _(A,i,i+nb), one, _(B,0,i+nb)  );
+					err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N - i - nb, nb, neg_one, X, offX+i*ldX, ldX, A, offA + i + (nb + i)*ldA, ldA, one, B, offB + (i + nb)*ldB, ldB, 1, &queue, 0, NULL, event);
+					check_error(err);
+                  }
+                }
+            }
+        }
+      else 
+        {
+
+          /* the transpose case */
+          // trans is not implemented yet
+        }
+      
+    }
+
+  // Copy X(m,n) to B(m,n)
+  {     
+    size_t src_origin[3] = { 0, 0, 0 } ;
+    size_t dst_origin[3] = { offB*sizeof(double), 0, 0 } ;
+    size_t region[3]     = { M*sizeof(double), N, 1 } ;
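+    // Note: clEnqueueCopyBufferRect takes the x components of the origins and
+    // the region in bytes, hence the sizeof(double) factors.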
+    
+
+    err =  clEnqueueCopyBufferRect( queue,
+                                    X,
+                                    B,
+                                    src_origin,
+                                    dst_origin,
+                                    region,
+                                    ldX*sizeof(double), 0,  
+                                    ldB*sizeof(double), 0,  
+                                    0, NULL,
+                                    event) ;
+    check_error(err) ;         
+
+    clReleaseMemObject(InvA);
+    clReleaseMemObject(X);
+
+  }
+
+  return err;
+
+}
+
+
+
+clblasStatus clblasDtrsm192FunctorGpu::execute(Args &args) 
+{
+  cl_int err;
+  cl_command_queue queue = args.queue;
+
+  if (VERB) printf(" ===> EXECUTE KERNEL %s\n", "dtrsm_gpu192") ;
+
+  
+  cl_program prg = this->m_program;
+  
+
+  err = cl_dtrsm192( prg,
+                  queue , 
+                  args.side, 
+                  args.uplo, 
+                  args.transA, 
+                  args.diag, 
+                  args.M, 
+                  args.N,
+                  args.alpha,
+                  args.A, args.offA, args.lda,
+                  args.B, args.offB, args.ldb,
+                  args.events
+                  );
+
+
+
+  if (VERB) printf(" ===> ERR=%d \n",(int)err) ;
+
+  return clblasStatus(err) ;
+}
+
+
+
+clblasDtrsm192FunctorGpu * 
+clblasDtrsm192FunctorGpu::provide(clblasDtrsmFunctor::Args & args , const char* DevName) 
+{
+
+  if ( args.order == clblasRowMajor ) 
+    return NULL ;   // The RowMajor case shall never occur. 
+
+
+  cl_device_id dev;
+  cl_context   ctxt;
+
+  cl_int err = getDeviceAndContext(args.queue, dev, ctxt);
+
+  if (err != CL_SUCCESS)
+  {
+      return NULL;
+  }
+  cl_uint bitness = getAddressBits(dev);
+  Cache::Lookup lookup(cache, ctxt, dev, true) ;
+
+  if ( lookup.ok() ){
+    clblasDtrsm192FunctorGpu * functor = lookup.get();
+    functor->retain(); // increment the reference counter to avoid deletion while it is still being used
+    return functor;
+  }
+ 
+  clblasDtrsm192FunctorGpu * functor = new clblasDtrsm192FunctorGpu(args, err, DevName, bitness);
+  if (err != CL_SUCCESS)
+  {
+      delete functor; // do not leak the partially constructed functor
+      return NULL;
+  }
+
+  lookup.set(functor) ;
+
+  return functor;
+    
+}
+
diff --git a/src/library/blas/functor/hawaii.cc b/src/library/blas/functor/hawaii.cc
index 94a23a5..62f6666 100644
--- a/src/library/blas/functor/hawaii.cc
+++ b/src/library/blas/functor/hawaii.cc
@@ -26,6 +26,7 @@
 #include "hawaii_sgemmBranchKernel.h"
 #include "hawaii_sgemmSplit64_32.h"
 #include "gcn_zgemm.h"
+#include "gpu_dtrsm192.h"
 
 FunctorSelectorHawaii FunctorSelectorHawaii::instance ;
 
@@ -190,6 +191,25 @@ clblasDtrsmFunctor * FunctorSelectorHawaii::select_dtrsm_specific(clblasDtrsmFun
 #else
   clblasDtrsmFunctor * functor;
   
+  
+  if ((args.M % 192 == 0) && (args.N % 192 == 0))
+  {
+	  //TODO: the implementation with a 192 sub-block only supports 
+	  //side == right
+	  //uplo == upper
+	  //trans == notrans
+	  //M and N need to be multiples of 192
+	  //a 192 sub-block is preferred over 128 on Hawaii devices since
+	  //it does not create a "boundary" in the DGEMM calls;
+	  //Hawaii DGEMM calls perform better when M, N and K are multiples of 48
+	  if ((args.side == clblasRight) && (args.uplo == clblasUpper) && (args.transA == clblasNoTrans))
+	  {
+		  functor = clblasDtrsm192FunctorGpu::provide(args, "Hawaii");
+		  if (functor)
+			  return functor;
+	  }
+  }
+  //sub block is 128 here
   functor = clblasDtrsmFunctorGpu::provide(args, "Hawaii");
   if (functor) 
     return functor;
diff --git a/src/library/blas/functor/include/gpu_dtrsm192.h b/src/library/blas/functor/include/gpu_dtrsm192.h
new file mode 100644
index 0000000..aeda3a8
--- /dev/null
+++ b/src/library/blas/functor/include/gpu_dtrsm192.h
@@ -0,0 +1,28 @@
+#ifndef _CLBLAS_DTRSM192_FUNCTOR_GPU_H_
+#define _CLBLAS_DTRSM192_FUNCTOR_GPU_H_
+
+class clblasDtrsm192FunctorGpu : public clblasDtrsmFunctor 
+{
+private:  // Constructor & Destructor
+
+  clblasDtrsm192FunctorGpu(Args & args, cl_int & err, const char* DevName,  cl_uint _64BitsUse) ;
+  
+public:
+
+  // Provide a suitable clblasDtrsm192FunctorGpu for the specified args
+  // or NULL if none 
+  static clblasDtrsm192FunctorGpu * provide(clblasDtrsmFunctor::Args & args, const char* DevName) ;
+
+public: // inherited member from clblasDtrsmFunctor
+
+  virtual clblasStatus execute(Args &args) ;
+
+private:
+
+  cl_program      m_program ;  
+} ; 
+
+#endif
diff --git a/src/library/blas/gens/clTemplates/dtrsm_gpu192.cl b/src/library/blas/gens/clTemplates/dtrsm_gpu192.cl
new file mode 100644
index 0000000..551f64b
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/dtrsm_gpu192.cl
@@ -0,0 +1,1031 @@
+static const char * dtrsm_gpu192_kernels = "
+
+//only supports upper triangle matrix for now
+
+#define BLOCK_SIZE 12 // inner blocking size, <=32
+#define NB 192        // outer blocking size, >BLOCK_SIZE
+
+#define ZERO              ( 0.0)
+#define ONE               ( 1.0)
+
+#ifdef DOUBLE_PRECISION
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#else
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#endif
+#endif
+
+__kernel void DIAG_DTRTRI_KERNEL_UPPER(int  isDiagUnit, 
+				       __global double const * restrict A,
+                                       uint offA,
+				       __global double *d_dinvA,
+				       uint lda,
+                                       uint na)
+{
+
+  int i, j;
+  double Ystx = 0;
+  __local double *y = 0;
+  double switcher;
+  double neg_switcher;
+
+  // Thread index
+  int tx = get_local_id(0);
+
+  // Thread index
+  int gx = get_global_id(0);
+
+  // Block index
+  int bx = get_group_id(0);
+
+  A = A + offA;
+
+  __global const double *Aoff = A + bx*lda*BLOCK_SIZE + bx*BLOCK_SIZE;
+  int NumBLperNB = NB/BLOCK_SIZE;
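+  // Each NB*NB page of d_dinvA stores NB/BLOCK_SIZE inverted 12x12 blocks along
+  // its diagonal; bx/NumBLperNB selects the page and bx%NumBLperNB the diagonal
+  // slot (one slot step = BLOCK_SIZE columns plus BLOCK_SIZE rows).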
+  d_dinvA += bx/NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE);
+
+  __local double Bs[BLOCK_SIZE*BLOCK_SIZE];
+  __local double workspace[BLOCK_SIZE];    // workspace used to store the current working column
+
+  // load A
+  #pragma unroll
+  for( i=0; i < BLOCK_SIZE; i++ )
+    {
+      if(tx <= i && i+bx*BLOCK_SIZE < na )
+        {
+	  Bs[i*BLOCK_SIZE+tx] = *(Aoff+i*lda+tx);    
+        }
+      else
+        {
+	  Bs[i*BLOCK_SIZE+tx] = ZERO; 
+        }
+    } 
+  // read in the whole square block of my A and zero out the non data triangular
+ 
+  // Synchronize to make sure the matrices are loaded
+  //__syncthreads(); 
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // solve the diagonals
+
+  if(isDiagUnit == 1)
+    {
+      Bs[tx*BLOCK_SIZE+tx] = ONE;
+    }
+  else
+    {
+      if( Bs[tx*BLOCK_SIZE+tx] == ZERO )
+	{
+	  Bs[tx*BLOCK_SIZE+tx] = ONE;  
+	}
+      else
+	{
+	  Bs[tx*BLOCK_SIZE+tx] = ONE / ( Bs[tx*BLOCK_SIZE+tx]) ;
+	}      
+    }
+
+
+  /* the upper case */
+  for( i=0; i < BLOCK_SIZE; i++ ) {
+    Ystx =  ZERO;
+    if( tx < i)
+      {
+	switcher = ONE;
+      }
+    else
+      {
+	switcher = ZERO;
+      }
+
+    //dtrmv
+    workspace[tx] = *(Bs+i*BLOCK_SIZE+tx);
+    y = Bs+i*BLOCK_SIZE;
+
+    #pragma unroll
+    //for( j=tx; j < i; j++ )
+    for( j=0; j < i; j++ )
+      Ystx += switcher * (*(Bs+j*BLOCK_SIZE+tx)*workspace[j]);
+
+    //sscal
+    // if (tx != i) y[tx]=switcher*Ystx*(-Bs[i*BLOCK_SIZE+i]);
+
+    if( tx != i)
+      {
+	switcher = ONE;
+	neg_switcher =  ZERO;
+      }
+    else
+      {
+	switcher = ZERO;
+	neg_switcher =  ONE;
+      }
+
+    y[tx] = switcher *Ystx*(-Bs[i*BLOCK_SIZE+i])+neg_switcher*y[tx];
+
+    // __syncthreads();
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  // write back A
+#pragma unroll
+  for( i=0; i < BLOCK_SIZE; i++ )
+    *(d_dinvA+i*NB+tx) = Bs[i*BLOCK_SIZE+tx];
+
+}
+
+/*
+ * B21 = -inv(A11)*A12*inv(A22)
+ * 12 to 24
+ */
+__kernel void 
+TRIPLE_DGEMM_UPDATE_12_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)
+{
+	// Ain is the non-inverted matrix; the size of Ain is lda * na
+	// offAin is the offset of Ain
+	// d_dinvA holds the inverted diagonal blocks; its size is NB * NB * ((na-1)/NB + 1)
+	// blk is the subblock size, which is 12 here.
+	// lda is the leading dimension. Column major here
+	// npages = ceil(na/(2*blk)); for na=96 this is 4, for na=192 it is 8
+
+	//Work group size is [12]
+	//global work size is [12 * npages]
+	//each work item in each work group is responsible for every element in that row
+	//each work group is responsible for one gemm
+
+
+	////////////// A12*invA22
+    const uint gidx = get_group_id(0);
+    const uint idx = get_local_id(0);
+
+	const uint page = gidx % npages;
+	const uint page_block = page/8;//8 pages per page block
+	const uint page_index_in_block = page%8;
+	
+	
+	__global double *B, *C;
+    __local double lA[12][12];
+    __local double lB[12][12];
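+	// per-work-item accumulator: the 12 outputs of this row stay in private registers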
+	double privateC[12] = {(double)0};
+	
+	//decide A12 location for each page
+    Ain = Ain + offAin;
+	Ain += (page*blk*2 + blk) * lda + page * 2 * blk;
+	
+	//decide invA22 (B) location for each page
+	B = d_dinvA + page_block*NB*NB + (page_index_in_block*blk*2 + blk) * NB + page_index_in_block * 2 * blk + blk;
+	
+	//decide invA12 location for each page
+	C = d_dinvA + page_block*NB*NB + (page_index_in_block*blk*2 + blk) * NB + page_index_in_block * 2 * blk;
+	
+	//read A and B into LDS; no transpose is performed here
+	lA[idx][0] = Ain[idx];
+	lA[idx][1] = Ain[idx + lda];
+	lA[idx][2] = Ain[idx + lda*2];
+	lA[idx][3] = Ain[idx + lda*3];
+	lA[idx][4] = Ain[idx + lda*4];
+	lA[idx][5] = Ain[idx + lda*5];
+	lA[idx][6] = Ain[idx + lda*6];
+	lA[idx][7] = Ain[idx + lda*7];
+	lA[idx][8] = Ain[idx + lda*8];
+	lA[idx][9] = Ain[idx + lda*9];
+	lA[idx][10] = Ain[idx + lda*10];
+	lA[idx][11] = Ain[idx + lda*11];
+	
+	lB[idx][0] = B[idx];
+	lB[idx][1] = B[idx + NB];
+	lB[idx][2] = B[idx + NB*2];
+	lB[idx][3] = B[idx + NB*3];
+	lB[idx][4] = B[idx + NB*4];
+	lB[idx][5] = B[idx + NB*5];
+	lB[idx][6] = B[idx + NB*6];
+	lB[idx][7] = B[idx + NB*7];
+	lB[idx][8] = B[idx + NB*8];
+	lB[idx][9] = B[idx + NB*9];
+	lB[idx][10] = B[idx + NB*10];
+	lB[idx][11] = B[idx + NB*11];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	//do math
+	
+	uint i = 0;
+	
+	do{
+	    privateC[0]=mad(lA[idx][i],lB[i][0],privateC[0]); 
+	    privateC[1]=mad(lA[idx][i],lB[i][1],privateC[1]); 
+	    privateC[2]=mad(lA[idx][i],lB[i][2],privateC[2]); 
+	    privateC[3]=mad(lA[idx][i],lB[i][3],privateC[3]); 
+	    privateC[4]=mad(lA[idx][i],lB[i][4],privateC[4]); 
+	    privateC[5]=mad(lA[idx][i],lB[i][5],privateC[5]); 
+	    privateC[6]=mad(lA[idx][i],lB[i][6],privateC[6]); 
+	    privateC[7]=mad(lA[idx][i],lB[i][7],privateC[7]); 
+	    privateC[8]=mad(lA[idx][i],lB[i][8],privateC[8]); 
+	    privateC[9]=mad(lA[idx][i],lB[i][9],privateC[9]); 
+	    privateC[10]=mad(lA[idx][i],lB[i][10],privateC[10]); 
+	    privateC[11]=mad(lA[idx][i],lB[i][11],privateC[11]); 
+		//mem_fence(CLK_LOCAL_MEM_FENCE);
+	    i = i + 1;
+	}while(i < 12);
+	
+	i = 0;
+	do{
+        C[NB*i+idx] = privateC[i];
+	    i = i + 1;
+	}while(i < 12);
+	
+    ////////////// -invA11*invA12
+	barrier(CLK_GLOBAL_MEM_FENCE);
+	//A is moving to invA11
+	__global double *A;
+	A = d_dinvA + page_block*NB*NB + ( page_index_in_block*blk*2 ) * NB + page_index_in_block * 2 * blk;
+	//both B and C are pointing at invA12
+	B = C;
+	
+	//read A and B into LDS; no transpose is performed here
+	lA[idx][0] = A[idx];
+	lA[idx][1] = A[idx + NB];
+	lA[idx][2] = A[idx + NB*2];
+	lA[idx][3] = A[idx + NB*3];
+	lA[idx][4] = A[idx + NB*4];
+	lA[idx][5] = A[idx + NB*5];
+	lA[idx][6] = A[idx + NB*6];
+	lA[idx][7] = A[idx + NB*7];
+	lA[idx][8] = A[idx + NB*8];
+	lA[idx][9] = A[idx + NB*9];
+	lA[idx][10] = A[idx + NB*10];
+	lA[idx][11] = A[idx + NB*11];
+	
+	lB[idx][0] = B[idx];
+	lB[idx][1] = B[idx + NB];
+	lB[idx][2] = B[idx + NB*2];
+	lB[idx][3] = B[idx + NB*3];
+	lB[idx][4] = B[idx + NB*4];
+	lB[idx][5] = B[idx + NB*5];
+	lB[idx][6] = B[idx + NB*6];
+	lB[idx][7] = B[idx + NB*7];
+	lB[idx][8] = B[idx + NB*8];
+	lB[idx][9] = B[idx + NB*9];
+	lB[idx][10] = B[idx + NB*10];
+	lB[idx][11] = B[idx + NB*11];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+    //do math
+	
+	i = 0;
+	privateC[0] = 0;	
+	privateC[1] = 0;
+	privateC[2] = 0;
+	privateC[3] = 0;
+	privateC[4] = 0;
+	privateC[5] = 0;
+	privateC[6] = 0;
+	privateC[7] = 0;
+	privateC[8] = 0;
+	privateC[9] = 0;
+	privateC[10] = 0;
+	privateC[11] = 0;
+	do{
+	    privateC[0]=mad(lA[idx][i],lB[i][0],privateC[0]); 
+	    privateC[1]=mad(lA[idx][i],lB[i][1],privateC[1]); 
+	    privateC[2]=mad(lA[idx][i],lB[i][2],privateC[2]); 
+	    privateC[3]=mad(lA[idx][i],lB[i][3],privateC[3]); 
+	    privateC[4]=mad(lA[idx][i],lB[i][4],privateC[4]); 
+	    privateC[5]=mad(lA[idx][i],lB[i][5],privateC[5]); 
+	    privateC[6]=mad(lA[idx][i],lB[i][6],privateC[6]); 
+	    privateC[7]=mad(lA[idx][i],lB[i][7],privateC[7]); 
+	    privateC[8]=mad(lA[idx][i],lB[i][8],privateC[8]); 
+	    privateC[9]=mad(lA[idx][i],lB[i][9],privateC[9]); 
+	    privateC[10]=mad(lA[idx][i],lB[i][10],privateC[10]); 
+	    privateC[11]=mad(lA[idx][i],lB[i][11],privateC[11]); 
+		//mem_fence(CLK_LOCAL_MEM_FENCE);
+	    i = i + 1;
+	}while(i < 12);
+
+    i = 0;
+	do{
+        C[NB*i+idx] = -1*privateC[i];
+	    i = i + 1;
+	}while(i < 12);
+	
+}
+
+__kernel void 
+TRIPLE_DGEMM_UPDATE_24_PART1_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)
+{
+	// Ain is the non-inverted matrix; the size of Ain is lda * na
+	// offAin is the offset of Ain
+	// d_dinvA holds the inverted diagonal blocks; its size is NB * NB * ((na-1)/NB + 1)
+	// blk is the subblock size, which is 24 here.
+	// lda is the leading dimension. Column major here
+	// npages = ceil(na/(2*blk)); for na=96 this is 2, for na=192 it is 4
+
+	//Work group size is [24, 2]
+	//global work size is [24 * npages, 2]
+	//each work item in each work group is responsible for 12 elements (half) in that row
+	//each work group is responsible for one gemm;
+
+
+	////////////// A12*invA22
+    const uint gidx = get_group_id(0);
+	const uint gidy = get_group_id(1);
+    const uint idx = get_local_id(0);
+	const uint idy = get_local_id(1);
+
+	const uint page = gidx % npages;//0-3 for 192; 0-1 for 96
+	const uint page_block = page/4;//4 pages per page block
+
+	
+	
+	__global double *B, *C;
+    __local double lA[24][24];
+    __local double lB[24][24];
+	double privateC[12] = {(double)0};
+	
+	//decide A12 location for each page
+    Ain = Ain + offAin;
+	Ain += (page*blk*2 + blk) * lda + page * 2 * blk;
+	
+	//decide invA22 (B) location for each page
+	B = d_dinvA + page_block*NB*NB + ((page%4)*blk*2 + blk) * NB + (page%4) * 2 * blk + blk;
+	
+	//decide invA12 location for each page
+	C = d_dinvA + page_block*NB*NB + ((page%4)*blk*2 + blk) * NB + (page%4) * 2 * blk;
+	
+	//read A and B into LDS; no transpose is performed here
+	//each work item loads half a row
+	lA[idx][0+idy*12] = Ain[idx + idy*12*lda];
+	lA[idx][1+idy*12] = Ain[idx + lda + idy*12*lda];
+	lA[idx][2+idy*12] = Ain[idx + lda*2 + idy*12*lda];
+	lA[idx][3+idy*12] = Ain[idx + lda*3 + idy*12*lda];
+	lA[idx][4+idy*12] = Ain[idx + lda*4 + idy*12*lda];
+	lA[idx][5+idy*12] = Ain[idx + lda*5 + idy*12*lda];
+	lA[idx][6+idy*12] = Ain[idx + lda*6 + idy*12*lda];
+	lA[idx][7+idy*12] = Ain[idx + lda*7 + idy*12*lda];
+	lA[idx][8+idy*12] = Ain[idx + lda*8 + idy*12*lda];
+	lA[idx][9+idy*12] = Ain[idx + lda*9 + idy*12*lda];
+	lA[idx][10+idy*12] = Ain[idx + lda*10 + idy*12*lda];
+	lA[idx][11+idy*12] = Ain[idx + lda*11 + idy*12*lda];
+	
+	lB[idx][0+idy*12] = B[idx + idy*12*NB];
+	lB[idx][1+idy*12] = B[idx + NB + idy*12*NB];
+	lB[idx][2+idy*12] = B[idx + NB*2 + idy*12*NB];
+	lB[idx][3+idy*12] = B[idx + NB*3 + idy*12*NB];
+	lB[idx][4+idy*12] = B[idx + NB*4 + idy*12*NB];
+	lB[idx][5+idy*12] = B[idx + NB*5 + idy*12*NB];
+	lB[idx][6+idy*12] = B[idx + NB*6 + idy*12*NB];
+	lB[idx][7+idy*12] = B[idx + NB*7 + idy*12*NB];
+	lB[idx][8+idy*12] = B[idx + NB*8 + idy*12*NB];
+	lB[idx][9+idy*12] = B[idx + NB*9 + idy*12*NB];
+	lB[idx][10+idy*12] = B[idx + NB*10 + idy*12*NB];
+	lB[idx][11+idy*12] = B[idx + NB*11 + idy*12*NB];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	//do math
+	
+	uint i = 0;
+	
+	do{
+	    privateC[0]=mad(lA[idx][i],lB[i][0+idy*12],privateC[0]); 
+	    privateC[1]=mad(lA[idx][i],lB[i][1+idy*12],privateC[1]); 
+	    privateC[2]=mad(lA[idx][i],lB[i][2+idy*12],privateC[2]); 
+	    privateC[3]=mad(lA[idx][i],lB[i][3+idy*12],privateC[3]); 
+	    privateC[4]=mad(lA[idx][i],lB[i][4+idy*12],privateC[4]); 
+	    privateC[5]=mad(lA[idx][i],lB[i][5+idy*12],privateC[5]); 
+	    privateC[6]=mad(lA[idx][i],lB[i][6+idy*12],privateC[6]); 
+	    privateC[7]=mad(lA[idx][i],lB[i][7+idy*12],privateC[7]); 
+	    privateC[8]=mad(lA[idx][i],lB[i][8+idy*12],privateC[8]); 
+	    privateC[9]=mad(lA[idx][i],lB[i][9+idy*12],privateC[9]); 
+	    privateC[10]=mad(lA[idx][i],lB[i][10+idy*12],privateC[10]); 
+	    privateC[11]=mad(lA[idx][i],lB[i][11+idy*12],privateC[11]); 
+	    i = i + 1;
+	}while(i < 24);
+	
+	i = 0;
+	do{
+        C[NB*idy*12+NB*i+idx] = privateC[i];
+	    i = i + 1;
+	}while(i < 12);
+}
+
+__kernel void
+TRIPLE_DGEMM_UPDATE_24_PART2_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
+{
+	// Ain is the non-inverted matrix; the size of Ain is lda * na
+	// offAin is the offset of Ain
+	// d_dinvA holds the inverted diagonal blocks; its size is NB * NB * ((na-1)/NB + 1)
+	// blk is the subblock size, which is 24 here.
+	// lda is the leading dimension. Column major here
+	// npages = ceil(na/(2*blk)); for na=96 this is 2, for na=192 it is 4
+
+	//Work group size is [24, 2]
+	//global work size is [24 * npages, 2]
+	//each work item in each work group is responsible for 12 elements (half) in that row
+	//each work group is responsible for one gemm;
+
+	
+	////////////// -invA11*invA12
+	const uint gidx = get_group_id(0);
+	const uint gidy = get_group_id(1);
+    const uint idx = get_local_id(0);
+	const uint idy = get_local_id(1);
+	
+	const uint page = gidx % npages;//0-3 for 192; 0-1 for 96
+	const uint page_block = page/4;//4 pages per page block
+	
+	__global double *A, *B, *C;
+    __local double lA[24][24];
+    __local double lB[24][24];
+	double privateC[12] = {(double)0};
+	
+	//decide invA11 location for each page
+	A = d_dinvA + page_block*NB*NB + (page%4)*blk*2 * NB + (page%4) * 2 * blk;
+	//decide invA12 location for each page
+	B = d_dinvA + page_block*NB*NB + ((page%4)*blk*2 + blk) * NB + (page%4) * 2 * blk;
+	C = B;
+	//C = d_dinvA + page_block*NB*NB + ((page%4)*blk*2) * NB + (page%4) * 2 * blk + blk;
+	
+	//read A and B into LDS; no transpose is performed here
+	//each work item loads half a row
+	lA[idx][0+idy*12] = A[idx + idy*12*NB];
+	lA[idx][1+idy*12] = A[idx + NB + idy*12*NB];
+	lA[idx][2+idy*12] = A[idx + NB*2 + idy*12*NB];
+	lA[idx][3+idy*12] = A[idx + NB*3 + idy*12*NB];
+	lA[idx][4+idy*12] = A[idx + NB*4 + idy*12*NB];
+	lA[idx][5+idy*12] = A[idx + NB*5 + idy*12*NB];
+	lA[idx][6+idy*12] = A[idx + NB*6 + idy*12*NB];
+	lA[idx][7+idy*12] = A[idx + NB*7 + idy*12*NB];
+	lA[idx][8+idy*12] = A[idx + NB*8 + idy*12*NB];
+	lA[idx][9+idy*12] = A[idx + NB*9 + idy*12*NB];
+	lA[idx][10+idy*12] = A[idx + NB*10 + idy*12*NB];
+	lA[idx][11+idy*12] = A[idx + NB*11 + idy*12*NB];
+	
+	lB[idx][0+idy*12] = B[idx + idy*12*NB];
+	lB[idx][1+idy*12] = B[idx + NB + idy*12*NB];
+	lB[idx][2+idy*12] = B[idx + NB*2 + idy*12*NB];
+	lB[idx][3+idy*12] = B[idx + NB*3 + idy*12*NB];
+	lB[idx][4+idy*12] = B[idx + NB*4 + idy*12*NB];
+	lB[idx][5+idy*12] = B[idx + NB*5 + idy*12*NB];
+	lB[idx][6+idy*12] = B[idx + NB*6 + idy*12*NB];
+	lB[idx][7+idy*12] = B[idx + NB*7 + idy*12*NB];
+	lB[idx][8+idy*12] = B[idx + NB*8 + idy*12*NB];
+	lB[idx][9+idy*12] = B[idx + NB*9 + idy*12*NB];
+	lB[idx][10+idy*12] = B[idx + NB*10 + idy*12*NB];
+	lB[idx][11+idy*12] = B[idx + NB*11 + idy*12*NB];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	//do math
+	
+	uint i = 0;
+	
+	do{
+	    privateC[0]=mad(lA[idx][i],lB[i][0+idy*12],privateC[0]); 
+	    privateC[1]=mad(lA[idx][i],lB[i][1+idy*12],privateC[1]); 
+	    privateC[2]=mad(lA[idx][i],lB[i][2+idy*12],privateC[2]); 
+	    privateC[3]=mad(lA[idx][i],lB[i][3+idy*12],privateC[3]); 
+	    privateC[4]=mad(lA[idx][i],lB[i][4+idy*12],privateC[4]); 
+	    privateC[5]=mad(lA[idx][i],lB[i][5+idy*12],privateC[5]); 
+	    privateC[6]=mad(lA[idx][i],lB[i][6+idy*12],privateC[6]); 
+	    privateC[7]=mad(lA[idx][i],lB[i][7+idy*12],privateC[7]); 
+	    privateC[8]=mad(lA[idx][i],lB[i][8+idy*12],privateC[8]); 
+	    privateC[9]=mad(lA[idx][i],lB[i][9+idy*12],privateC[9]); 
+	    privateC[10]=mad(lA[idx][i],lB[i][10+idy*12],privateC[10]); 
+	    privateC[11]=mad(lA[idx][i],lB[i][11+idy*12],privateC[11]); 
+	    i = i + 1;
+	}while(i < 24);
+	
+	i = 0;
+	do{
+        C[NB*idy*12+NB*i+idx] = -1 * privateC[i];
+	    i = i + 1;
+	}while(i < 12);	
+
+}
+
+__kernel void
+TRIPLE_DGEMM_UPDATE_48_PART1_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
+{
+	// Ain is the non-inverted matrix; the size of Ain is lda * na
+	// offAin is the offset of Ain
+	// d_dinvA holds the inverted diagonal blocks; its size is NB * NB * ((na-1)/NB + 1)
+	// blk is the subblock size, which is 48 here.
+	// lda is the leading dimension. Column major here
+	// npages = ceil(na/(2*blk)); for na=96 this is 1, for na=192 it is 2
+
+	//Work group size is [24, 2]
+	//global work size is [48 * npages, 4]
+	//each work item in each work group is responsible for 12 elements (1/4) in that row
+	//each work group is responsible for a 24 by 24 macro tile;
+	
+	////////////// A12*invA22
+    const uint gidx = get_group_id(0);
+	const uint gidy = get_group_id(1);
+    const uint idx = get_local_id(0);
+	const uint idy = get_local_id(1);
+	
+	//uint page = gidx / 2;//0-1 for 192; 0 for 96
+	const uint page = (gidx/2)%2;//index of page within a page_block; 2 pages per page_block
+	const uint page_block = gidx/4;//#index of page_block; 2 WG per page; 4 WG per page_block
+	
+	__global double *B, *C;
+    __local double lA[24][48];
+    __local double lB[48][24];
+	double privateC[12] = {(double)0};
+	
+	//decide A12 location for each page
+	//each workgroup loads half of A (left or right)
+    Ain = Ain + offAin;
+	Ain += page_block*NB*lda + page_block*NB + page*blk*2*lda + page*blk*2 + blk*lda + gidx%2*(blk/2);
+	
+	//decide invA22 (B) location for each page
+	//each workgroup loads half of B (up or down)
+	B = d_dinvA + page_block*NB*NB + page*blk*2*NB + page*blk*2 + blk*NB + blk + gidy*(blk/2)*NB;
+	
+	//decide invA12 location for each page; 
+	//Actually this will be stored in invA21 temporarily
+	//each workgroup writes 1/4 of C
+	C = d_dinvA + page_block*NB*NB + page*blk*2*NB + page*blk*2 + blk*NB + gidx%2*(blk/2) + gidy*(blk/2)*NB;
+	
+	//read A and B into LDS; no transpose is performed here
+	//each work item loads half a row of A and half a column of B
+	//idx 0-23 idy 0-1
+	lA[idx][0+idy*24] = Ain[idx + idy*24*lda];
+	lA[idx][1+idy*24] = Ain[idx + lda + idy*24*lda];
+	lA[idx][2+idy*24] = Ain[idx + lda*2 + idy*24*lda];
+	lA[idx][3+idy*24] = Ain[idx + lda*3 + idy*24*lda];
+	lA[idx][4+idy*24] = Ain[idx + lda*4 + idy*24*lda];
+	lA[idx][5+idy*24] = Ain[idx + lda*5 + idy*24*lda];
+	lA[idx][6+idy*24] = Ain[idx + lda*6 + idy*24*lda];
+	lA[idx][7+idy*24] = Ain[idx + lda*7 + idy*24*lda];
+	lA[idx][8+idy*24] = Ain[idx + lda*8 + idy*24*lda];
+	lA[idx][9+idy*24] = Ain[idx + lda*9 + idy*24*lda];
+	lA[idx][10+idy*24] = Ain[idx + lda*10 + idy*24*lda];
+	lA[idx][11+idy*24] = Ain[idx + lda*11 + idy*24*lda];
+	lA[idx][12+idy*24] = Ain[idx + lda*12 + idy*24*lda];
+	lA[idx][13+idy*24] = Ain[idx + lda*13 + idy*24*lda];
+	lA[idx][14+idy*24] = Ain[idx + lda*14 + idy*24*lda];
+	lA[idx][15+idy*24] = Ain[idx + lda*15 + idy*24*lda];
+	lA[idx][16+idy*24] = Ain[idx + lda*16 + idy*24*lda];
+	lA[idx][17+idy*24] = Ain[idx + lda*17 + idy*24*lda];
+	lA[idx][18+idy*24] = Ain[idx + lda*18 + idy*24*lda];
+	lA[idx][19+idy*24] = Ain[idx + lda*19 + idy*24*lda];
+	lA[idx][20+idy*24] = Ain[idx + lda*20 + idy*24*lda];
+	lA[idx][21+idy*24] = Ain[idx + lda*21 + idy*24*lda];
+	lA[idx][22+idy*24] = Ain[idx + lda*22 + idy*24*lda];
+	lA[idx][23+idy*24] = Ain[idx + lda*23 + idy*24*lda];
+	
+	lB[0+idy*24][idx] = B[idx*NB + idy*24];
+	lB[1+idy*24][idx]  = B[idx*NB + idy*24 + 1];
+	lB[2+idy*24][idx]  = B[idx*NB + idy*24 + 2];
+	lB[3+idy*24][idx]  = B[idx*NB + idy*24 + 3];
+	lB[4+idy*24][idx]  = B[idx*NB + idy*24 + 4];
+	lB[5+idy*24][idx]  = B[idx*NB + idy*24 + 5];
+	lB[6+idy*24][idx]  = B[idx*NB + idy*24 + 6];
+	lB[7+idy*24][idx]  = B[idx*NB + idy*24 + 7];
+	lB[8+idy*24][idx]  = B[idx*NB + idy*24 + 8];
+	lB[9+idy*24][idx]  = B[idx*NB + idy*24 + 9];
+	lB[10+idy*24][idx]  = B[idx*NB + idy*24 + 10];
+	lB[11+idy*24][idx]  = B[idx*NB + idy*24 + 11];
+	lB[12+idy*24][idx]  = B[idx*NB + idy*24 + 12];
+	lB[13+idy*24][idx]  = B[idx*NB + idy*24 + 13];
+	lB[14+idy*24][idx]  = B[idx*NB + idy*24 + 14];
+	lB[15+idy*24][idx]  = B[idx*NB + idy*24 + 15];
+	lB[16+idy*24][idx]  = B[idx*NB + idy*24 + 16];
+	lB[17+idy*24][idx]  = B[idx*NB + idy*24 + 17];
+	lB[18+idy*24][idx]  = B[idx*NB + idy*24 + 18];
+	lB[19+idy*24][idx]  = B[idx*NB + idy*24 + 19];
+	lB[20+idy*24][idx]  = B[idx*NB + idy*24 + 20];
+	lB[21+idy*24][idx]  = B[idx*NB + idy*24 + 21];
+	lB[22+idy*24][idx]  = B[idx*NB + idy*24 + 22];
+	lB[23+idy*24][idx]  = B[idx*NB + idy*24 + 23];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+    //do math
+	
+	uint i = 0;
+	
+	do{
+	    privateC[0]=mad(lA[idx][i],lB[i][0+idy*12],privateC[0]); 
+	    privateC[1]=mad(lA[idx][i],lB[i][1+idy*12],privateC[1]); 
+	    privateC[2]=mad(lA[idx][i],lB[i][2+idy*12],privateC[2]); 
+	    privateC[3]=mad(lA[idx][i],lB[i][3+idy*12],privateC[3]); 
+	    privateC[4]=mad(lA[idx][i],lB[i][4+idy*12],privateC[4]); 
+	    privateC[5]=mad(lA[idx][i],lB[i][5+idy*12],privateC[5]); 
+	    privateC[6]=mad(lA[idx][i],lB[i][6+idy*12],privateC[6]); 
+	    privateC[7]=mad(lA[idx][i],lB[i][7+idy*12],privateC[7]); 
+	    privateC[8]=mad(lA[idx][i],lB[i][8+idy*12],privateC[8]); 
+	    privateC[9]=mad(lA[idx][i],lB[i][9+idy*12],privateC[9]); 
+	    privateC[10]=mad(lA[idx][i],lB[i][10+idy*12],privateC[10]); 
+	    privateC[11]=mad(lA[idx][i],lB[i][11+idy*12],privateC[11]); 
+	    i = i + 1;
+	}while(i < 48);
+	
+	i = 0;
+	do{
+        C[NB*idy*12+NB*i+idx] = privateC[i];
+	    i = i + 1;
+	}while(i < 12);
+	
+	
+}
+
+__kernel void
+TRIPLE_DGEMM_UPDATE_48_PART2_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
+{
+	// Ain is the non-inverted matrix; the size of Ain is lda * na
+	// offAin is the offset of Ain
+	// d_dinvA holds the inverted diagonal blocks; its size is NB * NB * ((na-1)/NB + 1)
+	// blk is the subblock size, which is 48 here.
+	// lda is the leading dimension. Column major here
+	// npages = ceil(na/(2*blk)); for na=96 this is 1, for na=192 it is 2
+
+	//Work group size is [24, 2]
+	//global work size is [48 * npages, 4]
+	//each work item in each work group is responsible for 12 elements (1/4) in that row
+	//each work group is responsible for a 24 by 24 macro tile;
+	
+	////////////// -invA11*invA12
+    const uint gidx = get_group_id(0);
+	const uint gidy = get_group_id(1);
+    const uint idx = get_local_id(0);
+	const uint idy = get_local_id(1);
+	
+	//uint page = gidx / 2;//0-1 for 192; 0 for 96
+    const uint page = (gidx/2)%2;//index of page within a page_block; 2 pages per page_block
+	const uint page_block = gidx/4;//#index of page_block; 2 WG per page; 4 WG per page_block
+	
+	__global double *A, *B, *C;
+    __local double lA[24][48];
+    __local double lB[48][24];
+	double privateC[12] = {(double)0};
+	
+	//decide invA11 location for each page
+	//each workgroup loads half of A (left or right)
+	A = d_dinvA + page_block*NB*NB + page*blk*2*NB + page*blk*2 + gidx%2*(blk/2);
+	
+	//decide invA12 (B) location for each page
+	//actually it was saved in invA21 from last kernel
+	//each workgroup loads half of B (up or down)
+	B = d_dinvA + page_block*NB*NB + page*blk*2*NB + page*blk*2 + blk*NB + gidy*(blk/2)*NB;
+	
+	//decide invA12 location for each page
+	//each workgroup writes 1/4 of C
+	C = d_dinvA + page_block*NB*NB + page*blk*2*NB + page*blk*2 + blk*NB + gidx%2*(blk/2) + gidy*(blk/2)*NB;
+	
+	//read A and B into LDS; no transpose is performed here
+	//each work item loads half a row of A and half a column of B
+	//idx 0-23 idy 0-1
+	lA[idx][0+idy*24] = A[idx + idy*24*NB];
+	lA[idx][1+idy*24] = A[idx + NB + idy*24*NB];
+	lA[idx][2+idy*24] = A[idx + NB*2 + idy*24*NB];
+	lA[idx][3+idy*24] = A[idx + NB*3 + idy*24*NB];
+	lA[idx][4+idy*24] = A[idx + NB*4 + idy*24*NB];
+	lA[idx][5+idy*24] = A[idx + NB*5 + idy*24*NB];
+	lA[idx][6+idy*24] = A[idx + NB*6 + idy*24*NB];
+	lA[idx][7+idy*24] = A[idx + NB*7 + idy*24*NB];
+	lA[idx][8+idy*24] = A[idx + NB*8 + idy*24*NB];
+	lA[idx][9+idy*24] = A[idx + NB*9 + idy*24*NB];
+	lA[idx][10+idy*24] = A[idx + NB*10 + idy*24*NB];
+	lA[idx][11+idy*24] = A[idx + NB*11 + idy*24*NB];
+	lA[idx][12+idy*24] = A[idx + NB*12 + idy*24*NB];
+	lA[idx][13+idy*24] = A[idx + NB*13 + idy*24*NB];
+	lA[idx][14+idy*24] = A[idx + NB*14 + idy*24*NB];
+	lA[idx][15+idy*24] = A[idx + NB*15 + idy*24*NB];
+	lA[idx][16+idy*24] = A[idx + NB*16 + idy*24*NB];
+	lA[idx][17+idy*24] = A[idx + NB*17 + idy*24*NB];
+	lA[idx][18+idy*24] = A[idx + NB*18 + idy*24*NB];
+	lA[idx][19+idy*24] = A[idx + NB*19 + idy*24*NB];
+	lA[idx][20+idy*24] = A[idx + NB*20 + idy*24*NB];
+	lA[idx][21+idy*24] = A[idx + NB*21 + idy*24*NB];
+	lA[idx][22+idy*24] = A[idx + NB*22 + idy*24*NB];
+	lA[idx][23+idy*24] = A[idx + NB*23 + idy*24*NB];
+	
+	lB[0+idy*24][idx] = B[idx*NB + idy*24];
+	lB[1+idy*24][idx]  = B[idx*NB + idy*24 + 1];
+	lB[2+idy*24][idx]  = B[idx*NB + idy*24 + 2];
+	lB[3+idy*24][idx]  = B[idx*NB + idy*24 + 3];
+	lB[4+idy*24][idx]  = B[idx*NB + idy*24 + 4];
+	lB[5+idy*24][idx]  = B[idx*NB + idy*24 + 5];
+	lB[6+idy*24][idx]  = B[idx*NB + idy*24 + 6];
+	lB[7+idy*24][idx]  = B[idx*NB + idy*24 + 7];
+	lB[8+idy*24][idx]  = B[idx*NB + idy*24 + 8];
+	lB[9+idy*24][idx]  = B[idx*NB + idy*24 + 9];
+	lB[10+idy*24][idx]  = B[idx*NB + idy*24 + 10];
+	lB[11+idy*24][idx]  = B[idx*NB + idy*24 + 11];
+	lB[12+idy*24][idx]  = B[idx*NB + idy*24 + 12];
+	lB[13+idy*24][idx]  = B[idx*NB + idy*24 + 13];
+	lB[14+idy*24][idx]  = B[idx*NB + idy*24 + 14];
+	lB[15+idy*24][idx]  = B[idx*NB + idy*24 + 15];
+	lB[16+idy*24][idx]  = B[idx*NB + idy*24 + 16];
+	lB[17+idy*24][idx]  = B[idx*NB + idy*24 + 17];
+	lB[18+idy*24][idx]  = B[idx*NB + idy*24 + 18];
+	lB[19+idy*24][idx]  = B[idx*NB + idy*24 + 19];
+	lB[20+idy*24][idx]  = B[idx*NB + idy*24 + 20];
+	lB[21+idy*24][idx]  = B[idx*NB + idy*24 + 21];
+	lB[22+idy*24][idx]  = B[idx*NB + idy*24 + 22];
+	lB[23+idy*24][idx]  = B[idx*NB + idy*24 + 23];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+    //do math
+	
+	uint i = 0;
+	
+	do{
+	    privateC[0]=mad(lA[idx][i],lB[i][0+idy*12],privateC[0]); 
+	    privateC[1]=mad(lA[idx][i],lB[i][1+idy*12],privateC[1]); 
+	    privateC[2]=mad(lA[idx][i],lB[i][2+idy*12],privateC[2]); 
+	    privateC[3]=mad(lA[idx][i],lB[i][3+idy*12],privateC[3]); 
+	    privateC[4]=mad(lA[idx][i],lB[i][4+idy*12],privateC[4]); 
+	    privateC[5]=mad(lA[idx][i],lB[i][5+idy*12],privateC[5]); 
+	    privateC[6]=mad(lA[idx][i],lB[i][6+idy*12],privateC[6]); 
+	    privateC[7]=mad(lA[idx][i],lB[i][7+idy*12],privateC[7]); 
+	    privateC[8]=mad(lA[idx][i],lB[i][8+idy*12],privateC[8]); 
+	    privateC[9]=mad(lA[idx][i],lB[i][9+idy*12],privateC[9]); 
+	    privateC[10]=mad(lA[idx][i],lB[i][10+idy*12],privateC[10]); 
+	    privateC[11]=mad(lA[idx][i],lB[i][11+idy*12],privateC[11]); 
+	    i = i + 1;
+	}while(i < 48);
+	
+	i = 0;
+	do{
+        C[NB*idy*12+NB*i+idx] = -1 * privateC[i];
+	    i = i + 1;
+	}while(i < 12);
+	
+	
+}
+
+__kernel void
+TRIPLE_DGEMM_UPDATE_96_PART1_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
+{
+	// Ain is the non-inverted matrix; the size of Ain is lda * na
+	// offAin is the offset of Ain
+	// d_dinvA holds the inverted diagonal blocks; its size is NB * NB * ((na-1)/NB + 1)
+	// blk is the subblock size, which is 96 here.
+	// lda is the leading dimension. Column major here
+	// npages = ceil(na/(2*blk)); for na=192 this is 1, for na=384 it is 2
+
+	//Work group size is [24, 2]
+	//global work size is [96 * npages, 8]
+	//each work item in each work group is responsible for 12 elements (1/8) in that row
+	//each work group is responsible for a 24 by 24 macro tile;
+	
+	////////////// A12*invA22
+    const uint gidx = get_group_id(0);
+	const uint gidy = get_group_id(1);
+    const uint idx = get_local_id(0);
+	const uint idy = get_local_id(1);
+	
+	//uint page = gidx / 2;//0-1 for 192; 0 for 96
+	//const uint page = (gidx/4)%1;//index of page within a page_block; 1 pages per page_block
+	const uint page_block = gidx/4;//#index of page_block; 4 WG per page; 4 WG per page_block
+	
+	
+	__global double *B, *C;
+    __local double lA[24][48];
+    __local double lB[48][24];
+	double privateC[12] = {(double)0};
+	
+	//decide A12 location for each page
+	//each workgroup loads 1/4 of A (left or right)
+    Ain = Ain + offAin;
+	Ain += page_block*NB*lda + page_block*NB + blk*lda + gidx%4*(blk/4);
+	
+	//decide invA22 (B) location for each page
+	//each workgroup loads 1/4 of B (up or down)
+	B = d_dinvA + page_block*NB*NB + blk*NB + blk + gidy*(blk/4)*NB;
+	
+	//decide invA12 location for each page; 
+	//Actually this will be stored in invA21 temporarily
+	//each workgroup writes 1/4*1/4 of C
+	C = d_dinvA + page_block*NB*NB + blk*NB + gidx%4*(blk/4) + gidy*(blk/4)*NB;
+	
+	//read A and B into LDS; no transpose is performed here
+	//each work item loads half a row of A and half a column of B
+	//each loop iteration loads 1/4 of a row of A and 1/4 of a column of B
+	//idx 0-23 idy 0-1
+	
+	uint block_k = blk / 48; //thus we need 2 iterations here
+	do{
+	    barrier(CLK_LOCAL_MEM_FENCE);
+		lA[idx][0+idy*24] = Ain[idx + idy*24*lda];
+		lA[idx][1+idy*24] = Ain[idx + lda + idy*24*lda];
+		lA[idx][2+idy*24] = Ain[idx + lda*2 + idy*24*lda];
+		lA[idx][3+idy*24] = Ain[idx + lda*3 + idy*24*lda];
+		lA[idx][4+idy*24] = Ain[idx + lda*4 + idy*24*lda];
+		lA[idx][5+idy*24] = Ain[idx + lda*5 + idy*24*lda];
+		lA[idx][6+idy*24] = Ain[idx + lda*6 + idy*24*lda];
+		lA[idx][7+idy*24] = Ain[idx + lda*7 + idy*24*lda];
+		lA[idx][8+idy*24] = Ain[idx + lda*8 + idy*24*lda];
+		lA[idx][9+idy*24] = Ain[idx + lda*9 + idy*24*lda];
+		lA[idx][10+idy*24] = Ain[idx + lda*10 + idy*24*lda];
+		lA[idx][11+idy*24] = Ain[idx + lda*11 + idy*24*lda];
+		lA[idx][12+idy*24] = Ain[idx + lda*12 + idy*24*lda];
+		lA[idx][13+idy*24] = Ain[idx + lda*13 + idy*24*lda];
+		lA[idx][14+idy*24] = Ain[idx + lda*14 + idy*24*lda];
+		lA[idx][15+idy*24] = Ain[idx + lda*15 + idy*24*lda];
+		lA[idx][16+idy*24] = Ain[idx + lda*16 + idy*24*lda];
+		lA[idx][17+idy*24] = Ain[idx + lda*17 + idy*24*lda];
+		lA[idx][18+idy*24] = Ain[idx + lda*18 + idy*24*lda];
+		lA[idx][19+idy*24] = Ain[idx + lda*19 + idy*24*lda];
+		lA[idx][20+idy*24] = Ain[idx + lda*20 + idy*24*lda];
+		lA[idx][21+idy*24] = Ain[idx + lda*21 + idy*24*lda];
+		lA[idx][22+idy*24] = Ain[idx + lda*22 + idy*24*lda];
+		lA[idx][23+idy*24] = Ain[idx + lda*23 + idy*24*lda];
+		
+		lB[0+idy*24][idx] = B[idx*NB + idy*24];
+		lB[1+idy*24][idx]  = B[idx*NB + idy*24 + 1];
+		lB[2+idy*24][idx]  = B[idx*NB + idy*24 + 2];
+		lB[3+idy*24][idx]  = B[idx*NB + idy*24 + 3];
+		lB[4+idy*24][idx]  = B[idx*NB + idy*24 + 4];
+		lB[5+idy*24][idx]  = B[idx*NB + idy*24 + 5];
+		lB[6+idy*24][idx]  = B[idx*NB + idy*24 + 6];
+		lB[7+idy*24][idx]  = B[idx*NB + idy*24 + 7];
+		lB[8+idy*24][idx]  = B[idx*NB + idy*24 + 8];
+		lB[9+idy*24][idx]  = B[idx*NB + idy*24 + 9];
+		lB[10+idy*24][idx]  = B[idx*NB + idy*24 + 10];
+		lB[11+idy*24][idx]  = B[idx*NB + idy*24 + 11];
+		lB[12+idy*24][idx]  = B[idx*NB + idy*24 + 12];
+		lB[13+idy*24][idx]  = B[idx*NB + idy*24 + 13];
+		lB[14+idy*24][idx]  = B[idx*NB + idy*24 + 14];
+		lB[15+idy*24][idx]  = B[idx*NB + idy*24 + 15];
+		lB[16+idy*24][idx]  = B[idx*NB + idy*24 + 16];
+		lB[17+idy*24][idx]  = B[idx*NB + idy*24 + 17];
+		lB[18+idy*24][idx]  = B[idx*NB + idy*24 + 18];
+		lB[19+idy*24][idx]  = B[idx*NB + idy*24 + 19];
+		lB[20+idy*24][idx]  = B[idx*NB + idy*24 + 20];
+		lB[21+idy*24][idx]  = B[idx*NB + idy*24 + 21];
+		lB[22+idy*24][idx]  = B[idx*NB + idy*24 + 22];
+		lB[23+idy*24][idx]  = B[idx*NB + idy*24 + 23];
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		//do math
+		
+		uint i = 0;
+		
+		do{
+			privateC[0]=mad(lA[idx][i],lB[i][0+idy*12],privateC[0]); 
+			privateC[1]=mad(lA[idx][i],lB[i][1+idy*12],privateC[1]); 
+			privateC[2]=mad(lA[idx][i],lB[i][2+idy*12],privateC[2]); 
+			privateC[3]=mad(lA[idx][i],lB[i][3+idy*12],privateC[3]); 
+			privateC[4]=mad(lA[idx][i],lB[i][4+idy*12],privateC[4]); 
+			privateC[5]=mad(lA[idx][i],lB[i][5+idy*12],privateC[5]); 
+			privateC[6]=mad(lA[idx][i],lB[i][6+idy*12],privateC[6]); 
+			privateC[7]=mad(lA[idx][i],lB[i][7+idy*12],privateC[7]); 
+			privateC[8]=mad(lA[idx][i],lB[i][8+idy*12],privateC[8]); 
+			privateC[9]=mad(lA[idx][i],lB[i][9+idy*12],privateC[9]); 
+			privateC[10]=mad(lA[idx][i],lB[i][10+idy*12],privateC[10]); 
+			privateC[11]=mad(lA[idx][i],lB[i][11+idy*12],privateC[11]); 
+			i = i + 1;
+		}while(i < 48);
+		
+		Ain += 48*lda;
+		B += 48;
+	}while(--block_k>0);
+	
+	uint i = 0;
+	do{
+        C[NB*idy*12+NB*i+idx] = privateC[i];
+	    i = i + 1;
+	}while(i < 12);
+	
+	
+}
+
+__kernel void
+TRIPLE_DGEMM_UPDATE_96_PART2_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
+{
+	// Ain is the non-inverted matrix; the size of Ain is lda * na
+	// offAin is the offset of Ain
+	// d_dinvA holds the inverted diagonal blocks; its size is NB * NB * ((na-1)/NB + 1)
+	// blk is the subblock size, which is 96 here.
+	// lda is the leading dimension. Column major here
+	// npages = ceil(na/(2*blk)); for na=192 this is 1, for na=384 it is 2
+
+	//Work group size is [24, 2]
+	//global work size is [96 * npages, 8]
+	//each work item in each work group is responsible for 12 elements (1/8) in that row
+	//each work group is responsible for a 24 by 24 macro tile;
+	
+	////////////// -invA11*invA12
+    const uint gidx = get_group_id(0);
+	const uint gidy = get_group_id(1);
+    const uint idx = get_local_id(0);
+	const uint idy = get_local_id(1);
+	
+	//uint page = gidx / 2;//0-1 for 192; 0 for 96
+    //const uint page = (gidx/2)%2;//index of page within a page_block; 1 pages per page_block
+	const uint page_block = gidx/4;//#index of page_block; 4 WG per page; 4 WG per page_block
+	
+	__global double *A, *B, *C;
+    __local double lA[24][48];
+    __local double lB[48][24];
+	double privateC[12] = {(double)0};
+	
+	//decide invA11 location for each page
+	//each workgroup loads 1/4 of A (left or right)
+	//A = d_dinvA + page*NB*NB + gidx%2*(blk/2);
+	A = d_dinvA + page_block*NB*NB + gidx%4*(blk/4);
+	
+	//decide invA12 (B) location for each page
+	//actually it was saved in invA21 from last kernel
+	//each workgroup loads 1/4 of B (up or down)
+	//B = d_dinvA + page*NB*NB + blk*NB + gidy*(blk/2)*NB;
+	B = d_dinvA + page_block*NB*NB + blk*NB + gidy*(blk/4)*NB;
+	
+	//decide invA12 location for each page
+	//each workgroup writes 1/4*1/4 of C
+	//C = d_dinvA + page*NB*NB + blk * NB + gidx%2*(blk/2) + gidy*(blk/2)*NB;
+	C = d_dinvA + page_block*NB*NB + blk*NB + gidx%4*(blk/4) + gidy*(blk/4)*NB;
+	
+	//read A and B into LDS; no transpose is performed here
+	//each work item loads half a row of A and half a column of B
+	//idx 0-23 idy 0-1
+	
+	uint block_k = blk / 48; //thus we need 2 iterations here
+	do{
+	    barrier(CLK_LOCAL_MEM_FENCE);
+		lA[idx][0+idy*24] = A[idx + idy*24*NB];
+		lA[idx][1+idy*24] = A[idx + NB + idy*24*NB];
+		lA[idx][2+idy*24] = A[idx + NB*2 + idy*24*NB];
+		lA[idx][3+idy*24] = A[idx + NB*3 + idy*24*NB];
+		lA[idx][4+idy*24] = A[idx + NB*4 + idy*24*NB];
+		lA[idx][5+idy*24] = A[idx + NB*5 + idy*24*NB];
+		lA[idx][6+idy*24] = A[idx + NB*6 + idy*24*NB];
+		lA[idx][7+idy*24] = A[idx + NB*7 + idy*24*NB];
+		lA[idx][8+idy*24] = A[idx + NB*8 + idy*24*NB];
+		lA[idx][9+idy*24] = A[idx + NB*9 + idy*24*NB];
+		lA[idx][10+idy*24] = A[idx + NB*10 + idy*24*NB];
+		lA[idx][11+idy*24] = A[idx + NB*11 + idy*24*NB];
+		lA[idx][12+idy*24] = A[idx + NB*12 + idy*24*NB];
+		lA[idx][13+idy*24] = A[idx + NB*13 + idy*24*NB];
+		lA[idx][14+idy*24] = A[idx + NB*14 + idy*24*NB];
+		lA[idx][15+idy*24] = A[idx + NB*15 + idy*24*NB];
+		lA[idx][16+idy*24] = A[idx + NB*16 + idy*24*NB];
+		lA[idx][17+idy*24] = A[idx + NB*17 + idy*24*NB];
+		lA[idx][18+idy*24] = A[idx + NB*18 + idy*24*NB];
+		lA[idx][19+idy*24] = A[idx + NB*19 + idy*24*NB];
+		lA[idx][20+idy*24] = A[idx + NB*20 + idy*24*NB];
+		lA[idx][21+idy*24] = A[idx + NB*21 + idy*24*NB];
+		lA[idx][22+idy*24] = A[idx + NB*22 + idy*24*NB];
+		lA[idx][23+idy*24] = A[idx + NB*23 + idy*24*NB];
+		
+		lB[0+idy*24][idx] = B[idx*NB + idy*24];
+		lB[1+idy*24][idx]  = B[idx*NB + idy*24 + 1];
+		lB[2+idy*24][idx]  = B[idx*NB + idy*24 + 2];
+		lB[3+idy*24][idx]  = B[idx*NB + idy*24 + 3];
+		lB[4+idy*24][idx]  = B[idx*NB + idy*24 + 4];
+		lB[5+idy*24][idx]  = B[idx*NB + idy*24 + 5];
+		lB[6+idy*24][idx]  = B[idx*NB + idy*24 + 6];
+		lB[7+idy*24][idx]  = B[idx*NB + idy*24 + 7];
+		lB[8+idy*24][idx]  = B[idx*NB + idy*24 + 8];
+		lB[9+idy*24][idx]  = B[idx*NB + idy*24 + 9];
+		lB[10+idy*24][idx]  = B[idx*NB + idy*24 + 10];
+		lB[11+idy*24][idx]  = B[idx*NB + idy*24 + 11];
+		lB[12+idy*24][idx]  = B[idx*NB + idy*24 + 12];
+		lB[13+idy*24][idx]  = B[idx*NB + idy*24 + 13];
+		lB[14+idy*24][idx]  = B[idx*NB + idy*24 + 14];
+		lB[15+idy*24][idx]  = B[idx*NB + idy*24 + 15];
+		lB[16+idy*24][idx]  = B[idx*NB + idy*24 + 16];
+		lB[17+idy*24][idx]  = B[idx*NB + idy*24 + 17];
+		lB[18+idy*24][idx]  = B[idx*NB + idy*24 + 18];
+		lB[19+idy*24][idx]  = B[idx*NB + idy*24 + 19];
+		lB[20+idy*24][idx]  = B[idx*NB + idy*24 + 20];
+		lB[21+idy*24][idx]  = B[idx*NB + idy*24 + 21];
+		lB[22+idy*24][idx]  = B[idx*NB + idy*24 + 22];
+		lB[23+idy*24][idx]  = B[idx*NB + idy*24 + 23];
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		//do math
+		
+		uint i = 0;
+		
+		do{
+			privateC[0]=mad(lA[idx][i],lB[i][0+idy*12],privateC[0]); 
+			privateC[1]=mad(lA[idx][i],lB[i][1+idy*12],privateC[1]); 
+			privateC[2]=mad(lA[idx][i],lB[i][2+idy*12],privateC[2]); 
+			privateC[3]=mad(lA[idx][i],lB[i][3+idy*12],privateC[3]); 
+			privateC[4]=mad(lA[idx][i],lB[i][4+idy*12],privateC[4]); 
+			privateC[5]=mad(lA[idx][i],lB[i][5+idy*12],privateC[5]); 
+			privateC[6]=mad(lA[idx][i],lB[i][6+idy*12],privateC[6]); 
+			privateC[7]=mad(lA[idx][i],lB[i][7+idy*12],privateC[7]); 
+			privateC[8]=mad(lA[idx][i],lB[i][8+idy*12],privateC[8]); 
+			privateC[9]=mad(lA[idx][i],lB[i][9+idy*12],privateC[9]); 
+			privateC[10]=mad(lA[idx][i],lB[i][10+idy*12],privateC[10]); 
+			privateC[11]=mad(lA[idx][i],lB[i][11+idy*12],privateC[11]); 
+			i = i + 1;
+		}while(i < 48);
+		
+		A += 48*NB;
+		B += 48;
+	
+	}while(--block_k>0);
+	
+	uint i = 0;
+	do{
+        C[NB*idy*12+NB*i+idx] = -1 * privateC[i];
+	    i = i + 1;
+	}while(i < 12);
+	
+	
+}
+
+";
\ No newline at end of file
