[clblas] 51/61: add codepath for dtrsm when M and N are mod192
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Fri Jul 24 22:49:48 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clblas.
commit f7c65361706cacfc89b051dbd7533df5a937892a
Author: Timmy <timmy.liu at amd.com>
Date: Wed Jun 24 11:17:12 2015 -0500
add codepath for dtrsm when M and N are mod192
---
src/library/CMakeLists.txt | 5 +-
src/library/bingen.cmake | 1 +
src/library/blas/functor/gpu_dtrsm192.cc | 596 ++++++++++++
src/library/blas/functor/hawaii.cc | 20 +
src/library/blas/functor/include/gpu_dtrsm192.h | 28 +
src/library/blas/gens/clTemplates/dtrsm_gpu192.cl | 1031 +++++++++++++++++++++
6 files changed, 1680 insertions(+), 1 deletion(-)
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index e45a2aa..bcc482b 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -69,6 +69,7 @@ set(SRC_BLAS
blas/functor/bonaire.cc
blas/functor/gcn_dgemm.cc
blas/functor/gpu_dtrsm.cc
+ blas/functor/gpu_dtrsm192.cc
blas/functor/functor_fill.cc
blas/functor/hawaii_dgemmChannelConflict.cc
blas/functor/hawaii_dgemmSplitKernel.cc
@@ -101,6 +102,7 @@ set(SRC_BLAS_HEADERS
blas/functor/include/bonaire.h
blas/functor/include/gcn_dgemm.h
blas/functor/include/gpu_dtrsm.h
+ blas/functor/include/gpu_dtrsm192.h
blas/functor/include/BinaryBuild.h
blas/functor/include/hawaii_dgemmChannelConflict.h
blas/functor/include/hawaii_dgemmSplitKernel.h
@@ -229,7 +231,7 @@ set (SRC_CL_TEMPLATES
dgemm_hawaiiSplitKernel.cl
sgemm_hawaiiSplitKernel.cl
dtrsm_gpu.cl
-
+ dtrsm_gpu192.cl
dgemm_gcn_SmallMatrices.cl
sgemm_gcn_SmallMatrices.cl
sgemm_gcn.cl
@@ -239,6 +241,7 @@ set (SRC_CL_TEMPLATES
set(SRC_CL_TEMPLATES_GEN
dgemm_hawai.clHawaii_64.bin.cl
dtrsm_gpu.clHawaii_64.bin.cl
+ dtrsm_gpu192.clHawaii_64.bin.cl
dgemm_hawaiiChannelConfilct.clHawaii_64.bin.cl
dgemm_hawaiiSplitKernel.clHawaii_64.bin.cl
sgemm_hawaiiSplitKernel.clHawaii_64.bin.cl
diff --git a/src/library/bingen.cmake b/src/library/bingen.cmake
index 7b13557..4511f45 100644
--- a/src/library/bingen.cmake
+++ b/src/library/bingen.cmake
@@ -16,6 +16,7 @@ ${CLTEMPLATE_PATH}/sgemm_gcn.cl
${CLTEMPLATE_PATH}/zgemm_gcn.cl
${CLTEMPLATE_PATH}/sgemm_gcn_SmallMatrices.cl
${CLTEMPLATE_PATH}/sgemm_hawaiiSplit64_32.cl
+${CLTEMPLATE_PATH}/dtrsm_gpu192.cl
)
diff --git a/src/library/blas/functor/gpu_dtrsm192.cc b/src/library/blas/functor/gpu_dtrsm192.cc
new file mode 100644
index 0000000..f7a0006
--- /dev/null
+++ b/src/library/blas/functor/gpu_dtrsm192.cc
@@ -0,0 +1,596 @@
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+#include "functor.h"
+#include "binary_lookup.h"
+#include <iostream>
+
+#include "functor_xtrsm.h"
+#include "gpu_dtrsm192.h"
+//#include "tahiti.h"
+#include "BinaryBuild.h"
+
+#if BUILD_KERNEL_FROM_STRING
+#include "dtrsm_gpu192.clT"
+#else
+
+#include "dtrsm_gpu192.clHawaii_64.bin.clT"
+
+#endif
+
+
+// Make it 1 to enable additional debug 'print'
+#define VERB 0
+
+
+//TODO
+//clReleaseKernel(kernel) ;
+
+#define BLOCK_SIZE 12 // inner blocking size, <=32
+#define NB 192 // outer blocking size, >BLOCK_SIZE
+
+
+//
+// The static cache used to store all instances of clblasDtrsmFunctorGpu /clblasDgemmFunctorTahiti
+//
+typedef clblasFunctorCache<clblasDtrsm192FunctorGpu, bool> Cache ;
+static Cache cache ;
+
+
+clblasDtrsm192FunctorGpu::clblasDtrsm192FunctorGpu(Args & args, cl_int & err, const char* DevName, cl_uint _64BitsUse) :
+ m_program(0)
+{
+
+ cl_device_id device;
+ cl_context context;
+
+ cl_command_queue queue = args.queue;
+ err = getDeviceAndContext(queue, device, context);
+
+
+
+
+ if( err != CL_SUCCESS )
+ {
+ return;
+ }
+
+ if (VERB) printf(" ===> GET KERNEL %s\n", "clblasDtrsmFunctorGpu") ;
+
+ BinaryLookup bl(context, device, "clblasDtrsm192FunctorGpu");
+
+ if ( !bl.found() ) // may create empty file or may wait until file is ready
+ {
+ // directly build from a char*
+#if BUILD_KERNEL_FROM_STRING
+ err = bl.buildFromSource(dtrsm_gpu_kernels);
+#else
+ if(!strcmp(DevName, "Tahiti"))
+ {
+ }
+
+ else if(!strcmp(DevName, "Hawaii"))
+ {
+#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL
+ if(_64BitsUse==64)
+ err = bl.buildFromBinary(dtrsm_gpu192_kernels_64_bin_Hawaii, sizeof(dtrsm_gpu192_kernels_64_bin_Hawaii), NULL);
+ else
+ {
+ std::cout<<"we don't support clblas on 32 bits"<< std::endl;
+ assert(1);
+ }
+#endif
+ }
+
+#endif
+
+ if ( err != CL_SUCCESS )
+ {
+ if (VERB) printf(" ===> BUILD PROBLEM\n") ;
+
+ return;
+ }
+ }
+
+ this->m_program = bl.getProgram();
+}
+
+
+
+
+#define CALL_KERNEL_TRIPLE_UPDATE(kernel_name, prg, queue, A, offA, d_dinvA, i, lda, M, event) \
+do{ \
+ err = call_kernel_triple_update192(kernel_name, prg, queue, A, offA, d_dinvA, i, lda, M, event); \
+ if(err != CL_SUCCESS) { \
+ return err; \
+ } \
+} while(0)
+
+
+
+cl_int call_kernel_triple_update192(const char* kernel_name,
+ const cl_program prg,
+ const cl_command_queue queue,
+ cl_mem A,
+ unsigned int offA,
+ cl_mem d_dinvA,
+ int i,
+ unsigned int lda,
+ int M,
+ cl_event *event)
+{
+ cl_int err = 0;
+
+ unsigned int m = M;
+
+ int npages = M/(i*2)+(M%(i*2)!=0);
+
+
+ size_t globalLocal [2];
+ size_t globalThreads[2];
+ switch (i)
+ {
+ case 12:
+ globalLocal[0] = 12;
+ globalLocal[1] = 1;
+ globalThreads[0] = npages * 12;
+ globalThreads[1] = 1;
+ break;
+ case 24:
+ globalLocal[0] = 24;
+ globalLocal[1] = 2;
+ globalThreads[0] = npages * 24;
+ globalThreads[1] = 2;
+ break;
+ case 48:
+ globalLocal[0] = 24;
+ globalLocal[1] = 2;
+ globalThreads[0] = npages * 48;
+ globalThreads[1] = 4;
+ break;
+ case 96:
+ globalLocal[0] = 24;
+ globalLocal[1] = 2;
+ globalThreads[0] = npages * 96;
+ globalThreads[1] = 8;
+ break;
+ default:
+ break;
+ }
+
+ cl_kernel kernel = clCreateKernel(prg, kernel_name, &err);
+ if (err != CL_SUCCESS) {
+ //printf( "create kernel %s failed with %d\n", kernel_name, err );
+ return err;
+ }
+
+ clSetKernelArg(kernel, 0, sizeof(cl_mem), &A);
+ clSetKernelArg(kernel, 1, sizeof(unsigned int), &offA);
+ clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_dinvA);
+ clSetKernelArg(kernel, 3, sizeof(int), &i);
+ clSetKernelArg(kernel, 4, sizeof(unsigned int), &lda);
+ clSetKernelArg(kernel, 5, sizeof(int), &npages);
+ clSetKernelArg(kernel, 6, sizeof(unsigned int), &m);
+
+ err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
+ globalThreads, globalLocal ,
+ 0, NULL, event);
+
+
+ if (err != CL_SUCCESS) {
+ clReleaseKernel(kernel);
+ //printf( "execution of kernel %s failed with %d\n", kernel_name, err );
+ return err;
+ }
+
+ err = clReleaseKernel(kernel);
+ return err;
+
+}
+
+
+
+
+//extern "C"
+cl_int diag_dtrtri192 (cl_program prg,
+ cl_command_queue queue,
+ int M,
+ clblasUplo uplo,
+ clblasDiag diag,
+ cl_mem A,
+ size_t offA,
+ cl_mem d_dinvA,
+ size_t lda,
+ cl_event *event )
+{
+
+ cl_int err = 0;
+
+ /*
+ This routine is used in dtrsm
+ */
+
+ //For side==right, M is actually N here
+ int nthreads = (M/BLOCK_SIZE + (M % BLOCK_SIZE != 0)) * BLOCK_SIZE;
+ unsigned int m = M;
+
+ if (uplo == clblasLower) {
+
+ //lower is not supported yet
+ }
+ else {
+
+ cl_kernel diag_dtrtri_kernel_upper = clCreateKernel(prg, "DIAG_DTRTRI_KERNEL_UPPER", &err);
+ if (err != CL_SUCCESS) {
+ //printf( "create kernel -diag_dtrtri_kernel_upper- failed with %d\n", err );
+ return err;
+ }
+
+ int isDiagUnit = (diag == clblasUnit);
+ clSetKernelArg(diag_dtrtri_kernel_upper, 0, sizeof(int), &isDiagUnit);
+ clSetKernelArg(diag_dtrtri_kernel_upper, 1, sizeof(cl_mem), &A);
+ clSetKernelArg(diag_dtrtri_kernel_upper, 2, sizeof(unsigned int), &offA);
+ clSetKernelArg(diag_dtrtri_kernel_upper, 3, sizeof(cl_mem), &d_dinvA);
+ clSetKernelArg(diag_dtrtri_kernel_upper, 4, sizeof(unsigned int), &lda);
+ clSetKernelArg(diag_dtrtri_kernel_upper, 5, sizeof(unsigned int), &m);
+
+ size_t globalThreads[1] = { nthreads };
+ size_t globalLocal [1] = { BLOCK_SIZE };
+
+ err = clEnqueueNDRangeKernel(queue, diag_dtrtri_kernel_upper, 1, NULL,
+ globalThreads, globalLocal ,
+ 0, NULL, event);
+
+ if (err != CL_SUCCESS) {
+ //printf( "kernel -diag_dtrtri_kernel_upper- failed with %d\n", err );
+ return err;
+ }
+
+ clReleaseKernel(diag_dtrtri_kernel_upper);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ // update the inverse up to the size of BLOCK_SIZE
+ for( int i=BLOCK_SIZE; i < NB; i*=2 ) {
+
+ switch (i) {
+ case 12:
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_12_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ break;
+
+ case 24:
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ break;
+
+ case 48:
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_48_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_48_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ break;
+
+ case 96:
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_96_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_96_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ break;
+
+ default:
+
+ break;
+ }
+
+ if (i*2 >= M) break;
+ }
+
+ }
+
+ return err;
+
+}
+
+
+
+
+#define check_error(cmd) \
+do{ \
+ cl_int xxxerr = cmd ; \
+ if (xxxerr != CL_SUCCESS) { \
+ if(InvA != 0) \
+ clReleaseMemObject(InvA); \
+ if(X != 0) \
+ clReleaseMemObject(X); \
+ return xxxerr; \
+ } \
+} while(0)
+
+
+static cl_int clearBuffer192( cl_command_queue queue ,
+ cl_mem buffer ,
+ size_t buffer_size )
+{
+
+ cl_int err = 0;
+ cl_event event;
+ // Hummm clEnqueueFillBuffer is OpenCL 1.2 !!!
+ double zero = 0.0 ;
+ err = clEnqueueFillBuffer(queue,
+ buffer,
+ &zero,
+ sizeof(double),
+ 0, // offset
+ buffer_size,
+ 0,
+ NULL,
+ &event
+ ) ;
+
+ return err;
+
+}
+
+
+
+#define nb 192 // outer blocking size, >BLOCK_SIZE
+#define min(x,y) ((x)<(y)?(x):(y))
+
+
+
+cl_int cl_dtrsm192( cl_program prg,
+ cl_command_queue queue ,
+ clblasSide side,
+ clblasUplo uplo,
+ clblasTranspose transA,
+ clblasDiag diag,
+ int M,
+ int N,
+ double alpha,
+ cl_mem A, size_t offA, size_t ldA,
+ cl_mem B, size_t offB, size_t ldB,
+ cl_event *event
+ )
+{
+ cl_int err = 0;
+
+ int i;
+ cl_context context;
+ err = getQueueContext(queue, &context);
+ if(err != CL_SUCCESS) return err;
+
+ /* quick return on wrong size */
+ if (M <= 0 || N <= 0)
+ return clblasInvalidDim;
+
+ double neg_one = -1.0 ;
+ double one = 1.0 ;
+ double zero = 0.0 ;
+
+
+
+ // Compute the number of blocks of the specified 'size' to fully cover 'n'
+ // Simply speaking, this is n/size rounded up.
+#define BLOCKS(n,size) ( ( (n) / size ) + ( (n) % (size) != 0 ) )
+
+#define CLEANUP
+
+#define END_DGEMM_ARGS 1,&queue,0,NULL,event
+
+ // Helper to compute pass the 3 arguments describing a (sub)-matrix to clblasDgemm
+#define _(M,i,j) M , (off##M + ((i)+(j)*ld##M) ) , ld##M
+
+ cl_mem InvA = 0;
+ cl_mem X = 0;
+ // X of size mxn will contain the result
+ size_t ldX = M ;
+ size_t offX = 0; //must be 0: needed by the _(X,i,j) macro
+ size_t size_X = N*ldX * sizeof(double);
+ X = clCreateBuffer(context, CL_MEM_READ_WRITE, size_X, NULL, &err);
+ check_error(err) ;
+ err = clearBuffer192( queue, X, size_X ) ;
+ check_error(err) ;
+
+
+ if (side == clblasLeft)
+ {
+ // side=L
+ /* invert the diagonals
+ * Allocate device memory for the inverted diagonal blocks, size=m*nb
+ */
+ // Not supported yet with 192 block size
+
+ }
+ else
+ {
+
+ //
+ // Helper for C = alpha * B * A + beta * C
+ //
+ // In the calls below
+ // - the 2nd matrix shall be either A or InvA transposed according to transA
+ // - the 1st and 3rd matrices are either B and X
+ //
+#define DGEMM_RIGHT(m,n,k, alpha, B, A, beta, C ) \
+ do { \
+ err = clblasDgemm(clblasColumnMajor, clblasNoTrans, transA , m, n, k, alpha, B, A, beta, C , 1, &queue, 0, NULL, event ) ; \
+ check_error(err) ; \
+ } while(0)
+
+
+ // side=R
+ /* invert the diagonals
+ * Allocate device memory for the inverted diagonal blocks, size=n*BLOCK_SIZE
+ */
+
+ /* invert the diagonals
+ * Allocate device memory for the inverted diagonal blocks, size=m*nb
+ */
+ size_t ldInvA = nb ;
+ size_t offInvA = 0; //must be 0: needed by the _(X,i,j) macro
+ size_t size_InvA = ldInvA * BLOCKS(N,nb) * nb *sizeof(double);
+ InvA = clCreateBuffer(context, CL_MEM_READ_WRITE, size_InvA, NULL, &err);
+ check_error(err) ;
+ err = clearBuffer192( queue, InvA, size_InvA ) ;
+ check_error(err) ;
+
+ diag_dtrtri192 (prg, queue, N, uplo, diag, A, offA, InvA, ldA, event);
+
+ if (transA == clblasNoTrans)
+ {
+ /* the non-transpose case */
+ if (uplo == clblasLower)
+ {
+ /* the lower case */
+ /* handle the first block separately with alpha */
+ // lower is not implemented yet
+
+
+ }
+ else
+ {
+ /* the upper case */
+ /* handle the first block separately with alpha */
+ int nn = min(nb, (int)N);
+ //DGEMM_RIGHT( M, nn, nn, alpha, _(B,0,0), _(InvA,0,0), zero, _(X,0,0) );
+ err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, nn, nn, alpha, B, offB, ldB, InvA, offInvA, ldInvA, zero, X, offX, ldX, 1, &queue, 0, NULL, event);
+ check_error(err);
+
+ if (nb < N)
+ {
+
+ //DGEMM_RIGHT( M, N-nb, nb, neg_one, _(X,0,0), _(A,0,nb), alpha, _(B,0,nb) );
+ err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N - nb, nb, neg_one, X, offX, ldX, A, offA+ldA*nb, ldA, alpha, B, offB+nb*ldB, ldB, 1, &queue, 0, NULL, event);
+ assert(err == CL_SUCCESS);
+
+ /* the rest blocks */
+ for( i=nb; i < N; i += nb )
+ {
+ nn = min(nb, (int)N-i);
+ //DGEMM_RIGHT( M, nn, nn, one, _(B,0,i), _(InvA,0,i), zero, _(X,0,i) );
+ err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, nn, nn, one, B, offB+i*ldB, ldB, InvA, offInvA+i*nb, ldInvA, zero, X, offX+i*ldX, ldX, 1, &queue, 0, NULL, event);
+ assert(err == CL_SUCCESS);
+
+ if (i+nb >= N)
+ break;
+
+ //DGEMM_RIGHT( M, N-i-nb, nb, neg_one, _(X,0,i), _(A,i,i+nb), one, _(B,0,i+nb) );
+ err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N - i - nb, nb, neg_one, X, offX+i*ldX, ldX, A, offA + i + (nb + i)*ldA, ldA, one, B, offB + (i + nb)*ldB, ldB, 1, &queue, 0, NULL, event);
+ assert(err == CL_SUCCESS);
+ }
+ }
+ }
+ }
+ else
+ {
+
+ /* the transpose case */
+ // trans is not implemented yet
+ }
+
+ }
+
+ // Copy X(m,n) to B(m,n)
+ {
+ size_t src_origin[3] = { 0, 0, 0 } ;
+ size_t dst_origin[3] = { offB*sizeof(double), 0, 0 } ;
+ size_t region[3] = { M*sizeof(double), N, 1 } ;
+
+
+ err = clEnqueueCopyBufferRect( queue,
+ X,
+ B,
+ src_origin,
+ dst_origin,
+ region,
+ ldX*sizeof(double), 0,
+ ldB*sizeof(double), 0,
+ 0, NULL,
+ event) ;
+ check_error(err) ;
+
+ clReleaseMemObject(InvA);
+ clReleaseMemObject(X);
+
+ }
+
+ return err;
+
+}
+
+
+
+clblasStatus clblasDtrsm192FunctorGpu::execute(Args &args)
+{
+ cl_int err;
+ cl_command_queue queue = args.queue;
+
+ if (VERB) printf(" ===> EXECUTE KERNEL %s\n", "dtrsm_gpu") ;
+
+
+ cl_program prg = this->m_program;
+
+
+ err = cl_dtrsm192( prg,
+ queue ,
+ args.side,
+ args.uplo,
+ args.transA,
+ args.diag,
+ args.M,
+ args.N,
+ args.alpha,
+ args.A, args.offA, args.lda,
+ args.B, args.offB, args.ldb,
+ args.events
+ );
+
+
+
+ if (VERB) printf(" ===> ERR=%d \n",(int)err) ;
+
+ return clblasStatus(err) ;
+}
+
+
+
+clblasDtrsm192FunctorGpu *
+clblasDtrsm192FunctorGpu::provide(clblasDtrsmFunctor::Args & args , const char* DevName)
+{
+
+ if ( args.order == clblasRowMajor )
+ return NULL ; // The RowMajor case shall never occur.
+
+
+ cl_device_id dev;
+ cl_context ctxt;
+
+ cl_int err = getDeviceAndContext(args.queue, dev, ctxt);
+
+ if (err != CL_SUCCESS)
+ {
+ return NULL;
+ }
+ cl_uint bitness = getAddressBits(dev);
+ Cache::Lookup lookup(cache, ctxt, dev, true) ;
+
+ if ( lookup.ok() ){
+ clblasDtrsm192FunctorGpu * functor = lookup.get();
+ functor->retain(); // increment the reference counter to avoid deletion while it is still being used
+ return functor;
+ }
+
+ clblasDtrsm192FunctorGpu * functor = new clblasDtrsm192FunctorGpu(args, err, DevName, bitness);
+ if (err != CL_SUCCESS)
+ {
+ return NULL;
+ }
+
+ lookup.set(functor) ;
+
+ return functor;
+
+}
+
diff --git a/src/library/blas/functor/hawaii.cc b/src/library/blas/functor/hawaii.cc
index 94a23a5..62f6666 100644
--- a/src/library/blas/functor/hawaii.cc
+++ b/src/library/blas/functor/hawaii.cc
@@ -26,6 +26,7 @@
#include "hawaii_sgemmBranchKernel.h"
#include "hawaii_sgemmSplit64_32.h"
#include "gcn_zgemm.h"
+#include "gpu_dtrsm192.h"
FunctorSelectorHawaii FunctorSelectorHawaii::instance ;
@@ -190,6 +191,25 @@ clblasDtrsmFunctor * FunctorSelectorHawaii::select_dtrsm_specific(clblasDtrsmFun
#else
clblasDtrsmFunctor * functor;
+
+ if ((args.M % 192 == 0) && (args.N % 192 == 0))
+ {
+ //TODO: the implementation of sub block being 192 only supports
+ //side == right
+ //uplo == upper
+ //trans == notrans
+ //M and N need to be mod192
+ //subblock being 192 is preferred over 128 on Hawaii device since
+ //it does not create "boundary" in DGEMM calls
+ //Hawaii DGEMM calls have better performance when M N K are mod48
+ if ((args.side == clblasRight) && (args.uplo == clblasUpper) && (args.transA == clblasNoTrans))
+ {
+ functor = clblasDtrsm192FunctorGpu::provide(args, "Hawaii");
+ if (functor)
+ return functor;
+ }
+ }
+ //sub block is 128 here
functor = clblasDtrsmFunctorGpu::provide(args, "Hawaii");
if (functor)
return functor;
diff --git a/src/library/blas/functor/include/gpu_dtrsm192.h b/src/library/blas/functor/include/gpu_dtrsm192.h
new file mode 100644
index 0000000..aeda3a8
--- /dev/null
+++ b/src/library/blas/functor/include/gpu_dtrsm192.h
@@ -0,0 +1,28 @@
+#ifndef _CLBLAS_DTRSM192_FUNCTOR_GPU_H_
+#define _CLBLAS_DTRSM192_FUNCTOR_GPU_H_
+
+class clblasDtrsm192FunctorGpu : public clblasDtrsmFunctor
+{
+public:
+
+
+private: // Constructor & Destructor
+
+ clblasDtrsm192FunctorGpu(Args & args, cl_int & err, const char* DevName, cl_uint _64BitsUse) ;
+
+public:
+
+ // Provide a suitable clblasDtrsmFunctorTahiti for the specified args
+ // or NULL if none
+ static clblasDtrsm192FunctorGpu * provide(clblasDtrsmFunctor::Args & args, const char* DevName) ;
+
+public: // inherited member from clblasDtrsmFunctor
+
+ virtual clblasStatus execute(Args &args) ;
+
+private:
+
+ cl_program m_program ;
+} ;
+
+#endif
diff --git a/src/library/blas/gens/clTemplates/dtrsm_gpu192.cl b/src/library/blas/gens/clTemplates/dtrsm_gpu192.cl
new file mode 100644
index 0000000..551f64b
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/dtrsm_gpu192.cl
@@ -0,0 +1,1031 @@
+static const char * dtrsm_gpu192_kernels = "
+
+//only supports upper triangle matrix for now
+
+#define BLOCK_SIZE 12 // inner blocking size, <=32
+#define NB 192 // outer blocking size, >BLOCK_SIZE
+
+#define ZERO ( 0.0)
+#define ONE ( 1.0)
+
+#ifdef DOUBLE_PRECISION
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#else
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#endif
+#endif
+
+__kernel void DIAG_DTRTRI_KERNEL_UPPER(int isDiagUnit,
+ __global double const * restrict A,
+ uint offA,
+ __global double *d_dinvA,
+ uint lda,
+ uint na)
+{
+
+ int i, j;
+ double Ystx = 0;
+ __local double *y = 0;
+ double switcher;
+ double neg_switcher;
+
+ // Thread index
+ int tx = get_local_id(0);
+
+ // Thread index
+ int gx = get_global_id(0);
+
+ // Block index
+ int bx = get_group_id(0);
+
+ A = A + offA;
+
+ __global const double *Aoff = A + bx*lda*BLOCK_SIZE + bx*BLOCK_SIZE;
+ int NumBLperNB = NB/BLOCK_SIZE;
+ d_dinvA += bx/NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE);
+
+ __local double Bs[BLOCK_SIZE*BLOCK_SIZE];
+ __local double workspace[BLOCK_SIZE]; // workspace used to store the current working column
+
+ // load A
+ #pragma unroll
+ for( i=0; i < BLOCK_SIZE; i++ )
+ {
+ if(tx <= i && i+bx*BLOCK_SIZE < na )
+ {
+ Bs[i*BLOCK_SIZE+tx] = *(Aoff+i*lda+tx);
+ }
+ else
+ {
+ Bs[i*BLOCK_SIZE+tx] = ZERO;
+ }
+ }
+ // read in the whole square block of my A and zero out the non data triangular
+
+ // Synchronize to make sure the matrices are loaded
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // solve the diagonals
+
+ if(isDiagUnit == 1)
+ {
+ Bs[tx*BLOCK_SIZE+tx] = ONE;
+ }
+ else
+ {
+ if( Bs[tx*BLOCK_SIZE+tx] == ZERO )
+ {
+ Bs[tx*BLOCK_SIZE+tx] = ONE;
+ }
+ else
+ {
+ Bs[tx*BLOCK_SIZE+tx] = ONE / ( Bs[tx*BLOCK_SIZE+tx]) ;
+ }
+ }
+
+
+ /* the upper case */
+ for( i=0; i < BLOCK_SIZE; i++ ) {
+ Ystx = ZERO;
+ if( tx < i)
+ {
+ switcher = ONE;
+ }
+ else
+ {
+ switcher = ZERO;
+ }
+
+ //dtrmv
+ workspace[tx] = *(Bs+i*BLOCK_SIZE+tx);
+ y = Bs+i*BLOCK_SIZE;
+
+ #pragma unroll
+ //for( j=tx; j < i; j++ )
+ for( j=0; j < i; j++ )
+ Ystx += switcher * (*(Bs+j*BLOCK_SIZE+tx)*workspace[j]);
+
+ //sscal
+ // if (tx != i) y[tx]=switcher*Ystx*(-Bs[i*BLOCK_SIZE+i]);
+
+ if( tx != i)
+ {
+ switcher = ONE;
+ neg_switcher = ZERO;
+ }
+ else
+ {
+ switcher = ZERO;
+ neg_switcher = ONE;
+ }
+
+ y[tx] = switcher *Ystx*(-Bs[i*BLOCK_SIZE+i])+neg_switcher*y[tx];
+
+ // __syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ // write back A
+#pragma unroll
+ for( i=0; i < BLOCK_SIZE; i++ )
+ *(d_dinvA+i*NB+tx) = Bs[i*BLOCK_SIZE+tx];
+
+}
+
+/*
+ * B21 = -inv(A11)*A12*inv(A22)
+ * 12 to 24
+ */
+__kernel void
+TRIPLE_DGEMM_UPDATE_12_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)
+{
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 12 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/12*2 + 1; for 96 this is 4 for 192 this is 8
+
+ //Work group size is [12]
+ //global work size is [96*number of blocks]
+ //each work item in each work group is responsible for every element in that row
+ //each work group is responsible for one gemm;\
+
+
+ ////////////// A12*invA22
+ const uint gidx = get_group_id(0);
+ const uint idx = get_local_id(0);
+
+ const uint page = gidx % npages;
+ const uint page_block = page/8;//8 pages per page block
+ const uint page_index_in_block = page%8;
+
+
+ __global double *B, *C;
+ __local double lA[12][12];
+ __local double lB[12][12];
+ double privateC[12] = {(double)0};
+
+ //decide A12 location for each page
+ Ain = Ain + offAin;
+ Ain += (page*blk*2 + blk) * lda + page * 2 * blk;
+
+ //decide invA22 (B) location for each page
+ B = d_dinvA + page_block*NB*NB + (page_index_in_block*blk*2 + blk) * NB + page_index_in_block * 2 * blk + blk;
+
+ //decide invA12 location for each page
+ C = d_dinvA + page_block*NB*NB + (page_index_in_block*blk*2 + blk) * NB + page_index_in_block * 2 * blk;
+
+ //read A and B into LDS no transpose operated here
+ lA[idx][0] = Ain[idx];
+ lA[idx][1] = Ain[idx + lda];
+ lA[idx][2] = Ain[idx + lda*2];
+ lA[idx][3] = Ain[idx + lda*3];
+ lA[idx][4] = Ain[idx + lda*4];
+ lA[idx][5] = Ain[idx + lda*5];
+ lA[idx][6] = Ain[idx + lda*6];
+ lA[idx][7] = Ain[idx + lda*7];
+ lA[idx][8] = Ain[idx + lda*8];
+ lA[idx][9] = Ain[idx + lda*9];
+ lA[idx][10] = Ain[idx + lda*10];
+ lA[idx][11] = Ain[idx + lda*11];
+
+ lB[idx][0] = B[idx];
+ lB[idx][1] = B[idx + NB];
+ lB[idx][2] = B[idx + NB*2];
+ lB[idx][3] = B[idx + NB*3];
+ lB[idx][4] = B[idx + NB*4];
+ lB[idx][5] = B[idx + NB*5];
+ lB[idx][6] = B[idx + NB*6];
+ lB[idx][7] = B[idx + NB*7];
+ lB[idx][8] = B[idx + NB*8];
+ lB[idx][9] = B[idx + NB*9];
+ lB[idx][10] = B[idx + NB*10];
+ lB[idx][11] = B[idx + NB*11];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ //do math
+
+ uint i = 0;
+
+ do{
+ privateC[0]=mad(lA[idx][i],lB[i][0],privateC[0]);
+ privateC[1]=mad(lA[idx][i],lB[i][1],privateC[1]);
+ privateC[2]=mad(lA[idx][i],lB[i][2],privateC[2]);
+ privateC[3]=mad(lA[idx][i],lB[i][3],privateC[3]);
+ privateC[4]=mad(lA[idx][i],lB[i][4],privateC[4]);
+ privateC[5]=mad(lA[idx][i],lB[i][5],privateC[5]);
+ privateC[6]=mad(lA[idx][i],lB[i][6],privateC[6]);
+ privateC[7]=mad(lA[idx][i],lB[i][7],privateC[7]);
+ privateC[8]=mad(lA[idx][i],lB[i][8],privateC[8]);
+ privateC[9]=mad(lA[idx][i],lB[i][9],privateC[9]);
+ privateC[10]=mad(lA[idx][i],lB[i][10],privateC[10]);
+ privateC[11]=mad(lA[idx][i],lB[i][11],privateC[11]);
+ //mem_fence(CLK_LOCAL_MEM_FENCE);
+ i = i + 1;
+ }while(i < 12);
+
+ i = 0;
+ do{
+ C[NB*i+idx] = privateC[i];
+ i = i + 1;
+ }while(i < 12);
+
+ ////////////// -invA11*invA12
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ //A is moving to invA11
+ __global double *A;
+ A = d_dinvA + page_block*NB*NB + ( (page%4)*blk*2 ) * NB + (page%4) * 2 * blk;
+ //both B and C are pointing at invA12
+ B = C;
+
+ //read A and B into LDS no transpose operated here
+ lA[idx][0] = A[idx];
+ lA[idx][1] = A[idx + NB];
+ lA[idx][2] = A[idx + NB*2];
+ lA[idx][3] = A[idx + NB*3];
+ lA[idx][4] = A[idx + NB*4];
+ lA[idx][5] = A[idx + NB*5];
+ lA[idx][6] = A[idx + NB*6];
+ lA[idx][7] = A[idx + NB*7];
+ lA[idx][8] = A[idx + NB*8];
+ lA[idx][9] = A[idx + NB*9];
+ lA[idx][10] = A[idx + NB*10];
+ lA[idx][11] = A[idx + NB*11];
+
+ lB[idx][0] = B[idx];
+ lB[idx][1] = B[idx + NB];
+ lB[idx][2] = B[idx + NB*2];
+ lB[idx][3] = B[idx + NB*3];
+ lB[idx][4] = B[idx + NB*4];
+ lB[idx][5] = B[idx + NB*5];
+ lB[idx][6] = B[idx + NB*6];
+ lB[idx][7] = B[idx + NB*7];
+ lB[idx][8] = B[idx + NB*8];
+ lB[idx][9] = B[idx + NB*9];
+ lB[idx][10] = B[idx + NB*10];
+ lB[idx][11] = B[idx + NB*11];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ //do math
+
+ i = 0;
+ privateC[0] = 0;
+ privateC[1] = 0;
+ privateC[2] = 0;
+ privateC[3] = 0;
+ privateC[4] = 0;
+ privateC[5] = 0;
+ privateC[6] = 0;
+ privateC[7] = 0;
+ privateC[8] = 0;
+ privateC[9] = 0;
+ privateC[10] = 0;
+ privateC[11] = 0;
+ do{
+ privateC[0]=mad(lA[idx][i],lB[i][0],privateC[0]);
+ privateC[1]=mad(lA[idx][i],lB[i][1],privateC[1]);
+ privateC[2]=mad(lA[idx][i],lB[i][2],privateC[2]);
+ privateC[3]=mad(lA[idx][i],lB[i][3],privateC[3]);
+ privateC[4]=mad(lA[idx][i],lB[i][4],privateC[4]);
+ privateC[5]=mad(lA[idx][i],lB[i][5],privateC[5]);
+ privateC[6]=mad(lA[idx][i],lB[i][6],privateC[6]);
+ privateC[7]=mad(lA[idx][i],lB[i][7],privateC[7]);
+ privateC[8]=mad(lA[idx][i],lB[i][8],privateC[8]);
+ privateC[9]=mad(lA[idx][i],lB[i][9],privateC[9]);
+ privateC[10]=mad(lA[idx][i],lB[i][10],privateC[10]);
+ privateC[11]=mad(lA[idx][i],lB[i][11],privateC[11]);
+ //mem_fence(CLK_LOCAL_MEM_FENCE);
+ i = i + 1;
+ }while(i < 12);
+
+ i = 0;
+ do{
+ C[NB*i+idx] = -1*privateC[i];
+ i = i + 1;
+ }while(i < 12);
+
+}
+
+__kernel void
+TRIPLE_DGEMM_UPDATE_24_PART1_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)
+{
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 24 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/12*2 + 1; for 96 this is 2 for 192 this is 4
+
+ //Work group size is [24, 2]
+ //global work size is [96*number of blocks, 2]
+ //each work item in each work group is responsible for 12 elements (half) in that row
+ //each work group is responsible for one gemm;
+
+
+ ////////////// A12*invA22
+ const uint gidx = get_group_id(0);
+ const uint gidy = get_group_id(1);
+ const uint idx = get_local_id(0);
+ const uint idy = get_local_id(1);
+
+ const uint page = gidx % npages;//0-3 for 192; 0-1 for 96
+ const uint page_block = page/4;//4 pages per page block
+
+
+
+ __global double *B, *C;
+ __local double lA[24][24];
+ __local double lB[24][24];
+ double privateC[12] = {(double)0};
+
+ //decide A12 location for each page
+ Ain = Ain + offAin;
+ Ain += (page*blk*2 + blk) * lda + page * 2 * blk;
+
+ //decide invA22 (B) location for each page
+ B = d_dinvA + page_block*NB*NB + ((page%4)*blk*2 + blk) * NB + (page%4) * 2 * blk + blk;
+
+ //decide invA12 location for each page
+ C = d_dinvA + page_block*NB*NB + ((page%4)*blk*2 + blk) * NB + (page%4) * 2 * blk;
+
+ //read A and B into LDS no transpose operated here
+ //each work item loads half a row
+ lA[idx][0+idy*12] = Ain[idx + idy*12*lda];
+ lA[idx][1+idy*12] = Ain[idx + lda + idy*12*lda];
+ lA[idx][2+idy*12] = Ain[idx + lda*2 + idy*12*lda];
+ lA[idx][3+idy*12] = Ain[idx + lda*3 + idy*12*lda];
+ lA[idx][4+idy*12] = Ain[idx + lda*4 + idy*12*lda];
+ lA[idx][5+idy*12] = Ain[idx + lda*5 + idy*12*lda];
+ lA[idx][6+idy*12] = Ain[idx + lda*6 + idy*12*lda];
+ lA[idx][7+idy*12] = Ain[idx + lda*7 + idy*12*lda];
+ lA[idx][8+idy*12] = Ain[idx + lda*8 + idy*12*lda];
+ lA[idx][9+idy*12] = Ain[idx + lda*9 + idy*12*lda];
+ lA[idx][10+idy*12] = Ain[idx + lda*10 + idy*12*lda];
+ lA[idx][11+idy*12] = Ain[idx + lda*11 + idy*12*lda];
+
+ lB[idx][0+idy*12] = B[idx + idy*12*NB];
+ lB[idx][1+idy*12] = B[idx + NB + idy*12*NB];
+ lB[idx][2+idy*12] = B[idx + NB*2 + idy*12*NB];
+ lB[idx][3+idy*12] = B[idx + NB*3 + idy*12*NB];
+ lB[idx][4+idy*12] = B[idx + NB*4 + idy*12*NB];
+ lB[idx][5+idy*12] = B[idx + NB*5 + idy*12*NB];
+ lB[idx][6+idy*12] = B[idx + NB*6 + idy*12*NB];
+ lB[idx][7+idy*12] = B[idx + NB*7 + idy*12*NB];
+ lB[idx][8+idy*12] = B[idx + NB*8 + idy*12*NB];
+ lB[idx][9+idy*12] = B[idx + NB*9 + idy*12*NB];
+ lB[idx][10+idy*12] = B[idx + NB*10 + idy*12*NB];
+ lB[idx][11+idy*12] = B[idx + NB*11 + idy*12*NB];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ //do math
+
+ uint i = 0;
+
+ do{
+ privateC[0]=mad(lA[idx][i],lB[i][0+idy*12],privateC[0]);
+ privateC[1]=mad(lA[idx][i],lB[i][1+idy*12],privateC[1]);
+ privateC[2]=mad(lA[idx][i],lB[i][2+idy*12],privateC[2]);
+ privateC[3]=mad(lA[idx][i],lB[i][3+idy*12],privateC[3]);
+ privateC[4]=mad(lA[idx][i],lB[i][4+idy*12],privateC[4]);
+ privateC[5]=mad(lA[idx][i],lB[i][5+idy*12],privateC[5]);
+ privateC[6]=mad(lA[idx][i],lB[i][6+idy*12],privateC[6]);
+ privateC[7]=mad(lA[idx][i],lB[i][7+idy*12],privateC[7]);
+ privateC[8]=mad(lA[idx][i],lB[i][8+idy*12],privateC[8]);
+ privateC[9]=mad(lA[idx][i],lB[i][9+idy*12],privateC[9]);
+ privateC[10]=mad(lA[idx][i],lB[i][10+idy*12],privateC[10]);
+ privateC[11]=mad(lA[idx][i],lB[i][11+idy*12],privateC[11]);
+ i = i + 1;
+ }while(i < 24);
+
+ i = 0;
+ do{
+ C[NB*idy*12+NB*i+idx] = privateC[i];
+ i = i + 1;
+ }while(i < 12);
+}
+
+__kernel void
+TRIPLE_DGEMM_UPDATE_24_PART2_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
+{
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 24 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/12*2 + 1; for 96 this is 2 for 192 this is 4
+
+ //Work group size is [24, 2]
+ //global work size is [96*number of blocks, 2]
+ //each work item in each work group is responsible for 12 elements (half) in that row
+ //each work group is responsible for one gemm;
+
+
+ ////////////// -invA11*invA12
+ const uint gidx = get_group_id(0);
+ const uint gidy = get_group_id(1);
+ const uint idx = get_local_id(0);
+ const uint idy = get_local_id(1);
+
+ const uint page = gidx % npages;//0-3 for 192; 0-1 for 96
+ const uint page_block = page/4;//4 pages per page block
+
+ __global double *A, *B, *C;
+ __local double lA[24][24];
+ __local double lB[24][24];
+ double privateC[12] = {(double)0};
+
+ //decide invA11 location for each page
+ A = d_dinvA + page_block*NB*NB + (page%4)*blk*2 * NB + (page%4) * 2 * blk;
+ //decide invA12 location for each page
+ B = d_dinvA + page_block*NB*NB + ((page%4)*blk*2 + blk) * NB + (page%4) * 2 * blk;
+ C = B;
+ //C = d_dinvA + page_block*NB*NB + ((page%4)*blk*2) * NB + (page%4) * 2 * blk + blk;
+
+ //read A and B into LDS no transpose operated here
+ //each work item loads half a row
+ lA[idx][0+idy*12] = A[idx + idy*12*NB];
+ lA[idx][1+idy*12] = A[idx + NB + idy*12*NB];
+ lA[idx][2+idy*12] = A[idx + NB*2 + idy*12*NB];
+ lA[idx][3+idy*12] = A[idx + NB*3 + idy*12*NB];
+ lA[idx][4+idy*12] = A[idx + NB*4 + idy*12*NB];
+ lA[idx][5+idy*12] = A[idx + NB*5 + idy*12*NB];
+ lA[idx][6+idy*12] = A[idx + NB*6 + idy*12*NB];
+ lA[idx][7+idy*12] = A[idx + NB*7 + idy*12*NB];
+ lA[idx][8+idy*12] = A[idx + NB*8 + idy*12*NB];
+ lA[idx][9+idy*12] = A[idx + NB*9 + idy*12*NB];
+ lA[idx][10+idy*12] = A[idx + NB*10 + idy*12*NB];
+ lA[idx][11+idy*12] = A[idx + NB*11 + idy*12*NB];
+
+ lB[idx][0+idy*12] = B[idx + idy*12*NB];
+ lB[idx][1+idy*12] = B[idx + NB + idy*12*NB];
+ lB[idx][2+idy*12] = B[idx + NB*2 + idy*12*NB];
+ lB[idx][3+idy*12] = B[idx + NB*3 + idy*12*NB];
+ lB[idx][4+idy*12] = B[idx + NB*4 + idy*12*NB];
+ lB[idx][5+idy*12] = B[idx + NB*5 + idy*12*NB];
+ lB[idx][6+idy*12] = B[idx + NB*6 + idy*12*NB];
+ lB[idx][7+idy*12] = B[idx + NB*7 + idy*12*NB];
+ lB[idx][8+idy*12] = B[idx + NB*8 + idy*12*NB];
+ lB[idx][9+idy*12] = B[idx + NB*9 + idy*12*NB];
+ lB[idx][10+idy*12] = B[idx + NB*10 + idy*12*NB];
+ lB[idx][11+idy*12] = B[idx + NB*11 + idy*12*NB];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ //do math
+
+ uint i = 0;
+
+ do{
+ privateC[0]=mad(lA[idx][i],lB[i][0+idy*12],privateC[0]);
+ privateC[1]=mad(lA[idx][i],lB[i][1+idy*12],privateC[1]);
+ privateC[2]=mad(lA[idx][i],lB[i][2+idy*12],privateC[2]);
+ privateC[3]=mad(lA[idx][i],lB[i][3+idy*12],privateC[3]);
+ privateC[4]=mad(lA[idx][i],lB[i][4+idy*12],privateC[4]);
+ privateC[5]=mad(lA[idx][i],lB[i][5+idy*12],privateC[5]);
+ privateC[6]=mad(lA[idx][i],lB[i][6+idy*12],privateC[6]);
+ privateC[7]=mad(lA[idx][i],lB[i][7+idy*12],privateC[7]);
+ privateC[8]=mad(lA[idx][i],lB[i][8+idy*12],privateC[8]);
+ privateC[9]=mad(lA[idx][i],lB[i][9+idy*12],privateC[9]);
+ privateC[10]=mad(lA[idx][i],lB[i][10+idy*12],privateC[10]);
+ privateC[11]=mad(lA[idx][i],lB[i][11+idy*12],privateC[11]);
+ i = i + 1;
+ }while(i < 24);
+
+ i = 0;
+ do{
+ C[NB*idy*12+NB*i+idx] = -1 * privateC[i];
+ i = i + 1;
+ }while(i < 12);
+
+}
+
+__kernel void
+TRIPLE_DGEMM_UPDATE_48_PART1_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
+{
+ // First half of the 48->96 inversion step: computes A12*invA22 and stashes
+ // the result in the invA21 slot of d_dinvA (consumed by the PART2 kernel).
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 48 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/(2*blk) + 1; for 96 this is 1, for 192 this is 2
+ // NOTE(review): the original comment said npages = (na-1)/12*2 + 1, which does
+ // not match the listed values — confirm the exact formula against the host code.
+
+ //Work group size is [24, 2]
+ //global work size is [96*number of blocks, 4]
+ //each work item in each work group is responsible for 12 elements (1/4) in that row
+ //each work group is responsible for 24 by 24 macro tile;
+
+ ////////////// A12*invA22
+ const uint gidx = get_group_id(0);
+ const uint gidy = get_group_id(1);
+ const uint idx = get_local_id(0);
+ const uint idy = get_local_id(1);
+
+ //uint page = gidx / 2;//0-1 for 192; 0 for 96
+ const uint page = (gidx/2)%2;//index of page within a page_block; 2 pages per page_block
+ const uint page_block = gidx/4;//#index of page_block; 2 WG per page; 4 WG per page_block
+
+ __global double *B, *C;
+ __local double lA[24][48];
+ __local double lB[48][24];
+ double privateC[12] = {(double)0};
+
+ //decide A12 location for each page
+ //each workgroup loads half of A (left or right)
+ Ain = Ain + offAin;
+ Ain += page_block*NB*lda + page_block*NB + page*blk*2*lda + page*blk*2 + blk*lda + gidx%2*(blk/2);
+
+ //decide invA22 (B) location for each page
+ //each workgroup loads half of B (up or down)
+ B = d_dinvA + page_block*NB*NB + page*blk*2*NB + page*blk*2 + blk*NB + blk + gidy*(blk/2)*NB;
+
+ //decide invA12 location for each page;
+ //Actually this will be stored in invA21 temporarily
+ //each workgroup writes 1/4 of C
+ C = d_dinvA + page_block*NB*NB + page*blk*2*NB + page*blk*2 + blk*NB + gidx%2*(blk/2) + gidy*(blk/2)*NB;
+
+ //read A and B into LDS no transpose operated here
+ //each work item loads a half row of A and half column of B
+ //idx 0-23 idy 0-1
+ lA[idx][0+idy*24] = Ain[idx + idy*24*lda];
+ lA[idx][1+idy*24] = Ain[idx + lda + idy*24*lda];
+ lA[idx][2+idy*24] = Ain[idx + lda*2 + idy*24*lda];
+ lA[idx][3+idy*24] = Ain[idx + lda*3 + idy*24*lda];
+ lA[idx][4+idy*24] = Ain[idx + lda*4 + idy*24*lda];
+ lA[idx][5+idy*24] = Ain[idx + lda*5 + idy*24*lda];
+ lA[idx][6+idy*24] = Ain[idx + lda*6 + idy*24*lda];
+ lA[idx][7+idy*24] = Ain[idx + lda*7 + idy*24*lda];
+ lA[idx][8+idy*24] = Ain[idx + lda*8 + idy*24*lda];
+ lA[idx][9+idy*24] = Ain[idx + lda*9 + idy*24*lda];
+ lA[idx][10+idy*24] = Ain[idx + lda*10 + idy*24*lda];
+ lA[idx][11+idy*24] = Ain[idx + lda*11 + idy*24*lda];
+ lA[idx][12+idy*24] = Ain[idx + lda*12 + idy*24*lda];
+ lA[idx][13+idy*24] = Ain[idx + lda*13 + idy*24*lda];
+ lA[idx][14+idy*24] = Ain[idx + lda*14 + idy*24*lda];
+ lA[idx][15+idy*24] = Ain[idx + lda*15 + idy*24*lda];
+ lA[idx][16+idy*24] = Ain[idx + lda*16 + idy*24*lda];
+ lA[idx][17+idy*24] = Ain[idx + lda*17 + idy*24*lda];
+ lA[idx][18+idy*24] = Ain[idx + lda*18 + idy*24*lda];
+ lA[idx][19+idy*24] = Ain[idx + lda*19 + idy*24*lda];
+ lA[idx][20+idy*24] = Ain[idx + lda*20 + idy*24*lda];
+ lA[idx][21+idy*24] = Ain[idx + lda*21 + idy*24*lda];
+ lA[idx][22+idy*24] = Ain[idx + lda*22 + idy*24*lda];
+ lA[idx][23+idy*24] = Ain[idx + lda*23 + idy*24*lda];
+
+ lB[0+idy*24][idx] = B[idx*NB + idy*24];
+ lB[1+idy*24][idx] = B[idx*NB + idy*24 + 1];
+ lB[2+idy*24][idx] = B[idx*NB + idy*24 + 2];
+ lB[3+idy*24][idx] = B[idx*NB + idy*24 + 3];
+ lB[4+idy*24][idx] = B[idx*NB + idy*24 + 4];
+ lB[5+idy*24][idx] = B[idx*NB + idy*24 + 5];
+ lB[6+idy*24][idx] = B[idx*NB + idy*24 + 6];
+ lB[7+idy*24][idx] = B[idx*NB + idy*24 + 7];
+ lB[8+idy*24][idx] = B[idx*NB + idy*24 + 8];
+ lB[9+idy*24][idx] = B[idx*NB + idy*24 + 9];
+ lB[10+idy*24][idx] = B[idx*NB + idy*24 + 10];
+ lB[11+idy*24][idx] = B[idx*NB + idy*24 + 11];
+ lB[12+idy*24][idx] = B[idx*NB + idy*24 + 12];
+ lB[13+idy*24][idx] = B[idx*NB + idy*24 + 13];
+ lB[14+idy*24][idx] = B[idx*NB + idy*24 + 14];
+ lB[15+idy*24][idx] = B[idx*NB + idy*24 + 15];
+ lB[16+idy*24][idx] = B[idx*NB + idy*24 + 16];
+ lB[17+idy*24][idx] = B[idx*NB + idy*24 + 17];
+ lB[18+idy*24][idx] = B[idx*NB + idy*24 + 18];
+ lB[19+idy*24][idx] = B[idx*NB + idy*24 + 19];
+ lB[20+idy*24][idx] = B[idx*NB + idy*24 + 20];
+ lB[21+idy*24][idx] = B[idx*NB + idy*24 + 21];
+ lB[22+idy*24][idx] = B[idx*NB + idy*24 + 22];
+ lB[23+idy*24][idx] = B[idx*NB + idy*24 + 23];
+ barrier(CLK_LOCAL_MEM_FENCE); // lA/lB fully staged before the rank-48 update
+
+ //do math: each work item accumulates one 1x12 strip over the full k=48 depth
+
+ uint i = 0;
+
+ do{
+ privateC[0]=mad(lA[idx][i],lB[i][0+idy*12],privateC[0]);
+ privateC[1]=mad(lA[idx][i],lB[i][1+idy*12],privateC[1]);
+ privateC[2]=mad(lA[idx][i],lB[i][2+idy*12],privateC[2]);
+ privateC[3]=mad(lA[idx][i],lB[i][3+idy*12],privateC[3]);
+ privateC[4]=mad(lA[idx][i],lB[i][4+idy*12],privateC[4]);
+ privateC[5]=mad(lA[idx][i],lB[i][5+idy*12],privateC[5]);
+ privateC[6]=mad(lA[idx][i],lB[i][6+idy*12],privateC[6]);
+ privateC[7]=mad(lA[idx][i],lB[i][7+idy*12],privateC[7]);
+ privateC[8]=mad(lA[idx][i],lB[i][8+idy*12],privateC[8]);
+ privateC[9]=mad(lA[idx][i],lB[i][9+idy*12],privateC[9]);
+ privateC[10]=mad(lA[idx][i],lB[i][10+idy*12],privateC[10]);
+ privateC[11]=mad(lA[idx][i],lB[i][11+idy*12],privateC[11]);
+ i = i + 1;
+ }while(i < 48);
+
+ i = 0;
+ do{
+ C[NB*idy*12+NB*i+idx] = privateC[i];
+ i = i + 1;
+ }while(i < 12);
+
+
+}
+
+__kernel void
+TRIPLE_DGEMM_UPDATE_48_PART2_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
+{
+ // Second half of the 48->96 inversion step: overwrites the temporary
+ // A12*invA22 (left in the invA21 slot by PART1) with -invA11*(A12*invA22).
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 48 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/(2*blk) + 1; for 96 this is 1, for 192 this is 2
+ // NOTE(review): the original comment said npages = (na-1)/12*2 + 1, which does
+ // not match the listed values — confirm the exact formula against the host code.
+
+ //Work group size is [24, 2]
+ //global work size is [96*number of blocks, 4]
+ //each work item in each work group is responsible for 12 elements (1/4) in that row
+ //each work group is responsible for 24 by 24 macro tile;
+
+ ////////////// -invA11*invA12
+ const uint gidx = get_group_id(0);
+ const uint gidy = get_group_id(1);
+ const uint idx = get_local_id(0);
+ const uint idy = get_local_id(1);
+
+ //uint page = gidx / 2;//0-1 for 192; 0 for 96
+ const uint page = (gidx/2)%2;//index of page within a page_block; 2 pages per page_block
+ const uint page_block = gidx/4;//#index of page_block; 2 WG per page; 4 WG per page_block
+
+ __global double *A, *B, *C;
+ __local double lA[24][48];
+ __local double lB[48][24];
+ double privateC[12] = {(double)0};
+
+ //decide invA11 location for each page
+ //each workgroup loads half of A (left or right)
+ A = d_dinvA + page_block*NB*NB + page*blk*2*NB + page*blk*2 + gidx%2*(blk/2);
+
+ //decide invA12 (B) location for each page
+ //actually it was saved in invA21 from last kernel
+ //each workgroup loads half of B (up or down)
+ B = d_dinvA + page_block*NB*NB + page*blk*2*NB + page*blk*2 + blk*NB + gidy*(blk/2)*NB;
+
+ //decide invA12 location for each page
+ //each workgroup writes 1/4 of C
+ C = d_dinvA + page_block*NB*NB + page*blk*2*NB + page*blk*2 + blk*NB + gidx%2*(blk/2) + gidy*(blk/2)*NB;
+
+ //read A and B into LDS no transpose operated here
+ //each work item loads a half row of A and half column of B
+ //idx 0-23 idy 0-1
+ lA[idx][0+idy*24] = A[idx + idy*24*NB];
+ lA[idx][1+idy*24] = A[idx + NB + idy*24*NB];
+ lA[idx][2+idy*24] = A[idx + NB*2 + idy*24*NB];
+ lA[idx][3+idy*24] = A[idx + NB*3 + idy*24*NB];
+ lA[idx][4+idy*24] = A[idx + NB*4 + idy*24*NB];
+ lA[idx][5+idy*24] = A[idx + NB*5 + idy*24*NB];
+ lA[idx][6+idy*24] = A[idx + NB*6 + idy*24*NB];
+ lA[idx][7+idy*24] = A[idx + NB*7 + idy*24*NB];
+ lA[idx][8+idy*24] = A[idx + NB*8 + idy*24*NB];
+ lA[idx][9+idy*24] = A[idx + NB*9 + idy*24*NB];
+ lA[idx][10+idy*24] = A[idx + NB*10 + idy*24*NB];
+ lA[idx][11+idy*24] = A[idx + NB*11 + idy*24*NB];
+ lA[idx][12+idy*24] = A[idx + NB*12 + idy*24*NB];
+ lA[idx][13+idy*24] = A[idx + NB*13 + idy*24*NB];
+ lA[idx][14+idy*24] = A[idx + NB*14 + idy*24*NB];
+ lA[idx][15+idy*24] = A[idx + NB*15 + idy*24*NB];
+ lA[idx][16+idy*24] = A[idx + NB*16 + idy*24*NB];
+ lA[idx][17+idy*24] = A[idx + NB*17 + idy*24*NB];
+ lA[idx][18+idy*24] = A[idx + NB*18 + idy*24*NB];
+ lA[idx][19+idy*24] = A[idx + NB*19 + idy*24*NB];
+ lA[idx][20+idy*24] = A[idx + NB*20 + idy*24*NB];
+ lA[idx][21+idy*24] = A[idx + NB*21 + idy*24*NB];
+ lA[idx][22+idy*24] = A[idx + NB*22 + idy*24*NB];
+ lA[idx][23+idy*24] = A[idx + NB*23 + idy*24*NB];
+
+ lB[0+idy*24][idx] = B[idx*NB + idy*24];
+ lB[1+idy*24][idx] = B[idx*NB + idy*24 + 1];
+ lB[2+idy*24][idx] = B[idx*NB + idy*24 + 2];
+ lB[3+idy*24][idx] = B[idx*NB + idy*24 + 3];
+ lB[4+idy*24][idx] = B[idx*NB + idy*24 + 4];
+ lB[5+idy*24][idx] = B[idx*NB + idy*24 + 5];
+ lB[6+idy*24][idx] = B[idx*NB + idy*24 + 6];
+ lB[7+idy*24][idx] = B[idx*NB + idy*24 + 7];
+ lB[8+idy*24][idx] = B[idx*NB + idy*24 + 8];
+ lB[9+idy*24][idx] = B[idx*NB + idy*24 + 9];
+ lB[10+idy*24][idx] = B[idx*NB + idy*24 + 10];
+ lB[11+idy*24][idx] = B[idx*NB + idy*24 + 11];
+ lB[12+idy*24][idx] = B[idx*NB + idy*24 + 12];
+ lB[13+idy*24][idx] = B[idx*NB + idy*24 + 13];
+ lB[14+idy*24][idx] = B[idx*NB + idy*24 + 14];
+ lB[15+idy*24][idx] = B[idx*NB + idy*24 + 15];
+ lB[16+idy*24][idx] = B[idx*NB + idy*24 + 16];
+ lB[17+idy*24][idx] = B[idx*NB + idy*24 + 17];
+ lB[18+idy*24][idx] = B[idx*NB + idy*24 + 18];
+ lB[19+idy*24][idx] = B[idx*NB + idy*24 + 19];
+ lB[20+idy*24][idx] = B[idx*NB + idy*24 + 20];
+ lB[21+idy*24][idx] = B[idx*NB + idy*24 + 21];
+ lB[22+idy*24][idx] = B[idx*NB + idy*24 + 22];
+ lB[23+idy*24][idx] = B[idx*NB + idy*24 + 23];
+ barrier(CLK_LOCAL_MEM_FENCE); // lA/lB fully staged before the rank-48 update
+
+ //do math: each work item accumulates one 1x12 strip over the full k=48 depth
+
+ uint i = 0;
+
+ do{
+ privateC[0]=mad(lA[idx][i],lB[i][0+idy*12],privateC[0]);
+ privateC[1]=mad(lA[idx][i],lB[i][1+idy*12],privateC[1]);
+ privateC[2]=mad(lA[idx][i],lB[i][2+idy*12],privateC[2]);
+ privateC[3]=mad(lA[idx][i],lB[i][3+idy*12],privateC[3]);
+ privateC[4]=mad(lA[idx][i],lB[i][4+idy*12],privateC[4]);
+ privateC[5]=mad(lA[idx][i],lB[i][5+idy*12],privateC[5]);
+ privateC[6]=mad(lA[idx][i],lB[i][6+idy*12],privateC[6]);
+ privateC[7]=mad(lA[idx][i],lB[i][7+idy*12],privateC[7]);
+ privateC[8]=mad(lA[idx][i],lB[i][8+idy*12],privateC[8]);
+ privateC[9]=mad(lA[idx][i],lB[i][9+idy*12],privateC[9]);
+ privateC[10]=mad(lA[idx][i],lB[i][10+idy*12],privateC[10]);
+ privateC[11]=mad(lA[idx][i],lB[i][11+idy*12],privateC[11]);
+ i = i + 1;
+ }while(i < 48);
+
+ i = 0;
+ do{
+ // store the negated product: invA12 = -invA11*(A12*invA22)
+ C[NB*idy*12+NB*i+idx] = -1 * privateC[i];
+ i = i + 1;
+ }while(i < 12);
+
+
+}
+
<br/>
+__kernel void
+TRIPLE_DGEMM_UPDATE_96_PART1_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
+{
+ // First half of the 96->192 inversion step: computes A12*invA22 and stashes
+ // the result in the invA21 slot of d_dinvA (consumed by the PART2 kernel).
+ // The k=96 depth is processed in two 48-wide LDS passes (see block_k loop).
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 96 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/(2*blk) + 1; for 192 this is 1, for 384 this is 2
+ // NOTE(review): the original comment said npages = (na-1)/12*2 + 1, which does
+ // not match the listed values — confirm the exact formula against the host code.
+
+ //Work group size is [24, 2]
+ //global work size is [96*number of blocks, 8]
+ //each work item in each work group is responsible for 12 elements (1/8) in that row
+ //each work group is responsible for 24 by 24 macro tile;
+
+ ////////////// A12*invA22
+ const uint gidx = get_group_id(0);
+ const uint gidy = get_group_id(1);
+ const uint idx = get_local_id(0);
+ const uint idy = get_local_id(1);
+
+ //uint page = gidx / 2;//0-1 for 192; 0 for 96
+ //const uint page = (gidx/4)%1;//index of page within a page_block; 1 pages per page_block
+ const uint page_block = gidx/4;//#index of page_block; 4 WG per page; 4 WG per page_block
+
+
+ __global double *B, *C;
+ __local double lA[24][48];
+ __local double lB[48][24];
+ double privateC[12] = {(double)0};
+
+ //decide A12 location for each page
+ //each workgroup loads 1/4 of A (left or right)
+ Ain = Ain + offAin;
+ Ain += page_block*NB*lda + page_block*NB + blk*lda + gidx%4*(blk/4);
+
+ //decide invA22 (B) location for each page
+ //each workgroup loads 1/4 of B (up or down)
+ B = d_dinvA + page_block*NB*NB + blk*NB + blk + gidy*(blk/4)*NB;
+
+ //decide invA12 location for each page;
+ //Actually this will be stored in invA21 temporarily
+ //each workgroup writes 1/4*1/4 of C
+ C = d_dinvA + page_block*NB*NB + blk*NB + gidx%4*(blk/4) + gidy*(blk/4)*NB;
+
+ //read A and B into LDS no transpose operated here
+ //each work item loads a half row of A and half column of B
+ //each loop loads 1/4 row of A and 1/4 column of B
+ //idx 0-23 idy 0-1
+
+ uint block_k = blk / 48; //thus we need 2 iterations here
+ do{
+ barrier(CLK_LOCAL_MEM_FENCE); // don't overwrite lA/lB while the previous pass is still reading
+ lA[idx][0+idy*24] = Ain[idx + idy*24*lda];
+ lA[idx][1+idy*24] = Ain[idx + lda + idy*24*lda];
+ lA[idx][2+idy*24] = Ain[idx + lda*2 + idy*24*lda];
+ lA[idx][3+idy*24] = Ain[idx + lda*3 + idy*24*lda];
+ lA[idx][4+idy*24] = Ain[idx + lda*4 + idy*24*lda];
+ lA[idx][5+idy*24] = Ain[idx + lda*5 + idy*24*lda];
+ lA[idx][6+idy*24] = Ain[idx + lda*6 + idy*24*lda];
+ lA[idx][7+idy*24] = Ain[idx + lda*7 + idy*24*lda];
+ lA[idx][8+idy*24] = Ain[idx + lda*8 + idy*24*lda];
+ lA[idx][9+idy*24] = Ain[idx + lda*9 + idy*24*lda];
+ lA[idx][10+idy*24] = Ain[idx + lda*10 + idy*24*lda];
+ lA[idx][11+idy*24] = Ain[idx + lda*11 + idy*24*lda];
+ lA[idx][12+idy*24] = Ain[idx + lda*12 + idy*24*lda];
+ lA[idx][13+idy*24] = Ain[idx + lda*13 + idy*24*lda];
+ lA[idx][14+idy*24] = Ain[idx + lda*14 + idy*24*lda];
+ lA[idx][15+idy*24] = Ain[idx + lda*15 + idy*24*lda];
+ lA[idx][16+idy*24] = Ain[idx + lda*16 + idy*24*lda];
+ lA[idx][17+idy*24] = Ain[idx + lda*17 + idy*24*lda];
+ lA[idx][18+idy*24] = Ain[idx + lda*18 + idy*24*lda];
+ lA[idx][19+idy*24] = Ain[idx + lda*19 + idy*24*lda];
+ lA[idx][20+idy*24] = Ain[idx + lda*20 + idy*24*lda];
+ lA[idx][21+idy*24] = Ain[idx + lda*21 + idy*24*lda];
+ lA[idx][22+idy*24] = Ain[idx + lda*22 + idy*24*lda];
+ lA[idx][23+idy*24] = Ain[idx + lda*23 + idy*24*lda];
+
+ lB[0+idy*24][idx] = B[idx*NB + idy*24];
+ lB[1+idy*24][idx] = B[idx*NB + idy*24 + 1];
+ lB[2+idy*24][idx] = B[idx*NB + idy*24 + 2];
+ lB[3+idy*24][idx] = B[idx*NB + idy*24 + 3];
+ lB[4+idy*24][idx] = B[idx*NB + idy*24 + 4];
+ lB[5+idy*24][idx] = B[idx*NB + idy*24 + 5];
+ lB[6+idy*24][idx] = B[idx*NB + idy*24 + 6];
+ lB[7+idy*24][idx] = B[idx*NB + idy*24 + 7];
+ lB[8+idy*24][idx] = B[idx*NB + idy*24 + 8];
+ lB[9+idy*24][idx] = B[idx*NB + idy*24 + 9];
+ lB[10+idy*24][idx] = B[idx*NB + idy*24 + 10];
+ lB[11+idy*24][idx] = B[idx*NB + idy*24 + 11];
+ lB[12+idy*24][idx] = B[idx*NB + idy*24 + 12];
+ lB[13+idy*24][idx] = B[idx*NB + idy*24 + 13];
+ lB[14+idy*24][idx] = B[idx*NB + idy*24 + 14];
+ lB[15+idy*24][idx] = B[idx*NB + idy*24 + 15];
+ lB[16+idy*24][idx] = B[idx*NB + idy*24 + 16];
+ lB[17+idy*24][idx] = B[idx*NB + idy*24 + 17];
+ lB[18+idy*24][idx] = B[idx*NB + idy*24 + 18];
+ lB[19+idy*24][idx] = B[idx*NB + idy*24 + 19];
+ lB[20+idy*24][idx] = B[idx*NB + idy*24 + 20];
+ lB[21+idy*24][idx] = B[idx*NB + idy*24 + 21];
+ lB[22+idy*24][idx] = B[idx*NB + idy*24 + 22];
+ lB[23+idy*24][idx] = B[idx*NB + idy*24 + 23];
+ barrier(CLK_LOCAL_MEM_FENCE); // lA/lB fully staged before this 48-deep pass
+
+ //do math: accumulate one 1x12 strip over this 48-wide slice of k
+
+ uint i = 0;
+
+ do{
+ privateC[0]=mad(lA[idx][i],lB[i][0+idy*12],privateC[0]);
+ privateC[1]=mad(lA[idx][i],lB[i][1+idy*12],privateC[1]);
+ privateC[2]=mad(lA[idx][i],lB[i][2+idy*12],privateC[2]);
+ privateC[3]=mad(lA[idx][i],lB[i][3+idy*12],privateC[3]);
+ privateC[4]=mad(lA[idx][i],lB[i][4+idy*12],privateC[4]);
+ privateC[5]=mad(lA[idx][i],lB[i][5+idy*12],privateC[5]);
+ privateC[6]=mad(lA[idx][i],lB[i][6+idy*12],privateC[6]);
+ privateC[7]=mad(lA[idx][i],lB[i][7+idy*12],privateC[7]);
+ privateC[8]=mad(lA[idx][i],lB[i][8+idy*12],privateC[8]);
+ privateC[9]=mad(lA[idx][i],lB[i][9+idy*12],privateC[9]);
+ privateC[10]=mad(lA[idx][i],lB[i][10+idy*12],privateC[10]);
+ privateC[11]=mad(lA[idx][i],lB[i][11+idy*12],privateC[11]);
+ i = i + 1;
+ }while(i < 48);
+
+ // advance both operands to the next 48-wide k slice
+ Ain += 48*lda;
+ B += 48;
+ }while(--block_k>0);
+
+ uint i = 0;
+ do{
+ C[NB*idy*12+NB*i+idx] = privateC[i];
+ i = i + 1;
+ }while(i < 12);
+
+
+}
+
+__kernel void
+TRIPLE_DGEMM_UPDATE_96_PART2_R (__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
+{
+ // Second half of the 96->192 inversion step: overwrites the temporary
+ // A12*invA22 (left in the invA21 slot by PART1) with -invA11*(A12*invA22).
+ // The k=96 depth is processed in two 48-wide LDS passes (see block_k loop).
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 96 here (original comment said 48 — stale copy-paste).
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/(2*blk) + 1; for 192 this is 1, for 384 this is 2
+ // NOTE(review): the original comment quoted the 48-kernel values (96->1, 192->2)
+ // and formula (na-1)/12*2 + 1 — confirm against the host code.
+
+ //Work group size is [24, 2]
+ //global work size is [96*number of blocks, 8]
+ //NOTE(review): original comment said [48*number of blocks, 4]; the gidx%4 /
+ //gidy*(blk/4) indexing below matches the PART1 kernel's [96*..., 8] launch.
+ //each work item in each work group is responsible for 12 elements (1/8) in that row
+ //each work group is responsible for 24 by 24 macro tile;
+
+ ////////////// -invA11*invA12
+ const uint gidx = get_group_id(0);
+ const uint gidy = get_group_id(1);
+ const uint idx = get_local_id(0);
+ const uint idy = get_local_id(1);
+
+ //uint page = gidx / 2;//0-1 for 192; 0 for 96
+ //const uint page = (gidx/2)%2;//index of page within a page_block; 1 pages per page_block
+ const uint page_block = gidx/4;//#index of page_block; 4 WG per page; 4 WG per page_block
+
+ __global double *A, *B, *C;
+ __local double lA[24][48];
+ __local double lB[48][24];
+ double privateC[12] = {(double)0};
+
+ //decide invA11 location for each page
+ //each workgroup loads 1/4 of A (left or right)
+ //A = d_dinvA + page*NB*NB + gidx%2*(blk/2);
+ A = d_dinvA + page_block*NB*NB + gidx%4*(blk/4);
+
+ //decide invA12 (B) location for each page
+ //actually it was saved in invA21 from last kernel
+ //each workgroup loads 1/4 of B (up or down)
+ //B = d_dinvA + page*NB*NB + blk*NB + gidy*(blk/2)*NB;
+ B = d_dinvA + page_block*NB*NB + blk*NB + gidy*(blk/4)*NB;
+
+ //decide invA12 location for each page
+ //each workgroup writes 1/4*1/4 of C
+ //C = d_dinvA + page*NB*NB + blk * NB + gidx%2*(blk/2) + gidy*(blk/2)*NB;
+ C = d_dinvA + page_block*NB*NB + blk*NB + gidx%4*(blk/4) + gidy*(blk/4)*NB;
+
+ //read A and B into LDS no transpose operated here
+ //each work item loads a half row of A and half column of B
+ //idx 0-23 idy 0-1
+
+ uint block_k = blk / 48; //thus we need 2 iterations here
+ do{
+ barrier(CLK_LOCAL_MEM_FENCE); // don't overwrite lA/lB while the previous pass is still reading
+ lA[idx][0+idy*24] = A[idx + idy*24*NB];
+ lA[idx][1+idy*24] = A[idx + NB + idy*24*NB];
+ lA[idx][2+idy*24] = A[idx + NB*2 + idy*24*NB];
+ lA[idx][3+idy*24] = A[idx + NB*3 + idy*24*NB];
+ lA[idx][4+idy*24] = A[idx + NB*4 + idy*24*NB];
+ lA[idx][5+idy*24] = A[idx + NB*5 + idy*24*NB];
+ lA[idx][6+idy*24] = A[idx + NB*6 + idy*24*NB];
+ lA[idx][7+idy*24] = A[idx + NB*7 + idy*24*NB];
+ lA[idx][8+idy*24] = A[idx + NB*8 + idy*24*NB];
+ lA[idx][9+idy*24] = A[idx + NB*9 + idy*24*NB];
+ lA[idx][10+idy*24] = A[idx + NB*10 + idy*24*NB];
+ lA[idx][11+idy*24] = A[idx + NB*11 + idy*24*NB];
+ lA[idx][12+idy*24] = A[idx + NB*12 + idy*24*NB];
+ lA[idx][13+idy*24] = A[idx + NB*13 + idy*24*NB];
+ lA[idx][14+idy*24] = A[idx + NB*14 + idy*24*NB];
+ lA[idx][15+idy*24] = A[idx + NB*15 + idy*24*NB];
+ lA[idx][16+idy*24] = A[idx + NB*16 + idy*24*NB];
+ lA[idx][17+idy*24] = A[idx + NB*17 + idy*24*NB];
+ lA[idx][18+idy*24] = A[idx + NB*18 + idy*24*NB];
+ lA[idx][19+idy*24] = A[idx + NB*19 + idy*24*NB];
+ lA[idx][20+idy*24] = A[idx + NB*20 + idy*24*NB];
+ lA[idx][21+idy*24] = A[idx + NB*21 + idy*24*NB];
+ lA[idx][22+idy*24] = A[idx + NB*22 + idy*24*NB];
+ lA[idx][23+idy*24] = A[idx + NB*23 + idy*24*NB];
+
+ lB[0+idy*24][idx] = B[idx*NB + idy*24];
+ lB[1+idy*24][idx] = B[idx*NB + idy*24 + 1];
+ lB[2+idy*24][idx] = B[idx*NB + idy*24 + 2];
+ lB[3+idy*24][idx] = B[idx*NB + idy*24 + 3];
+ lB[4+idy*24][idx] = B[idx*NB + idy*24 + 4];
+ lB[5+idy*24][idx] = B[idx*NB + idy*24 + 5];
+ lB[6+idy*24][idx] = B[idx*NB + idy*24 + 6];
+ lB[7+idy*24][idx] = B[idx*NB + idy*24 + 7];
+ lB[8+idy*24][idx] = B[idx*NB + idy*24 + 8];
+ lB[9+idy*24][idx] = B[idx*NB + idy*24 + 9];
+ lB[10+idy*24][idx] = B[idx*NB + idy*24 + 10];
+ lB[11+idy*24][idx] = B[idx*NB + idy*24 + 11];
+ lB[12+idy*24][idx] = B[idx*NB + idy*24 + 12];
+ lB[13+idy*24][idx] = B[idx*NB + idy*24 + 13];
+ lB[14+idy*24][idx] = B[idx*NB + idy*24 + 14];
+ lB[15+idy*24][idx] = B[idx*NB + idy*24 + 15];
+ lB[16+idy*24][idx] = B[idx*NB + idy*24 + 16];
+ lB[17+idy*24][idx] = B[idx*NB + idy*24 + 17];
+ lB[18+idy*24][idx] = B[idx*NB + idy*24 + 18];
+ lB[19+idy*24][idx] = B[idx*NB + idy*24 + 19];
+ lB[20+idy*24][idx] = B[idx*NB + idy*24 + 20];
+ lB[21+idy*24][idx] = B[idx*NB + idy*24 + 21];
+ lB[22+idy*24][idx] = B[idx*NB + idy*24 + 22];
+ lB[23+idy*24][idx] = B[idx*NB + idy*24 + 23];
+ barrier(CLK_LOCAL_MEM_FENCE); // lA/lB fully staged before this 48-deep pass
+
+ //do math: accumulate one 1x12 strip over this 48-wide slice of k
+
+ uint i = 0;
+
+ do{
+ privateC[0]=mad(lA[idx][i],lB[i][0+idy*12],privateC[0]);
+ privateC[1]=mad(lA[idx][i],lB[i][1+idy*12],privateC[1]);
+ privateC[2]=mad(lA[idx][i],lB[i][2+idy*12],privateC[2]);
+ privateC[3]=mad(lA[idx][i],lB[i][3+idy*12],privateC[3]);
+ privateC[4]=mad(lA[idx][i],lB[i][4+idy*12],privateC[4]);
+ privateC[5]=mad(lA[idx][i],lB[i][5+idy*12],privateC[5]);
+ privateC[6]=mad(lA[idx][i],lB[i][6+idy*12],privateC[6]);
+ privateC[7]=mad(lA[idx][i],lB[i][7+idy*12],privateC[7]);
+ privateC[8]=mad(lA[idx][i],lB[i][8+idy*12],privateC[8]);
+ privateC[9]=mad(lA[idx][i],lB[i][9+idy*12],privateC[9]);
+ privateC[10]=mad(lA[idx][i],lB[i][10+idy*12],privateC[10]);
+ privateC[11]=mad(lA[idx][i],lB[i][11+idy*12],privateC[11]);
+ i = i + 1;
+ }while(i < 48);
+
+ // advance both operands to the next 48-wide k slice
+ A += 48*NB;
+ B += 48;
+
+ }while(--block_k>0);
+
+ uint i = 0;
+ do{
+ // store the negated product: invA12 = -invA11*(A12*invA22)
+ C[NB*idy*12+NB*i+idx] = -1 * privateC[i];
+ i = i + 1;
+ }while(i < 12);
+
+
+}
+
+";
\ No newline at end of file
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits
mailing list