[clblas] 35/67: dtrsm 192 trtri
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Oct 27 08:02:13 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clblas.
commit ba1bbdd303f3cd2b84ee1a2c11adeee6cb36e94d
Author: Timmy <timmy.liu at amd.com>
Date: Thu Sep 10 17:08:36 2015 -0500
dtrsm 192 trtri
---
src/library/CMakeLists.txt | 1 +
src/library/blas/trtri/TrtriClKernels.h | 16 +
.../blas/trtri/TrtriKernelSourceIncludes.cpp | 19 +
src/library/blas/trtri/TrtriKernelSourceIncludes.h | 45 ++
.../blas/trtri/diag_dtrtri_upper_192_12.cpp | 148 +++++
.../blas/trtri/triple_dgemm_update_192_12_R.cpp | 193 ++++++
.../trtri/triple_dgemm_update_192_24_PART1_R.cpp | 116 ++++
.../trtri/triple_dgemm_update_192_24_PART2_R.cpp | 111 ++++
.../trtri/triple_dgemm_update_192_48_PART1_R.cpp | 143 +++++
.../trtri/triple_dgemm_update_192_48_PART2_R.cpp | 144 +++++
.../trtri/triple_dgemm_update_192_96_PART1_R.cpp | 155 +++++
.../trtri/triple_dgemm_update_192_96_PART2_R.cpp | 156 +++++
src/library/blas/xtrsm.cc | 677 +++++++++++++++++++++
13 files changed, 1924 insertions(+)
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 325644e..9a6b2f9 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -641,6 +641,7 @@ include_directories(${OPENCL_INCLUDE_DIRS}
${clBLAS_SOURCE_DIR}/library/blas/AutoGemm
${clBLAS_SOURCE_DIR}/library/blas/AutoGemm/UserGemmKernelSources
${clBLAS_SOURCE_DIR}/library/blas/specialCases/include
+ ${clBLAS_SOURCE_DIR}/library/blas/trtri
)
option( BLAS_DUMP_CLBLAS_KERNELS "Force the library to dump OpenCL kernels to disk" OFF )
diff --git a/src/library/blas/trtri/TrtriClKernels.h b/src/library/blas/trtri/TrtriClKernels.h
new file mode 100644
index 0000000..dbc9744
--- /dev/null
+++ b/src/library/blas/trtri/TrtriClKernels.h
@@ -0,0 +1,16 @@
+
+#ifndef TRTRI_CL_KERNELS_H
+#define TRTRI_CL_KERNELS_H
+#include "CL/cl.h"
+
+static cl_kernel diag_dtrtri_upper_192_12_clKernel = NULL;
+static cl_kernel triple_dgemm_update_192_12_R = NULL;
+static cl_kernel triple_dgemm_update_192_24_PART1_R = NULL;
+static cl_kernel triple_dgemm_update_192_24_PART2_R = NULL;
+static cl_kernel triple_dgemm_update_192_48_PART1_R = NULL;
+static cl_kernel triple_dgemm_update_192_48_PART2_R = NULL;
+static cl_kernel triple_dgemm_update_192_96_PART1_R = NULL;
+static cl_kernel triple_dgemm_update_192_96_PART2_R = NULL;
+
+
+#endif
\ No newline at end of file
diff --git a/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp b/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
new file mode 100644
index 0000000..8750cee
--- /dev/null
+++ b/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
@@ -0,0 +1,19 @@
+/*******************************************************************************
+ * This file is NOT auto-generated; populate it with hand-written kernels
+ ******************************************************************************/
+
+
+#ifndef TRTRI_SOURCE_INCLUDES_CPP
+#define TRTRI_SOURCE_INCLUDES_CPP
+
+#include "diag_dtrtri_upper_192_12.cpp"
+#include "triple_dgemm_update_192_12_R.cpp"
+#include "triple_dgemm_update_192_24_PART1_R.cpp"
+#include "triple_dgemm_update_192_24_PART2_R.cpp"
+#include "triple_dgemm_update_192_48_PART1_R.cpp"
+#include "triple_dgemm_update_192_48_PART2_R.cpp"
+#include "triple_dgemm_update_192_96_PART1_R.cpp"
+#include "triple_dgemm_update_192_96_PART2_R.cpp"
+
+
+#endif
diff --git a/src/library/blas/trtri/TrtriKernelSourceIncludes.h b/src/library/blas/trtri/TrtriKernelSourceIncludes.h
new file mode 100644
index 0000000..a6e1834
--- /dev/null
+++ b/src/library/blas/trtri/TrtriKernelSourceIncludes.h
@@ -0,0 +1,45 @@
+
+#ifndef TRTRI_SOURCE_INCLUDES_H
+#define TRTRI_SOURCE_INCLUDES_H
+#include <cstddef>
+#include "TrtriKernelSourceIncludes.cpp"
+
+//**** compiler flags
+//**** online compilation flags
+const char * const TrtriBuildOptions = "-cl-std=CL2.0";
+const char * const TrtribinBuildOptions = "-cl-std=CL2.0";
+
+
+extern const char * const diag_dtrtri_upper_192_12_src;
+extern unsigned char *diag_dtrtri_upper_192_12_bin;
+extern size_t diag_dtrtri_upper_192_12_binSize;
+
+extern const char * const triple_dgemm_update_192_12_R_src;
+extern unsigned char *triple_dgemm_update_192_12_R_bin;
+extern size_t triple_dgemm_update_192_12_R_binSize;
+
+extern const char * const triple_dgemm_update_192_24_PART1_R_src;
+extern unsigned char *triple_dgemm_update_192_24_PART1_R_bin;
+extern size_t triple_dgemm_update_192_24_PART1_R_binSize;
+
+extern const char * const triple_dgemm_update_192_24_PART2_R_src;
+extern unsigned char *triple_dgemm_update_192_24_PART2_R_bin;
+extern size_t triple_dgemm_update_192_24_PART2_R_binSize;
+
+extern const char * const triple_dgemm_update_192_48_PART1_R_src;
+extern unsigned char *triple_dgemm_update_192_48_PART1_R_bin;
+extern size_t triple_dgemm_update_192_48_PART1_R_binSize;
+
+extern const char * const triple_dgemm_update_192_48_PART2_R_src;
+extern unsigned char *triple_dgemm_update_192_48_PART2_R_bin;
+extern size_t triple_dgemm_update_192_48_PART2_R_binSize;
+
+extern const char * const triple_dgemm_update_192_96_PART1_R_src;
+extern unsigned char *triple_dgemm_update_192_96_PART1_R_bin;
+extern size_t triple_dgemm_update_192_96_PART1_R_binSize;
+
+extern const char * const triple_dgemm_update_192_96_PART2_R_src;
+extern unsigned char *triple_dgemm_update_192_96_PART2_R_bin;
+extern size_t triple_dgemm_update_192_96_PART2_R_binSize;
+
+#endif
diff --git a/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp b/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp
new file mode 100644
index 0000000..b4409c5
--- /dev/null
+++ b/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp
@@ -0,0 +1,148 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ ******************************************************************************/
+
+#ifndef KERNEL_DIAG_DTRTRI_UPPER_192_12_SRC_CPP
+#define KERNEL_DIAG_DTRTRI_UPPER_192_12_SRC_CPP
+#pragma message("#define KERNEL_DIAG_DTRTRI_UPPER_192_12_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *diag_dtrtri_upper_192_12_bin = 0;
+size_t diag_dtrtri_upper_192_12_binSize = 0;
+
+const char * const diag_dtrtri_upper_192_12_src = STRINGIFY(
+#define BLOCK_SIZE 12 \n
+#define NB 192 \n
+#define ZERO ( 0.0) \n
+#define ONE ( 1.0) \n
+#ifdef DOUBLE_PRECISION \n
+#ifdef cl_khr_fp64 \n
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n
+#else \n
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable \n
+#endif \n
+#endif \n
+__kernel void diag_dtrtri_upper_192_12_src(\n
+int isDiagUnit,\n
+__global double const * restrict A, \n
+uint offA, \n
+__global double *d_dinvA, \n
+uint lda, \n
+uint na)\n
+{ \n
+int i, j;\n
+double Ystx = 0; \n
+__local double *y = 0; \n
+double switcher; \n
+double neg_switcher; \n
+
+// Thread index
+int tx = get_local_id(0); \n
+
+// Thread index
+int gx = get_global_id(0); \n
+
+// Block index
+int bx = get_group_id(0); \n
+
+A = A + offA; \n
+
+__global const double *Aoff = A + bx*lda*BLOCK_SIZE + bx*BLOCK_SIZE; \n
+int NumBLperNB = NB / BLOCK_SIZE; \n
+d_dinvA += bx / NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE); \n
+
+__local double Bs[BLOCK_SIZE*BLOCK_SIZE]; \n
+__local double workspace[BLOCK_SIZE];\n // workspace used to store the current working column
+
+// load A \n
+#pragma unroll\n
+for (i = 0; i < BLOCK_SIZE; i++)\n
+{ \n
+ if (tx <= i && i + bx*BLOCK_SIZE < na)\n
+ { \n
+ Bs[i*BLOCK_SIZE + tx] = *(Aoff + i*lda + tx); \n
+ }\n
+ else\n
+ { \n
+ Bs[i*BLOCK_SIZE + tx] = ZERO; \n
+ }\n
+}\n
+// read in the whole square block of my A and zero out the non data triangular
+
+// Synchronize to make sure the matrices are loaded
+//__syncthreads();
+barrier(CLK_LOCAL_MEM_FENCE); \n
+
+// solve the diagonals
+
+if (isDiagUnit == 1)\n
+{ \n
+ Bs[tx*BLOCK_SIZE + tx] = ONE; \n
+}\n
+else\n
+{ \n
+ if (Bs[tx*BLOCK_SIZE + tx] == ZERO)\n
+ { \n
+ Bs[tx*BLOCK_SIZE + tx] = ONE; \n
+ }\n
+ else\n
+ { \n
+ Bs[tx*BLOCK_SIZE + tx] = ONE / (Bs[tx*BLOCK_SIZE + tx]); \n
+ }\n
+}\n
+
+
+/* the upper case */
+for (i = 0; i < BLOCK_SIZE; i++) {\n
+ Ystx = ZERO; \n
+ if (tx < i)\n
+ { \n
+ switcher = ONE; \n
+ }\n
+ else\n
+ { \n
+ switcher = ZERO; \n
+ }\n
+
+ //dtrmv
+ workspace[tx] = *(Bs + i*BLOCK_SIZE + tx); \n
+ y = Bs + i*BLOCK_SIZE; \n
+
+#pragma unroll\n
+ //for( j=tx; j < i; j++ )
+ for (j = 0; j < i; j++)\n
+ Ystx += switcher * (*(Bs + j*BLOCK_SIZE + tx)*workspace[j]); \n
+
+ //sscal
+ // if (tx != i) y[tx]=switcher*Ystx*(-Bs[i*BLOCK_SIZE+i]);
+
+ if (tx != i)\n
+ { \n
+ switcher = ONE; \n
+ neg_switcher = ZERO; \n
+ }\n
+ else\n
+ { \n
+ switcher = ZERO; \n
+ neg_switcher = ONE; \n
+ }\n
+
+ y[tx] = switcher *Ystx*(-Bs[i*BLOCK_SIZE + i]) + neg_switcher*y[tx]; \n
+
+ // __syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+}\n
+
+// write back A
+#pragma unroll\n
+for (i = 0; i < BLOCK_SIZE; i++)\n
+ *(d_dinvA + i*NB + tx) = Bs[i*BLOCK_SIZE + tx]; \n
+
+
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp
new file mode 100644
index 0000000..ca5529f
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp
@@ -0,0 +1,193 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+
+ * B21 = -inv(A11)*A12*inv(A22)
+ * 12 to 24
+
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_12_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_192_12_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_12_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_192_12_R_bin = 0;
+size_t triple_dgemm_update_192_12_R_binSize = 0;
+
+const char * const triple_dgemm_update_192_12_R_src = STRINGIFY(
+#define NB 192\n
+ __kernel void TRIPLE_DGEMM_UPDATE_192_12_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)\n
+{\n
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 12 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/12*2 + 1; for 96 this is 4 for 192 this is 8
+
+ //Work group size is [12]
+ //global work size is [96*number of blocks]
+ //each work item in each work group is responsible for every element in that row
+ //each work group is responsible for one gemm;\
+
+
+ ////////////// A12*invA22
+ const uint gidx = get_group_id(0);\n
+ const uint idx = get_local_id(0);\n
+
+ const uint page = gidx % npages;\n
+ const uint page_block = page / 8;\n//8 pages per page block
+ const uint page_index_in_block = page % 8;\n
+
+
+ __global double *B, *C;\n
+ __local double lA[12][12];\n
+ __local double lB[12][12];\n
+ double privateC[12] = { (double)0 };\n
+
+ //decide A12 location for each page
+ Ain = Ain + offAin;\n
+ Ain += (page*blk * 2 + blk) * lda + page * 2 * blk;\n
+
+ //decide invA22 (B) location for each page
+ B = d_dinvA + page_block*NB*NB + (page_index_in_block*blk * 2 + blk) * NB + page_index_in_block * 2 * blk + blk;\n
+
+ //decide invA12 location for each page
+ C = d_dinvA + page_block*NB*NB + (page_index_in_block*blk * 2 + blk) * NB + page_index_in_block * 2 * blk;\n
+
+ //read A and B into LDS no transpose operated here
+ lA[idx][0] = Ain[idx];\n
+ lA[idx][1] = Ain[idx + lda];\n
+ lA[idx][2] = Ain[idx + lda * 2];\n
+ lA[idx][3] = Ain[idx + lda * 3];\n
+ lA[idx][4] = Ain[idx + lda * 4];\n
+ lA[idx][5] = Ain[idx + lda * 5];\n
+ lA[idx][6] = Ain[idx + lda * 6];\n
+ lA[idx][7] = Ain[idx + lda * 7];\n
+ lA[idx][8] = Ain[idx + lda * 8];\n
+ lA[idx][9] = Ain[idx + lda * 9];\n
+ lA[idx][10] = Ain[idx + lda * 10];\n
+ lA[idx][11] = Ain[idx + lda * 11];\n
+
+ lB[idx][0] = B[idx];\n
+ lB[idx][1] = B[idx + NB];\n
+ lB[idx][2] = B[idx + NB * 2];\n
+ lB[idx][3] = B[idx + NB * 3];\n
+ lB[idx][4] = B[idx + NB * 4];\n
+ lB[idx][5] = B[idx + NB * 5];\n
+ lB[idx][6] = B[idx + NB * 6];\n
+ lB[idx][7] = B[idx + NB * 7];\n
+ lB[idx][8] = B[idx + NB * 8];\n
+ lB[idx][9] = B[idx + NB * 9];\n
+ lB[idx][10] = B[idx + NB * 10];\n
+ lB[idx][11] = B[idx + NB * 11];\n
+ barrier(CLK_LOCAL_MEM_FENCE);\n
+
+ //do math
+
+ uint i = 0;\n
+
+ do{\n
+ privateC[0] = mad(lA[idx][i], lB[i][0], privateC[0]);\n
+ privateC[1] = mad(lA[idx][i], lB[i][1], privateC[1]);\n
+ privateC[2] = mad(lA[idx][i], lB[i][2], privateC[2]);\n
+ privateC[3] = mad(lA[idx][i], lB[i][3], privateC[3]);\n
+ privateC[4] = mad(lA[idx][i], lB[i][4], privateC[4]);\n
+ privateC[5] = mad(lA[idx][i], lB[i][5], privateC[5]);\n
+ privateC[6] = mad(lA[idx][i], lB[i][6], privateC[6]);\n
+ privateC[7] = mad(lA[idx][i], lB[i][7], privateC[7]);\n
+ privateC[8] = mad(lA[idx][i], lB[i][8], privateC[8]);\n
+ privateC[9] = mad(lA[idx][i], lB[i][9], privateC[9]);\n
+ privateC[10] = mad(lA[idx][i], lB[i][10], privateC[10]);\n
+ privateC[11] = mad(lA[idx][i], lB[i][11], privateC[11]);\n
+ //mem_fence(CLK_LOCAL_MEM_FENCE);
+ i = i + 1;\n
+ } while (i < 12);\n
+
+ i = 0;\n
+ do{\n
+ C[NB*i + idx] = privateC[i];\n
+ i = i + 1;\n
+ } while (i < 12);\n
+
+ ////////////// -invA11*invA12
+ barrier(CLK_GLOBAL_MEM_FENCE);\n
+ //A is moving to invA11
+ __global double *A;\n
+ A = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2) * NB + (page % 4) * 2 * blk;\n
+ //both B and C are pointing at invA12
+ B = C;\n
+
+ //read A and B into LDS no transpose operated here
+ lA[idx][0] = A[idx];\n
+ lA[idx][1] = A[idx + NB];\n
+ lA[idx][2] = A[idx + NB * 2];\n
+ lA[idx][3] = A[idx + NB * 3];\n
+ lA[idx][4] = A[idx + NB * 4];\n
+ lA[idx][5] = A[idx + NB * 5];\n
+ lA[idx][6] = A[idx + NB * 6];\n
+ lA[idx][7] = A[idx + NB * 7];\n
+ lA[idx][8] = A[idx + NB * 8];\n
+ lA[idx][9] = A[idx + NB * 9];\n
+ lA[idx][10] = A[idx + NB * 10];\n
+ lA[idx][11] = A[idx + NB * 11];\n
+
+ lB[idx][0] = B[idx];\n
+ lB[idx][1] = B[idx + NB];\n
+ lB[idx][2] = B[idx + NB * 2];\n
+ lB[idx][3] = B[idx + NB * 3];\n
+ lB[idx][4] = B[idx + NB * 4];\n
+ lB[idx][5] = B[idx + NB * 5];\n
+ lB[idx][6] = B[idx + NB * 6];\n
+ lB[idx][7] = B[idx + NB * 7];\n
+ lB[idx][8] = B[idx + NB * 8];\n
+ lB[idx][9] = B[idx + NB * 9];\n
+ lB[idx][10] = B[idx + NB * 10];\n
+ lB[idx][11] = B[idx + NB * 11];\n
+ barrier(CLK_LOCAL_MEM_FENCE);\n
+
+ //do math
+
+ i = 0;\n
+ privateC[0] = 0;\n
+ privateC[1] = 0;\n
+ privateC[2] = 0;\n
+ privateC[3] = 0;\n
+ privateC[4] = 0;\n
+ privateC[5] = 0;\n
+ privateC[6] = 0;\n
+ privateC[7] = 0;\n
+ privateC[8] = 0;\n
+ privateC[9] = 0;\n
+ privateC[10] = 0;\n
+ privateC[11] = 0;\n
+ do{\n
+ privateC[0] = mad(lA[idx][i], lB[i][0], privateC[0]);\n
+ privateC[1] = mad(lA[idx][i], lB[i][1], privateC[1]);\n
+ privateC[2] = mad(lA[idx][i], lB[i][2], privateC[2]);\n
+ privateC[3] = mad(lA[idx][i], lB[i][3], privateC[3]);\n
+ privateC[4] = mad(lA[idx][i], lB[i][4], privateC[4]);\n
+ privateC[5] = mad(lA[idx][i], lB[i][5], privateC[5]);\n
+ privateC[6] = mad(lA[idx][i], lB[i][6], privateC[6]);\n
+ privateC[7] = mad(lA[idx][i], lB[i][7], privateC[7]);\n
+ privateC[8] = mad(lA[idx][i], lB[i][8], privateC[8]);\n
+ privateC[9] = mad(lA[idx][i], lB[i][9], privateC[9]);\n
+ privateC[10] = mad(lA[idx][i], lB[i][10], privateC[10]);\n
+ privateC[11] = mad(lA[idx][i], lB[i][11], privateC[11]);\n
+ //mem_fence(CLK_LOCAL_MEM_FENCE);
+ i = i + 1;\n
+ } while (i < 12);\n
+
+ i = 0;\n
+ do{\n
+ C[NB*i + idx] = -1 * privateC[i];\n
+ i = i + 1;\n
+ } while (i < 12);\n
+
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp
new file mode 100644
index 0000000..b23a276
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp
@@ -0,0 +1,116 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART1_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART1_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART1_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_192_24_PART1_R_bin = 0;
+size_t triple_dgemm_update_192_24_PART1_R_binSize = 0;
+
+const char * const triple_dgemm_update_192_24_PART1_R_src = STRINGIFY(
+#define NB 192\n
+ __kernel void TRIPLE_DGEMM_UPDATE_192_24_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)\n
+{ \n
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 24 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/12*2 + 1; for 96 this is 2 for 192 this is 4
+
+ //Work group size is [24, 2]
+ //global work size is [96*number of blocks, 2]
+ //each work item in each work group is responsible for 12 elements (half) in that row
+ //each work group is responsible for one gemm;
+
+
+ ////////////// A12*invA22
+ const uint gidx = get_group_id(0); \n
+ const uint gidy = get_group_id(1); \n
+ const uint idx = get_local_id(0); \n
+ const uint idy = get_local_id(1); \n
+
+ const uint page = gidx % npages; \n//0-3 for 192; 0-1 for 96
+ const uint page_block = page / 4; \n//4 pages per page block
+
+
+
+ __global double *B, *C; \n
+ __local double lA[24][24]; \n
+ __local double lB[24][24]; \n
+ double privateC[12] = { (double)0 }; \n
+
+ //decide A12 location for each page
+ Ain = Ain + offAin; \n
+ Ain += (page*blk * 2 + blk) * lda + page * 2 * blk; \n
+
+ //decide invA22 (B) location for each page
+ B = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk + blk; \n
+
+ //decide invA12 location for each page
+ C = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk; \n
+
+ //read A and B into LDS no transpose operated here
+ //each work iteam loads half a row
+ lA[idx][0 + idy * 12] = Ain[idx + idy * 12 * lda]; \n
+ lA[idx][1 + idy * 12] = Ain[idx + lda + idy * 12 * lda]; \n
+ lA[idx][2 + idy * 12] = Ain[idx + lda * 2 + idy * 12 * lda]; \n
+ lA[idx][3 + idy * 12] = Ain[idx + lda * 3 + idy * 12 * lda]; \n
+ lA[idx][4 + idy * 12] = Ain[idx + lda * 4 + idy * 12 * lda]; \n
+ lA[idx][5 + idy * 12] = Ain[idx + lda * 5 + idy * 12 * lda]; \n
+ lA[idx][6 + idy * 12] = Ain[idx + lda * 6 + idy * 12 * lda]; \n
+ lA[idx][7 + idy * 12] = Ain[idx + lda * 7 + idy * 12 * lda]; \n
+ lA[idx][8 + idy * 12] = Ain[idx + lda * 8 + idy * 12 * lda]; \n
+ lA[idx][9 + idy * 12] = Ain[idx + lda * 9 + idy * 12 * lda]; \n
+ lA[idx][10 + idy * 12] = Ain[idx + lda * 10 + idy * 12 * lda]; \n
+ lA[idx][11 + idy * 12] = Ain[idx + lda * 11 + idy * 12 * lda]; \n
+
+ lB[idx][0 + idy * 12] = B[idx + idy * 12 * NB]; \n
+ lB[idx][1 + idy * 12] = B[idx + NB + idy * 12 * NB]; \n
+ lB[idx][2 + idy * 12] = B[idx + NB * 2 + idy * 12 * NB]; \n
+ lB[idx][3 + idy * 12] = B[idx + NB * 3 + idy * 12 * NB]; \n
+ lB[idx][4 + idy * 12] = B[idx + NB * 4 + idy * 12 * NB]; \n
+ lB[idx][5 + idy * 12] = B[idx + NB * 5 + idy * 12 * NB]; \n
+ lB[idx][6 + idy * 12] = B[idx + NB * 6 + idy * 12 * NB]; \n
+ lB[idx][7 + idy * 12] = B[idx + NB * 7 + idy * 12 * NB]; \n
+ lB[idx][8 + idy * 12] = B[idx + NB * 8 + idy * 12 * NB]; \n
+ lB[idx][9 + idy * 12] = B[idx + NB * 9 + idy * 12 * NB]; \n
+ lB[idx][10 + idy * 12] = B[idx + NB * 10 + idy * 12 * NB]; \n
+ lB[idx][11 + idy * 12] = B[idx + NB * 11 + idy * 12 * NB]; \n
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ //do math
+
+ uint i = 0; \n
+
+ do{ \n
+ privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]); \n
+ privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]); \n
+ privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]); \n
+ privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]); \n
+ privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]); \n
+ privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]); \n
+ privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]); \n
+ privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]); \n
+ privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]); \n
+ privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]); \n
+ privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n
+ privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n
+ i = i + 1; \n
+ } while (i < 24); \n
+
+ i = 0; \n
+ do{ \n
+ C[NB*idy * 12 + NB*i + idx] = privateC[i]; \n
+ i = i + 1; \n
+ } while (i < 12); \n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp
new file mode 100644
index 0000000..e3a396c
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp
@@ -0,0 +1,111 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART2_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART2_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART2_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_192_24_PART2_R_bin = 0;
+size_t triple_dgemm_update_192_24_PART2_R_binSize = 0;
+
+const char * const triple_dgemm_update_192_24_PART2_R_src = STRINGIFY(
+#define NB 192\n
+ __kernel void TRIPLE_DGEMM_UPDATE_192_24_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 24 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/12*2 + 1; for 96 this is 2 for 192 this is 4
+
+ //Work group size is [24, 2]
+ //global work size is [96*number of blocks, 2]
+ //each work item in each work group is responsible for 12 elements (half) in that row
+ //each work group is responsible for one gemm;
+
+
+ ////////////// -invA11*invA12
+ const uint gidx = get_group_id(0);\n
+ const uint gidy = get_group_id(1);\n
+ const uint idx = get_local_id(0);\n
+ const uint idy = get_local_id(1);\n
+
+ const uint page = gidx % npages; \n//0-3 for 192; 0-1 for 96
+ const uint page_block = page / 4; \n//4 pages per page block
+
+ __global double *A, *B, *C; \n
+ __local double lA[24][24]; \n
+ __local double lB[24][24]; \n
+ double privateC[12] = { (double)0 }; \n
+
+ //decide invA11 location for each page
+ A = d_dinvA + page_block*NB*NB + (page % 4)*blk * 2 * NB + (page % 4) * 2 * blk; \n
+ //decide invA12 location for each page
+ B = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk; \n
+ C = B;
+ //C = d_dinvA + page_block*NB*NB + ((page%4)*blk*2) * NB + (page%4) * 2 * blk + blk;
+
+ //read A and B into LDS no transpose operated here
+ //each work iteam loads half a row
+ lA[idx][0 + idy * 12] = A[idx + idy * 12 * NB]; \n
+ lA[idx][1 + idy * 12] = A[idx + NB + idy * 12 * NB]; \n
+ lA[idx][2 + idy * 12] = A[idx + NB * 2 + idy * 12 * NB];\n
+ lA[idx][3 + idy * 12] = A[idx + NB * 3 + idy * 12 * NB];\n
+ lA[idx][4 + idy * 12] = A[idx + NB * 4 + idy * 12 * NB];\n
+ lA[idx][5 + idy * 12] = A[idx + NB * 5 + idy * 12 * NB];\n
+ lA[idx][6 + idy * 12] = A[idx + NB * 6 + idy * 12 * NB];\n
+ lA[idx][7 + idy * 12] = A[idx + NB * 7 + idy * 12 * NB];\n
+ lA[idx][8 + idy * 12] = A[idx + NB * 8 + idy * 12 * NB];\n
+ lA[idx][9 + idy * 12] = A[idx + NB * 9 + idy * 12 * NB];\n
+ lA[idx][10 + idy * 12] = A[idx + NB * 10 + idy * 12 * NB];\n
+ lA[idx][11 + idy * 12] = A[idx + NB * 11 + idy * 12 * NB];\n
+
+ lB[idx][0 + idy * 12] = B[idx + idy * 12 * NB];\n
+ lB[idx][1 + idy * 12] = B[idx + NB + idy * 12 * NB];\n
+ lB[idx][2 + idy * 12] = B[idx + NB * 2 + idy * 12 * NB];\n
+ lB[idx][3 + idy * 12] = B[idx + NB * 3 + idy * 12 * NB];\n
+ lB[idx][4 + idy * 12] = B[idx + NB * 4 + idy * 12 * NB];\n
+ lB[idx][5 + idy * 12] = B[idx + NB * 5 + idy * 12 * NB];\n
+ lB[idx][6 + idy * 12] = B[idx + NB * 6 + idy * 12 * NB];\n
+ lB[idx][7 + idy * 12] = B[idx + NB * 7 + idy * 12 * NB];\n
+ lB[idx][8 + idy * 12] = B[idx + NB * 8 + idy * 12 * NB];\n
+ lB[idx][9 + idy * 12] = B[idx + NB * 9 + idy * 12 * NB];\n
+ lB[idx][10 + idy * 12] = B[idx + NB * 10 + idy * 12 * NB];\n
+ lB[idx][11 + idy * 12] = B[idx + NB * 11 + idy * 12 * NB];\n
+ barrier(CLK_LOCAL_MEM_FENCE);\n
+ //do math
+
+ uint i = 0;\n
+
+ do{\n
+ privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);\n
+ privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);\n
+ privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);\n
+ privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);\n
+ privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);\n
+ privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);\n
+ privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);\n
+ privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);\n
+ privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);\n
+ privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);\n
+ privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]);\n
+ privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]);\n
+ i = i + 1;\n
+ } while (i < 24);\n
+
+ i = 0;\n
+ do{\n
+ C[NB*idy * 12 + NB*i + idx] = -1 * privateC[i];\n
+ i = i + 1;\n
+ } while (i < 12);\n
+
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp
new file mode 100644
index 0000000..473d989
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp
@@ -0,0 +1,143 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART1_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART1_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART1_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_192_48_PART1_R_bin = 0;
+size_t triple_dgemm_update_192_48_PART1_R_binSize = 0;
+
+const char * const triple_dgemm_update_192_48_PART1_R_src = STRINGIFY(
+#define NB 192\n
+ __kernel void TRIPLE_DGEMM_UPDATE_192_48_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 48 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2
+
+ //Work group size is [24, 2]
+ //global work size is [96*number of blocks, 4]
+ //each work item in each work group is responsible for 12 elements (1/4) in that row
+ //each work group is responsible for 24 by 24 macro tile;
+
+ ////////////// A12*invA22
+ const uint gidx = get_group_id(0);\n
+ const uint gidy = get_group_id(1);\n
+ const uint idx = get_local_id(0);\n
+ const uint idy = get_local_id(1);\n
+
+ //uint page = gidx / 2;//0-1 for 192; 0 for 96
+ const uint page = (gidx / 2) % 2; \n//index of page within a page_block; 2 pages per page_block
+ const uint page_block = gidx / 4; \n//#index of page_block; 2 WG per page; 4 WG per page_block
+
+ __global double *B, *C; \n
+ __local double lA[24][48]; \n
+ __local double lB[48][24]; \n
+ double privateC[12] = { (double)0 }; \n
+
+ //decide A12 location for each page
+ //each workgroup loads half of A (left or right)
+ Ain = Ain + offAin; \n
+ Ain += page_block*NB*lda + page_block*NB + page*blk * 2 * lda + page*blk * 2 + blk*lda + gidx % 2 * (blk / 2); \n
+
+ //decide invA22 (B) location for each page
+ //each workgroup loads half of B (up or down)
+ B = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + blk + gidy*(blk / 2)*NB; \n
+
+ //decide invA12 location for each page;
+ //Actually this will be stored in invA21 temporarily
+ //each workgroup writes 1/4 of C
+ C = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + gidx % 2 * (blk / 2) + gidy*(blk / 2)*NB; \n
+
+ //read A and B into LDS no transpose operated here
+ //each work item loads a half row of A and half column of B
+ //idx 0-23 idy 0-1
+ lA[idx][0 + idy * 24] = Ain[idx + idy * 24 * lda]; \n
+ lA[idx][1 + idy * 24] = Ain[idx + lda + idy * 24 * lda]; \n
+ lA[idx][2 + idy * 24] = Ain[idx + lda * 2 + idy * 24 * lda]; \n
+ lA[idx][3 + idy * 24] = Ain[idx + lda * 3 + idy * 24 * lda]; \n
+ lA[idx][4 + idy * 24] = Ain[idx + lda * 4 + idy * 24 * lda]; \n
+ lA[idx][5 + idy * 24] = Ain[idx + lda * 5 + idy * 24 * lda]; \n
+ lA[idx][6 + idy * 24] = Ain[idx + lda * 6 + idy * 24 * lda]; \n
+ lA[idx][7 + idy * 24] = Ain[idx + lda * 7 + idy * 24 * lda]; \n
+ lA[idx][8 + idy * 24] = Ain[idx + lda * 8 + idy * 24 * lda]; \n
+ lA[idx][9 + idy * 24] = Ain[idx + lda * 9 + idy * 24 * lda]; \n
+ lA[idx][10 + idy * 24] = Ain[idx + lda * 10 + idy * 24 * lda];\n
+ lA[idx][11 + idy * 24] = Ain[idx + lda * 11 + idy * 24 * lda];\n
+ lA[idx][12 + idy * 24] = Ain[idx + lda * 12 + idy * 24 * lda];\n
+ lA[idx][13 + idy * 24] = Ain[idx + lda * 13 + idy * 24 * lda];\n
+ lA[idx][14 + idy * 24] = Ain[idx + lda * 14 + idy * 24 * lda];\n
+ lA[idx][15 + idy * 24] = Ain[idx + lda * 15 + idy * 24 * lda];\n
+ lA[idx][16 + idy * 24] = Ain[idx + lda * 16 + idy * 24 * lda];\n
+ lA[idx][17 + idy * 24] = Ain[idx + lda * 17 + idy * 24 * lda];\n
+ lA[idx][18 + idy * 24] = Ain[idx + lda * 18 + idy * 24 * lda];\n
+ lA[idx][19 + idy * 24] = Ain[idx + lda * 19 + idy * 24 * lda];\n
+ lA[idx][20 + idy * 24] = Ain[idx + lda * 20 + idy * 24 * lda];\n
+ lA[idx][21 + idy * 24] = Ain[idx + lda * 21 + idy * 24 * lda];\n
+ lA[idx][22 + idy * 24] = Ain[idx + lda * 22 + idy * 24 * lda];\n
+ lA[idx][23 + idy * 24] = Ain[idx + lda * 23 + idy * 24 * lda];\n
+
+ lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; \n
+ lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1];\n
+ lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];\n
+ lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];\n
+ lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];\n
+ lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];\n
+ lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];\n
+ lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];\n
+ lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];\n
+ lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];\n
+ lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];\n
+ lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];\n
+ lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];\n
+ lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];\n
+ lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];\n
+ lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];\n
+ lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];\n
+ lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];\n
+ lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];\n
+ lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];\n
+ lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];\n
+ lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];\n
+ lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];\n
+ lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];\n
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ //do math
+
+ uint i = 0; \n
+
+ do{\n
+ privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);\n
+ privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);\n
+ privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);\n
+ privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);\n
+ privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);\n
+ privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);\n
+ privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);\n
+ privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);\n
+ privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);\n
+ privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);\n
+ privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n
+ privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n
+ i = i + 1; \n
+ } while (i < 48); \n
+
+ i = 0; \n
+ do{\n
+ C[NB*idy * 12 + NB*i + idx] = privateC[i]; \n
+ i = i + 1; \n
+ } while (i < 12); \n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp
new file mode 100644
index 0000000..5315196
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp
@@ -0,0 +1,144 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART2_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART2_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART2_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_192_48_PART2_R_bin = 0;
+size_t triple_dgemm_update_192_48_PART2_R_binSize = 0;
+
+const char * const triple_dgemm_update_192_48_PART2_R_src = STRINGIFY(
+#define NB 192\n
+ __kernel void TRIPLE_DGEMM_UPDATE_192_48_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 48 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2
+
+ //Work group size is [24, 2]
+ //global work size is [96*number of blocks, 4]
+ //each work item in each work group is responsible for 12 elements (1/4) in that row
+ //each work group is responsible for 24 by 24 macro tile;
+
+ ////////////// -invA11*invA12
+ const uint gidx = get_group_id(0);\n
+ const uint gidy = get_group_id(1); \n
+ const uint idx = get_local_id(0); \n
+ const uint idy = get_local_id(1); \n
+
+ //uint page = gidx / 2;//0-1 for 192; 0 for 96
+ const uint page = (gidx / 2) % 2; \n//index of page within a page_block; 2 pages per page_block
+ const uint page_block = gidx / 4; \n//#index of page_block; 2 WG per page; 4 WG per page_block
+
+ __global double *A, *B, *C; \n
+ __local double lA[24][48]; \n
+ __local double lB[48][24]; \n
+ double privateC[12] = { (double)0 }; \n
+
+ //decide invA11 location for each page
+ //each workgroup loads half of A (left or right)
+ A = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + gidx % 2 * (blk / 2); \n
+
+ //decide invA12 (B) location for each page
+ //actually it was saved in invA21 from last kernel
+ //each workgroup loads half of B (up or down)
+ B = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + gidy*(blk / 2)*NB; \n
+
+ //decide invA12 location for each page
+ //each workgroup writes 1/4 of C
+ C = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + gidx % 2 * (blk / 2) + gidy*(blk / 2)*NB; \n
+
+ //read A and B into LDS no transpose operated here
+ //each work item loads a half row of A and half column of B
+ //idx 0-23 idy 0-1
+ lA[idx][0 + idy * 24] = A[idx + idy * 24 * NB]; \n
+ lA[idx][1 + idy * 24] = A[idx + NB + idy * 24 * NB]; \n
+ lA[idx][2 + idy * 24] = A[idx + NB * 2 + idy * 24 * NB];\n
+ lA[idx][3 + idy * 24] = A[idx + NB * 3 + idy * 24 * NB];\n
+ lA[idx][4 + idy * 24] = A[idx + NB * 4 + idy * 24 * NB];\n
+ lA[idx][5 + idy * 24] = A[idx + NB * 5 + idy * 24 * NB];\n
+ lA[idx][6 + idy * 24] = A[idx + NB * 6 + idy * 24 * NB];\n
+ lA[idx][7 + idy * 24] = A[idx + NB * 7 + idy * 24 * NB];\n
+ lA[idx][8 + idy * 24] = A[idx + NB * 8 + idy * 24 * NB];\n
+ lA[idx][9 + idy * 24] = A[idx + NB * 9 + idy * 24 * NB];\n
+ lA[idx][10 + idy * 24] = A[idx + NB * 10 + idy * 24 * NB];\n
+ lA[idx][11 + idy * 24] = A[idx + NB * 11 + idy * 24 * NB];\n
+ lA[idx][12 + idy * 24] = A[idx + NB * 12 + idy * 24 * NB];\n
+ lA[idx][13 + idy * 24] = A[idx + NB * 13 + idy * 24 * NB];\n
+ lA[idx][14 + idy * 24] = A[idx + NB * 14 + idy * 24 * NB];\n
+ lA[idx][15 + idy * 24] = A[idx + NB * 15 + idy * 24 * NB];\n
+ lA[idx][16 + idy * 24] = A[idx + NB * 16 + idy * 24 * NB];\n
+ lA[idx][17 + idy * 24] = A[idx + NB * 17 + idy * 24 * NB];\n
+ lA[idx][18 + idy * 24] = A[idx + NB * 18 + idy * 24 * NB];\n
+ lA[idx][19 + idy * 24] = A[idx + NB * 19 + idy * 24 * NB];\n
+ lA[idx][20 + idy * 24] = A[idx + NB * 20 + idy * 24 * NB];\n
+ lA[idx][21 + idy * 24] = A[idx + NB * 21 + idy * 24 * NB];\n
+ lA[idx][22 + idy * 24] = A[idx + NB * 22 + idy * 24 * NB];\n
+ lA[idx][23 + idy * 24] = A[idx + NB * 23 + idy * 24 * NB];\n
+
+ lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; \n
+ lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1];\n
+ lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];\n
+ lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];\n
+ lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];\n
+ lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];\n
+ lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];\n
+ lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];\n
+ lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];\n
+ lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];\n
+ lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];\n
+ lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];\n
+ lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];\n
+ lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];\n
+ lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];\n
+ lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];\n
+ lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];\n
+ lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];\n
+ lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];\n
+ lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];\n
+ lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];\n
+ lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];\n
+ lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];\n
+ lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];\n
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ //do math
+
+ uint i = 0; \n
+
+ do{\n
+ privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);\n
+ privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);\n
+ privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);\n
+ privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);\n
+ privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);\n
+ privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);\n
+ privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);\n
+ privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);\n
+ privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);\n
+ privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);\n
+ privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n
+ privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n
+ i = i + 1; \n
+ } while (i < 48); \n
+
+ i = 0; \n
+ do{\n
+ C[NB*idy * 12 + NB*i + idx] = -1 * privateC[i]; \n
+ i = i + 1; \n
+ } while (i < 12); \n
+
+
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp
new file mode 100644
index 0000000..1c8111d
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp
@@ -0,0 +1,155 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART1_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART1_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART1_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_192_96_PART1_R_bin = 0;
+size_t triple_dgemm_update_192_96_PART1_R_binSize = 0;
+
+const char * const triple_dgemm_update_192_96_PART1_R_src = STRINGIFY(
+#define NB 192\n
+ __kernel void TRIPLE_DGEMM_UPDATE_192_96_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 96 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/12*2 + 1; for 192 this is 1 for 384 this is 2
+
+ //Work group size is [24, 2]
+ //global work size is [96*number of blocks, 8]
+ //each work item in each work group is responsible for 12 elements (1/8) in that row
+ //each work group is responsible for 24 by 24 macro tile;
+
+ ////////////// A12*invA22
+ const uint gidx = get_group_id(0);\n
+ const uint gidy = get_group_id(1); \n
+ const uint idx = get_local_id(0); \n
+ const uint idy = get_local_id(1); \n
+
+ //uint page = gidx / 2;//0-1 for 192; 0 for 96
+ //const uint page = (gidx/4)%1;//index of page within a page_block; 1 pages per page_block
+ const uint page_block = gidx / 4; \n//#index of page_block; 4 WG per page; 4 WG per page_block
+
+
+ __global double *B, *C; \n
+ __local double lA[24][48]; \n
+ __local double lB[48][24]; \n
+ double privateC[12] = { (double)0 }; \n
+
+ //decide A12 location for each page
+ //each workgroup loads 1/4 of A (left or right)
+ Ain = Ain + offAin; \n
+ Ain += page_block*NB*lda + page_block*NB + blk*lda + gidx % 4 * (blk / 4); \n
+
+ //decide invA22 (B) location for each page
+ //each workgroup loads 1/4 of B (up or down)
+ B = d_dinvA + page_block*NB*NB + blk*NB + blk + gidy*(blk / 4)*NB; \n
+
+ //decide invA12 location for each page;
+ //Actually this will be stored in invA21 temporarily
+ //each workgroup writes 1/4*1/4 of C
+ C = d_dinvA + page_block*NB*NB + blk*NB + gidx % 4 * (blk / 4) + gidy*(blk / 4)*NB; \n
+
+ //read A and B into LDS no transpose operated here
+ //each work item loads a half row of A and half column of B
+ //each loop loads 1/4 row of A and 1/4 column of B
+ //idx 0-23 idy 0-1
+
+ uint block_k = blk / 48; \n //thus we need 2 iterations here
+ do{\n
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ lA[idx][0 + idy * 24] = Ain[idx + idy * 24 * lda]; \n
+ lA[idx][1 + idy * 24] = Ain[idx + lda + idy * 24 * lda]; \n
+ lA[idx][2 + idy * 24] = Ain[idx + lda * 2 + idy * 24 * lda]; \n
+ lA[idx][3 + idy * 24] = Ain[idx + lda * 3 + idy * 24 * lda]; \n
+ lA[idx][4 + idy * 24] = Ain[idx + lda * 4 + idy * 24 * lda]; \n
+ lA[idx][5 + idy * 24] = Ain[idx + lda * 5 + idy * 24 * lda]; \n
+ lA[idx][6 + idy * 24] = Ain[idx + lda * 6 + idy * 24 * lda]; \n
+ lA[idx][7 + idy * 24] = Ain[idx + lda * 7 + idy * 24 * lda]; \n
+ lA[idx][8 + idy * 24] = Ain[idx + lda * 8 + idy * 24 * lda]; \n
+ lA[idx][9 + idy * 24] = Ain[idx + lda * 9 + idy * 24 * lda]; \n
+ lA[idx][10 + idy * 24] = Ain[idx + lda * 10 + idy * 24 * lda];\n
+ lA[idx][11 + idy * 24] = Ain[idx + lda * 11 + idy * 24 * lda];\n
+ lA[idx][12 + idy * 24] = Ain[idx + lda * 12 + idy * 24 * lda];\n
+ lA[idx][13 + idy * 24] = Ain[idx + lda * 13 + idy * 24 * lda];\n
+ lA[idx][14 + idy * 24] = Ain[idx + lda * 14 + idy * 24 * lda];\n
+ lA[idx][15 + idy * 24] = Ain[idx + lda * 15 + idy * 24 * lda];\n
+ lA[idx][16 + idy * 24] = Ain[idx + lda * 16 + idy * 24 * lda];\n
+ lA[idx][17 + idy * 24] = Ain[idx + lda * 17 + idy * 24 * lda];\n
+ lA[idx][18 + idy * 24] = Ain[idx + lda * 18 + idy * 24 * lda];\n
+ lA[idx][19 + idy * 24] = Ain[idx + lda * 19 + idy * 24 * lda];\n
+ lA[idx][20 + idy * 24] = Ain[idx + lda * 20 + idy * 24 * lda];\n
+ lA[idx][21 + idy * 24] = Ain[idx + lda * 21 + idy * 24 * lda];\n
+ lA[idx][22 + idy * 24] = Ain[idx + lda * 22 + idy * 24 * lda];\n
+ lA[idx][23 + idy * 24] = Ain[idx + lda * 23 + idy * 24 * lda];\n
+
+ lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; \n
+ lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1];\n
+ lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];\n
+ lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];\n
+ lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];\n
+ lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];\n
+ lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];\n
+ lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];\n
+ lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];\n
+ lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];\n
+ lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];\n
+ lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];\n
+ lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];\n
+ lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];\n
+ lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];\n
+ lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];\n
+ lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];\n
+ lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];\n
+ lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];\n
+ lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];\n
+ lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];\n
+ lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];\n
+ lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];\n
+ lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];\n
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ //do math
+
+ uint i = 0; \n
+
+ do{\n
+ privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]); \n
+ privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]); \n
+ privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]); \n
+ privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]); \n
+ privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]); \n
+ privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]); \n
+ privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]); \n
+ privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]); \n
+ privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]); \n
+ privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]); \n
+ privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n
+ privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n
+ i = i + 1; \n
+ } while (i < 48); \n
+
+ Ain += 48 * lda; \n
+ B += 48; \n
+ } while (--block_k>0); \n
+
+ uint i = 0; \n
+ do{\n
+ C[NB*idy * 12 + NB*i + idx] = privateC[i]; \n
+ i = i + 1; \n
+ } while (i < 12); \n
+
+
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp
new file mode 100644
index 0000000..6c2c61d
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp
@@ -0,0 +1,156 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART2_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART2_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART2_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_192_96_PART2_R_bin = 0;
+size_t triple_dgemm_update_192_96_PART2_R_binSize = 0;
+
+const char * const triple_dgemm_update_192_96_PART2_R_src = STRINGIFY(
+#define NB 192\n
+ __kernel void TRIPLE_DGEMM_UPDATE_192_96_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{ \n
+ // Ain is the non inverse matrix; the size of Ain is lda * na
+ // offAin is the offset of Ain
+ // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
+ // blk is subblock size, which is 48 here.
+ // lda in leading dimension. Column major here
+ // npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2
+
+ //Work group size is [24, 2]
+ //global work size is [48*number of blocks, 4]
+ //each work item in each work group is responsible for 12 elements (1/4) in that row
+ //each work group is responsible for 24 by 24 macro tile;
+
+ ////////////// -invA11*invA12
+ const uint gidx = get_group_id(0); \n
+ const uint gidy = get_group_id(1); \n
+ const uint idx = get_local_id(0); \n
+ const uint idy = get_local_id(1); \n
+
+ //uint page = gidx / 2;//0-1 for 192; 0 for 96
+ //const uint page = (gidx/2)%2;//index of page within a page_block; 1 pages per page_block
+ const uint page_block = gidx / 4; \n//#index of page_block; 4 WG per page; 4 WG per page_block
+
+ __global double *A, *B, *C; \n
+ __local double lA[24][48]; \n
+ __local double lB[48][24]; \n
+ double privateC[12] = { (double)0 }; \n
+
+ //decide invA11 location for each page
+ //each workgroup loads half of A (left or right)
+ //A = d_dinvA + page*NB*NB + gidx%2*(blk/2);
+ A = d_dinvA + page_block*NB*NB + gidx % 4 * (blk / 4); \n
+
+ //decide invA12 (B) location for each page
+ //actually it was saved in invA21 from last kernel
+ //each workgroup loads half of B (up or down)
+ //B = d_dinvA + page*NB*NB + blk*NB + gidy*(blk/2)*NB;
+ B = d_dinvA + page_block*NB*NB + blk*NB + gidy*(blk / 4)*NB; \n
+
+ //decide invA12 location for each page
+ //each workgroup writes 1/4 of C
+ //C = d_dinvA + page*NB*NB + blk * NB + gidx%2*(blk/2) + gidy*(blk/2)*NB;
+ C = d_dinvA + page_block*NB*NB + blk*NB + gidx % 4 * (blk / 4) + gidy*(blk / 4)*NB; \n
+
+ //read A and B into LDS no transpose operated here
+ //each work item loads a half row of A and half column of B
+ //idx 0-23 idy 0-1
+
+ uint block_k = blk / 48; \n //thus we need 2 iterations here
+ do{\n
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ lA[idx][0 + idy * 24] = A[idx + idy * 24 * NB]; \n
+ lA[idx][1 + idy * 24] = A[idx + NB + idy * 24 * NB]; \n
+ lA[idx][2 + idy * 24] = A[idx + NB * 2 + idy * 24 * NB];\n
+ lA[idx][3 + idy * 24] = A[idx + NB * 3 + idy * 24 * NB];\n
+ lA[idx][4 + idy * 24] = A[idx + NB * 4 + idy * 24 * NB];\n
+ lA[idx][5 + idy * 24] = A[idx + NB * 5 + idy * 24 * NB];\n
+ lA[idx][6 + idy * 24] = A[idx + NB * 6 + idy * 24 * NB];\n
+ lA[idx][7 + idy * 24] = A[idx + NB * 7 + idy * 24 * NB];\n
+ lA[idx][8 + idy * 24] = A[idx + NB * 8 + idy * 24 * NB];\n
+ lA[idx][9 + idy * 24] = A[idx + NB * 9 + idy * 24 * NB];\n
+ lA[idx][10 + idy * 24] = A[idx + NB * 10 + idy * 24 * NB];\n
+ lA[idx][11 + idy * 24] = A[idx + NB * 11 + idy * 24 * NB];\n
+ lA[idx][12 + idy * 24] = A[idx + NB * 12 + idy * 24 * NB];\n
+ lA[idx][13 + idy * 24] = A[idx + NB * 13 + idy * 24 * NB];\n
+ lA[idx][14 + idy * 24] = A[idx + NB * 14 + idy * 24 * NB];\n
+ lA[idx][15 + idy * 24] = A[idx + NB * 15 + idy * 24 * NB];\n
+ lA[idx][16 + idy * 24] = A[idx + NB * 16 + idy * 24 * NB];\n
+ lA[idx][17 + idy * 24] = A[idx + NB * 17 + idy * 24 * NB];\n
+ lA[idx][18 + idy * 24] = A[idx + NB * 18 + idy * 24 * NB];\n
+ lA[idx][19 + idy * 24] = A[idx + NB * 19 + idy * 24 * NB];\n
+ lA[idx][20 + idy * 24] = A[idx + NB * 20 + idy * 24 * NB];\n
+ lA[idx][21 + idy * 24] = A[idx + NB * 21 + idy * 24 * NB];\n
+ lA[idx][22 + idy * 24] = A[idx + NB * 22 + idy * 24 * NB];\n
+ lA[idx][23 + idy * 24] = A[idx + NB * 23 + idy * 24 * NB];\n
+
+ lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; \n
+ lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1]; \n
+ lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];\n
+ lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];\n
+ lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];\n
+ lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];\n
+ lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];\n
+ lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];\n
+ lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];\n
+ lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];\n
+ lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];\n
+ lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];\n
+ lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];\n
+ lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];\n
+ lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];\n
+ lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];\n
+ lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];\n
+ lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];\n
+ lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];\n
+ lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];\n
+ lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];\n
+ lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];\n
+ lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];\n
+ lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];\n
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ //do math
+
+ uint i = 0; \n
+
+ do{\n
+ privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);\n
+ privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);\n
+ privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);\n
+ privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);\n
+ privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);\n
+ privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);\n
+ privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);\n
+ privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);\n
+ privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);\n
+ privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);\n
+ privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n
+ privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n
+ i = i + 1; \n
+ } while (i < 48); \n
+
+ A += 48 * NB; \n
+ B += 48; \n
+
+ } while (--block_k>0); \n
+
+ uint i = 0; \n
+ do{\n
+ C[NB*idy * 12 + NB*i + idx] = -1 * privateC[i]; \n
+ i = i + 1; \n
+ } while (i < 12); \n
+
+
+}\n
+);
+#endif
diff --git a/src/library/blas/xtrsm.cc b/src/library/blas/xtrsm.cc
index 288dbb1..fc3df26 100644
--- a/src/library/blas/xtrsm.cc
+++ b/src/library/blas/xtrsm.cc
@@ -21,6 +21,13 @@
#include <functor.h>
#include <functor_selector.h>
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+#include "TrtriClKernels.h"
+#include "TrtriKernelSourceIncludes.h"
+
// Transform a trsm in clblasRowMajor into a trsm in clblasColumnMajor:
//
@@ -83,6 +90,633 @@ static void force_trsm_column_major(Args & args)
// the TRSMs only have to consider one of the two cases.
//
+//
+// Common part of all XTRSM implementations using the old Solver infrastructure
+//
+
+#define CL_CHECK(RET) \
+ if(RET != CL_SUCCESS) { \
+ printf("OpenCL error %i on line %u\n", RET, __LINE__); \
+ assert(false); \
+ }
+
+static void makeKernel(
+ cl_kernel *clKernel,
+ cl_command_queue clQueue,
+ const char *kernelSource,
+ const char *sourceBuildOptions,
+ const unsigned char **kernelBinary,
+ size_t *kernelBinarySize,
+ const char *binaryBuildOptions)
+{
+ cl_int err;
+ if (*clKernel) {
+ // kernel has already been built, return
+#ifdef AUTOGEMM_PRINT_DEBUG
+ // get kernel name
+ size_t kernelNameLength;
+ err = clGetKernelInfo(
+ *clKernel,
+ CL_KERNEL_FUNCTION_NAME,
+ sizeof(kernelNameLength),
+ NULL,
+ &kernelNameLength);
+ CL_CHECK(err)
+ char *kernelName = new char[kernelNameLength];
+ err = clGetKernelInfo(
+ *clKernel,
+ CL_KERNEL_FUNCTION_NAME,
+ kernelNameLength*sizeof(char),
+ kernelName,
+ NULL);
+ CL_CHECK(err)
+ printf("makeGemmKernel: \"%s\" already built; returning.\n", kernelName);
+ delete[] kernelName;
+#endif
+ return;
+ }
+ else {
+ // kernel has not been built, so build it (from binary, preferably)
+ cl_context clContext;
+ cl_device_id clDevice;
+ err = clGetCommandQueueInfo(clQueue, CL_QUEUE_CONTEXT, sizeof(clContext), &clContext, NULL);
+ CL_CHECK(err)
+ err = clGetCommandQueueInfo(clQueue, CL_QUEUE_DEVICE, sizeof(clDevice), &clDevice, NULL);
+ CL_CHECK(err)
+ cl_program clProgram;
+ cl_int clBinaryStatus;
+ if (*kernelBinary) {
+#ifdef AUTOGEMM_PRINT_DEBUG
+ printf("makeGemmKernel: pre-compiled binary found: %llu bytes\n", *kernelBinarySize);
+#endif
+ clProgram = clCreateProgramWithBinary(
+ clContext,
+ 1, &clDevice,
+ kernelBinarySize, kernelBinary,
+ &clBinaryStatus, &err);
+ CL_CHECK(err)
+ err = clBuildProgram(
+ clProgram,
+ 1, &clDevice,
+ binaryBuildOptions, NULL, NULL);
+ CL_CHECK(err)
+ }
+ else {
+ clProgram = clCreateProgramWithSource(
+ clContext,
+ 1, &kernelSource,
+ NULL, &err);
+ CL_CHECK(err)
+ err = clBuildProgram(
+ clProgram,
+ 1, &clDevice,
+ sourceBuildOptions, NULL, NULL);
+ if (err != CL_SUCCESS) {
+ size_t logSize = 0;
+ char* log;
+ clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+ log = (char*)calloc(1, logSize + 1);
+ clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
+ printf("=== Build log ===\n%s\n", log);
+ free(log);
+ clReleaseProgram(clProgram);
+ }
+ CL_CHECK(err)
+ }
+ err = clCreateKernelsInProgram(
+ clProgram,
+ 1, clKernel,
+ NULL);
+ CL_CHECK(err)
+ err = clReleaseProgram(clProgram);
+ CL_CHECK(err)
+#ifdef AUTOGEMM_PRINT_DEBUG
+ // get kernel name
+ size_t kernelNameLength;
+ err = clGetKernelInfo(
+ *clKernel,
+ CL_KERNEL_FUNCTION_NAME,
+ sizeof(kernelNameLength),
+ NULL,
+ &kernelNameLength);
+ CL_CHECK(err)
+ char *kernelName = new char[kernelNameLength];
+ err = clGetKernelInfo(
+ *clKernel,
+ CL_KERNEL_FUNCTION_NAME,
+ kernelNameLength*sizeof(char),
+ kernelName,
+ NULL);
+ CL_CHECK(err)
+ printf("makeGemmKernel: \"%s\" now built; returning.\n", kernelName);
+ delete[] kernelName;
+#endif
+ }
+}
+
+static cl_int clearBuffer(cl_command_queue queue,
+ cl_mem buffer,
+ size_t buffer_size)
+{
+
+ cl_int err = 0;
+ cl_event event;
+ // Hummm clEnqueueFillBuffer is OpenCL 1.2 !!!
+ double zero = 0.0;
+ err = clEnqueueFillBuffer(queue,
+ buffer,
+ &zero,
+ sizeof(double),
+ 0, // offset
+ buffer_size,
+ 0,
+ NULL,
+ &event
+ );
+
+ return err;
+
+}
+// Compute the number of blocks of the specified 'size' to fully cover 'n'
+// Simply speaking, this is n/size rounded up.
+#define BLOCKS(n,size) ( ( (n) / size ) + ( (n) % (size) != 0 ) )
+
+cl_int call_kernel_triple_update192(
+ cl_kernel *kernel,
+ const char *kernelSource,
+ const char *sourceBuildOptions,
+ const unsigned char **kernelBinary,
+ size_t *kernelBinarySize,
+ const char *binaryBuildOptions,
+ const cl_command_queue queue,
+ cl_mem A,
+ unsigned int offA,
+ cl_mem d_dinvA,
+ int i,
+ unsigned int lda,
+ int M,
+ cl_event *event)
+{
+ cl_int err = 0;
+
+ unsigned int m = M;
+
+ int npages = M / (i * 2) + (M % (i * 2) != 0);
+
+
+ size_t globalLocal[2];
+ size_t globalThreads[2];
+ switch (i)
+ {
+ case 12:
+ globalLocal[0] = 12;
+ globalLocal[1] = 1;
+ globalThreads[0] = npages * 12;
+ globalThreads[1] = 1;
+ break;
+ case 24:
+ globalLocal[0] = 24;
+ globalLocal[1] = 2;
+ globalThreads[0] = npages * 24;
+ globalThreads[1] = 2;
+ break;
+ case 48:
+ globalLocal[0] = 24;
+ globalLocal[1] = 2;
+ globalThreads[0] = npages * 48;
+ globalThreads[1] = 4;
+ break;
+ case 96:
+ globalLocal[0] = 24;
+ globalLocal[1] = 2;
+ globalThreads[0] = npages * 96;
+ globalThreads[1] = 8;
+ break;
+ default:
+ break;
+ }
+
+ makeKernel(kernel,
+ queue,
+ kernelSource,
+ sourceBuildOptions,
+ kernelBinary,
+ kernelBinarySize,
+ binaryBuildOptions);
+
+ if (err != CL_SUCCESS) {
+ //printf( "create kernel %s failed with %d\n", kernel_name, err );
+ return err;
+ }
+
+ clSetKernelArg(*kernel, 0, sizeof(cl_mem), &A);
+ clSetKernelArg(*kernel, 1, sizeof(unsigned int), &offA);
+ clSetKernelArg(*kernel, 2, sizeof(cl_mem), &d_dinvA);
+ clSetKernelArg(*kernel, 3, sizeof(int), &i);
+ clSetKernelArg(*kernel, 4, sizeof(unsigned int), &lda);
+ clSetKernelArg(*kernel, 5, sizeof(int), &npages);
+ clSetKernelArg(*kernel, 6, sizeof(unsigned int), &m);
+
+ err = clEnqueueNDRangeKernel(queue, *kernel, 2, NULL,
+ globalThreads, globalLocal,
+ 0, NULL, event);
+
+
+ return err;
+
+}
+
+cl_int diag_dtrtri192(
+ cl_command_queue queue,
+ int M,
+ clblasUplo uplo,
+ clblasDiag diag,
+ cl_mem A,
+ size_t offA,
+ cl_mem d_dinvA,
+ size_t lda,
+ int inner_block_size,
+ int outer_block_size,
+ cl_event *event)
+{
+
+ const char *diag_dtrtri_kernel_upper_KernelSource = NULL;
+ cl_kernel *diag_dtrtri_kernel_upper_ClKernel = NULL;
+ size_t diag_dtrtri_kernel_upper_KernelBinarySize = 0;
+ const unsigned char *diag_dtrtri_kernel_upper_KernelBinary = NULL;
+ cl_int err;
+
+ /*
+ This routine is used in dtrsm
+ */
+
+ //For side==right, M is actually N here
+ int nthreads = (M / inner_block_size + (M % inner_block_size != 0)) * inner_block_size;
+ unsigned int m = M;
+
+ if (uplo == clblasLower) {
+
+ //lower is not supported yet
+ }
+ else {
+ diag_dtrtri_kernel_upper_KernelSource = diag_dtrtri_upper_192_12_src;
+ diag_dtrtri_kernel_upper_ClKernel = &diag_dtrtri_upper_192_12_clKernel;
+ diag_dtrtri_kernel_upper_KernelBinary = diag_dtrtri_upper_192_12_bin;
+ diag_dtrtri_kernel_upper_KernelBinarySize = diag_dtrtri_upper_192_12_binSize;
+
+ //cl_kernel diag_dtrtri_kernel_upper = clCreateKernel(prg, "DIAG_DTRTRI_KERNEL_UPPER", &err);
+ makeKernel(diag_dtrtri_kernel_upper_ClKernel,
+ queue,
+ diag_dtrtri_kernel_upper_KernelSource,
+ TrtriBuildOptions,
+ &diag_dtrtri_kernel_upper_KernelBinary,
+ &diag_dtrtri_kernel_upper_KernelBinarySize,
+ TrtribinBuildOptions);
+
+
+ int isDiagUnit = (diag == clblasUnit);
+ err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 0, sizeof(int), &isDiagUnit);
+ CL_CHECK(err);
+ err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 1, sizeof(cl_mem), &A);
+ CL_CHECK(err);
+ err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 2, sizeof(unsigned int), &offA);
+ CL_CHECK(err);
+ err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 3, sizeof(cl_mem), &d_dinvA);
+ CL_CHECK(err);
+ err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 4, sizeof(unsigned int), &lda);
+ CL_CHECK(err);
+ err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 5, sizeof(unsigned int), &m);
+ CL_CHECK(err);
+
+ size_t globalThreads[1] = { nthreads };
+ size_t globalLocal[1] = { inner_block_size };
+
+ err = clEnqueueNDRangeKernel(queue, *diag_dtrtri_kernel_upper_ClKernel, 1, NULL,
+ globalThreads, globalLocal,
+ 0, NULL, event);
+
+ if (err != CL_SUCCESS) {
+ //printf( "kernel -diag_dtrtri_kernel_upper- failed with %d\n", err );
+ return err;
+ }
+
+ //clReleaseKernel(diag_dtrtri_kernel_upper);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ // update the inverse up to the size of BLOCK_SIZE
+
+ for (int i = inner_block_size; i < outer_block_size; i *= 2) {
+
+ switch (i) {
+ case 12:
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_12_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_12_R,
+ triple_dgemm_update_192_12_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_192_12_R_bin,
+ &triple_dgemm_update_192_12_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ break;
+
+ case 24:
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_24_PART1_R,
+ triple_dgemm_update_192_24_PART1_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_192_24_PART1_R_bin,
+ &triple_dgemm_update_192_24_PART1_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_24_PART2_R,
+ triple_dgemm_update_192_24_PART2_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_192_24_PART2_R_bin,
+ &triple_dgemm_update_192_24_PART2_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ break;
+
+ case 48:
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_48_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_48_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_48_PART1_R,
+ triple_dgemm_update_192_48_PART1_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_192_48_PART1_R_bin,
+ &triple_dgemm_update_192_48_PART1_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_48_PART2_R,
+ triple_dgemm_update_192_48_PART2_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_192_48_PART2_R_bin,
+ &triple_dgemm_update_192_48_PART2_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ break;
+
+ case 96:
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_96_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_96_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_96_PART1_R,
+ triple_dgemm_update_192_96_PART1_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_192_96_PART1_R_bin,
+ &triple_dgemm_update_192_96_PART1_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_96_PART2_R,
+ triple_dgemm_update_192_96_PART2_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_192_96_PART2_R_bin,
+ &triple_dgemm_update_192_96_PART2_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ break;
+
+ default:
+
+ break;
+ }
+
+ if (i * 2 >= M) break;
+
+ }
+
+ }
+
+ return err;
+
+}
+
+static clblasStatus gpu_dtrsm192(
+ clblasOrder order,
+ clblasSide side,
+ clblasUplo uplo,
+ clblasTranspose transA,
+ clblasDiag diag,
+ size_t M,
+ size_t N,
+ double alpha,
+ const cl_mem A,
+ size_t offA,
+ size_t lda,
+ cl_mem B,
+ size_t offB,
+ size_t ldb,
+ cl_uint numCommandQueues,
+ cl_command_queue *commandQueues,
+ cl_uint numEventsInWaitList,
+ const cl_event *eventWaitList,
+ cl_event *events,
+ bool &specialCaseHandled)
+{
+ if (order != clblasColumnMajor)
+ return clblasNotImplemented;
+
+ if ((M % 192 == 0) && (N % 192 == 0))
+ {
+ //TODO: the implementation of sub block being 192 only supports
+ //side == right
+ //uplo == upper
+ //trans == notrans
+ //M and N need to be mod192
+ //subblock being 192 is prefered over 128 on Hawaii device since
+ //it does not create "boundary" in DGEMM calls
+ //Hawaii DGEMM calls have better performance when M N K are mod48
+ if ((side == clblasRight) && (uplo == clblasUpper) && (transA == clblasNoTrans))
+ {
+ int inner_block_size = 12; // inner blocking size, <=32
+ int outer_block_size = 192;// outer blocking size, >BLOCK_SIZE
+ cl_int err = 0;
+
+ int i;
+ cl_context context;
+ err = getQueueContext(commandQueues[0], &context);
+ CL_CHECK(err);
+
+ /* quick return on wrong size */
+ if (M <= 0 || N <= 0)
+ return clblasInvalidDim;
+
+ double neg_one = -1.0;
+ double one = 1.0;
+ double zero = 0.0;
+
+ cl_mem InvA = 0;
+ cl_mem X = 0;
+ // X of size mxn will contain the result
+ size_t ldX = M;
+ size_t offX = 0; //must be 0: needed by the _(X,i,j) macro
+ size_t size_X = N*ldX * sizeof(double);
+ X = clCreateBuffer(context, CL_MEM_READ_WRITE, size_X, NULL, &err);
+ CL_CHECK(err);
+ err = clearBuffer(commandQueues[0], X, size_X);
+ CL_CHECK(err);
+
+ // side=R
+ /* invert the diagonals
+ * Allocate device memory for the inverted diagonal blocks, size=n*BLOCK_SIZE
+ */
+
+ /* invert the diagonals
+ * Allocate device memory for the inverted diagonal blocks, size=m*nb
+ */
+ size_t ldInvA = outer_block_size;
+ size_t offInvA = 0; //must be 0: needed by the _(X,i,j) macro
+ size_t size_InvA = ldInvA * BLOCKS(N, outer_block_size) * outer_block_size *sizeof(double);
+ InvA = clCreateBuffer(context, CL_MEM_READ_WRITE, size_InvA, NULL, &err);
+ CL_CHECK(err);
+ err = clearBuffer(commandQueues[0], InvA, size_InvA);
+ CL_CHECK(err);
+
+ diag_dtrtri192(commandQueues[0], N, uplo, diag, A, offA, InvA, lda, inner_block_size, outer_block_size, events);
+
+ specialCaseHandled = true;
+ return clblasSuccess;
+
+ }
+ }
+
+
+
+ return clblasNotImplemented;
+}
+
+static clblasStatus gpu_dtrsm128(
+ clblasOrder order,
+ clblasSide side,
+ clblasUplo uplo,
+ clblasTranspose transA,
+ clblasDiag diag,
+ size_t M,
+ size_t N,
+ double alpha,
+ const cl_mem A,
+ size_t offA,
+ size_t lda,
+ cl_mem B,
+ size_t offB,
+ size_t ldb,
+ cl_uint numCommandQueues,
+ cl_command_queue *commandQueues,
+ cl_uint numEventsInWaitList,
+ const cl_event *eventWaitList,
+ cl_event *events,
+ bool &specialCaseHandled)
+{
+
+
+ return clblasNotImplemented;
+}
+
+static clblasStatus
+doTrsm(
+CLBlasKargs *kargs,
+clblasOrder order,
+clblasSide side,
+clblasUplo uplo,
+clblasTranspose transA,
+clblasDiag diag,
+size_t M,
+size_t N,
+const cl_mem A,
+size_t offA,
+size_t lda,
+cl_mem B,
+size_t offB,
+size_t ldb,
+cl_uint numCommandQueues,
+cl_command_queue *commandQueues,
+cl_uint numEventsInWaitList,
+const cl_event *eventWaitList,
+cl_event *events)
+{
+ cl_int err;
+ ListHead seq;
+ size_t msize;
+ clblasStatus retCode = clblasSuccess;
+
+ if (!clblasInitialized) {
+ return clblasNotInitialized;
+ }
+
+ /* Validate arguments */
+
+ if ((retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET))) {
+ return retCode;
+ }
+ msize = (side == clblasLeft) ? M : N;
+
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize,
+ A, offA, lda, A_MAT_ERRSET))) {
+ return retCode;
+ }
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N,
+ B, offB, ldb, B_MAT_ERRSET))) {
+ return retCode;
+ }
+
+ kargs->order = order;
+ kargs->side = side;
+ kargs->uplo = uplo;
+ kargs->transA = transA;
+ kargs->diag = diag;
+ kargs->M = M;
+ kargs->N = N;
+ kargs->A = A;
+ kargs->offA = offA;
+ kargs->lda.matrix = lda;
+ kargs->B = B;
+ kargs->offBX = offB;
+ kargs->ldb.matrix = ldb;
+ // Store original problem size in K, this is used to know it while
+ // calculating result by parts using M or N as part size
+ if (side == clblasLeft) {
+ kargs->K = M;
+ }
+ else {
+ kargs->K = N;
+ }
+
+ kargs->offsetM = 0;
+ kargs->offsetN = 0;
+ kargs->scimage[0] = 0;
+
+#ifndef TRXM_MULTIPLE_QUEUES
+ if (numCommandQueues != 0) {
+ numCommandQueues = 1;
+ }
+#endif
+
+ listInitHead(&seq);
+ err = makeSolutionSeq(CLBLAS_TRSM, kargs, numCommandQueues, commandQueues,
+ numEventsInWaitList, eventWaitList, events, &seq);
+ if (err == CL_SUCCESS) {
+ err = executeSolutionSeq(&seq);
+ }
+
+ freeSolutionSeq(&seq);
+
+ return (clblasStatus)err;
+}
extern "C"
clblasStatus
@@ -170,6 +804,7 @@ clblasDtrsm(
const cl_event *eventWaitList,
cl_event *events)
{
+ /*
CHECK_QUEUES(numCommandQueues, commandQueues);
CHECK_EVENTS(numEventsInWaitList, eventWaitList);
@@ -204,6 +839,48 @@ clblasDtrsm(
functor->release();
return res;
+ */
+ bool specialCaseHandled = false;
+
+ clblasStatus SpecialCaseStatus = gpu_dtrsm192(order,
+ side,
+ uplo,
+ transA,
+ diag,
+ M, N,
+ alpha,
+ A, offA, lda,
+ B, offB, ldb,
+ numCommandQueues, commandQueues,
+ numEventsInWaitList,
+ eventWaitList,
+ events,
+ specialCaseHandled);
+
+ if (specialCaseHandled)
+ return SpecialCaseStatus;
+
+
+ CLBlasKargs kargs;
+
+ memset(&kargs, 0, sizeof(kargs));
+ kargs.dtype = TYPE_DOUBLE;
+ kargs.alpha.argDouble = alpha;
+
+ return doTrsm(&kargs,
+ order,
+ side,
+ uplo,
+ transA,
+ diag,
+ M, N,
+ A, offA, lda,
+ B, offB, ldb,
+ numCommandQueues, commandQueues,
+ numEventsInWaitList,
+ eventWaitList,
+ events);
+
}
extern "C"
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits
mailing list