[clblas] 39/67: dtrsm reenablment 192
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Oct 27 08:02:13 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clblas.
commit d6e6a78f8430f0ca7d22f17d3d2160304da78924
Author: Timmy <timmy.liu at amd.com>
Date: Mon Sep 21 10:39:58 2015 -0500
dtrsm reenablment 192
---
src/library/blas/trtri/TrtriClKernels.h | 25 +-
.../blas/trtri/TrtriKernelSourceIncludes.cpp | 11 +
src/library/blas/trtri/TrtriKernelSourceIncludes.h | 39 +-
.../blas/trtri/diag_dtrtri_upper_128_16.cpp | 150 ++++++
.../blas/trtri/triple_dgemm_update_128_16_R.cpp | 238 +++++++++
.../trtri/triple_dgemm_update_128_32_PART1_R.cpp | 150 ++++++
.../trtri/triple_dgemm_update_128_32_PART2_R.cpp | 135 +++++
.../trtri/triple_dgemm_update_128_64_PART1_R.cpp | 144 +++++
.../trtri/triple_dgemm_update_128_64_PART2_R.cpp | 133 +++++
.../triple_dgemm_update_128_ABOVE64_PART1_R.cpp | 143 +++++
.../triple_dgemm_update_128_ABOVE64_PART2_R.cpp | 134 +++++
.../triple_dgemm_update_128_ABOVE64_PART3_R.cpp | 91 ++++
src/library/blas/xtrsm.cc | 590 ++++++++++++++++++++-
13 files changed, 1963 insertions(+), 20 deletions(-)
diff --git a/src/library/blas/trtri/TrtriClKernels.h b/src/library/blas/trtri/TrtriClKernels.h
index dbc9744..0769968 100644
--- a/src/library/blas/trtri/TrtriClKernels.h
+++ b/src/library/blas/trtri/TrtriClKernels.h
@@ -3,14 +3,25 @@
#define TRTRI_CL_KERNELS_H
#include "CL/cl.h"
+/*mod 192 dtrsm*/
static cl_kernel diag_dtrtri_upper_192_12_clKernel = NULL;
-static cl_kernel triple_dgemm_update_192_12_R = NULL;
-static cl_kernel triple_dgemm_update_192_24_PART1_R = NULL;
-static cl_kernel triple_dgemm_update_192_24_PART2_R = NULL;
-static cl_kernel triple_dgemm_update_192_48_PART1_R = NULL;
-static cl_kernel triple_dgemm_update_192_48_PART2_R = NULL;
-static cl_kernel triple_dgemm_update_192_96_PART1_R = NULL;
-static cl_kernel triple_dgemm_update_192_96_PART2_R = NULL;
+static cl_kernel triple_dgemm_update_192_12_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_192_24_PART1_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_192_24_PART2_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_192_48_PART1_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_192_48_PART2_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_192_96_PART1_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_192_96_PART2_R_clKernel = NULL;
+/*mod 128 dtrsm*/
+static cl_kernel diag_dtrtri_upper_128_16_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_16_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_32_PART1_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_32_PART2_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_64_PART1_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_64_PART2_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_ABOVE64_PART1_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_ABOVE64_PART2_R_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_ABOVE64_PART3_R_clKernel = NULL;
#endif
\ No newline at end of file
diff --git a/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp b/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
index 8750cee..acef64d 100644
--- a/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
+++ b/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
@@ -6,6 +6,7 @@
#ifndef TRTRI_SOURCE_INCLUDES_CPP
#define TRTRI_SOURCE_INCLUDES_CPP
+/*mod 192 dtrsm*/
#include "diag_dtrtri_upper_192_12.cpp"
#include "triple_dgemm_update_192_12_R.cpp"
#include "triple_dgemm_update_192_24_PART1_R.cpp"
@@ -15,5 +16,15 @@
#include "triple_dgemm_update_192_96_PART1_R.cpp"
#include "triple_dgemm_update_192_96_PART2_R.cpp"
+/*mod 128 dtrsm*/
+#include "diag_dtrtri_upper_128_16.cpp"
+#include "triple_dgemm_update_128_16_R.cpp"
+#include "triple_dgemm_update_128_32_PART1_R.cpp"
+#include "triple_dgemm_update_128_32_PART2_R.cpp"
+#include "triple_dgemm_update_128_64_PART1_R.cpp"
+#include "triple_dgemm_update_128_64_PART2_R.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART1_R.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART2_R.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART3_R.cpp"
#endif
diff --git a/src/library/blas/trtri/TrtriKernelSourceIncludes.h b/src/library/blas/trtri/TrtriKernelSourceIncludes.h
index a6e1834..0fcc133 100644
--- a/src/library/blas/trtri/TrtriKernelSourceIncludes.h
+++ b/src/library/blas/trtri/TrtriKernelSourceIncludes.h
@@ -9,7 +9,7 @@
const char * const TrtriBuildOptions = "-cl-std=CL2.0";
const char * const TrtribinBuildOptions = "-cl-std=CL2.0";
-
+/*mod 192 dtrsm*/
extern const char * const diag_dtrtri_upper_192_12_src;
extern unsigned char *diag_dtrtri_upper_192_12_bin;
extern size_t diag_dtrtri_upper_192_12_binSize;
@@ -42,4 +42,41 @@ extern const char * const triple_dgemm_update_192_96_PART2_R_src;
extern unsigned char *triple_dgemm_update_192_96_PART2_R_bin;
extern size_t triple_dgemm_update_192_96_PART2_R_binSize;
+/*mod 128 dtrsm*/
+extern const char * const diag_dtrtri_upper_128_16_src;
+extern unsigned char *diag_dtrtri_upper_128_16_bin;
+extern size_t diag_dtrtri_upper_128_16_binSize;
+
+extern const char * const triple_dgemm_update_128_16_R_src;
+extern unsigned char *triple_dgemm_update_128_16_R_bin;
+extern size_t triple_dgemm_update_128_16_R_binSize;
+
+extern const char * const triple_dgemm_update_128_32_PART1_R_src;
+extern unsigned char *triple_dgemm_update_128_32_PART1_R_bin;
+extern size_t triple_dgemm_update_128_32_PART1_R_binSize;
+
+extern const char * const triple_dgemm_update_128_32_PART2_R_src;
+extern unsigned char *triple_dgemm_update_128_32_PART2_R_bin;
+extern size_t triple_dgemm_update_128_32_PART2_R_binSize;
+
+extern const char * const triple_dgemm_update_128_64_PART1_R_src;
+extern unsigned char *triple_dgemm_update_128_64_PART1_R_bin;
+extern size_t triple_dgemm_update_128_64_PART1_R_binSize;
+
+extern const char * const triple_dgemm_update_128_64_PART2_R_src;
+extern unsigned char *triple_dgemm_update_128_64_PART2_R_bin;
+extern size_t triple_dgemm_update_128_64_PART2_R_binSize;
+
+extern const char * const triple_dgemm_update_128_ABOVE64_PART1_R_src;
+extern unsigned char *triple_dgemm_update_128_ABOVE64_PART1_R_bin;
+extern size_t triple_dgemm_update_128_ABOVE64_PART1_R_binSize;
+
+extern const char * const triple_dgemm_update_128_ABOVE64_PART2_R_src;
+extern unsigned char *triple_dgemm_update_128_ABOVE64_PART2_R_bin;
+extern size_t triple_dgemm_update_128_ABOVE64_PART2_R_binSize;
+
+extern const char * const triple_dgemm_update_128_ABOVE64_PART3_R_src;
+extern unsigned char *triple_dgemm_update_128_ABOVE64_PART3_R_bin;
+extern size_t triple_dgemm_update_128_ABOVE64_PART3_R_binSize;
+
#endif
diff --git a/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp b/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp
new file mode 100644
index 0000000..83f42f2
--- /dev/null
+++ b/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp
@@ -0,0 +1,150 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ ******************************************************************************/
+
+#ifndef KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP
+#define KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP
+#pragma message("#define KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *diag_dtrtri_upper_128_16_bin = 0;
+size_t diag_dtrtri_upper_128_16_binSize = 0;
+
+const char * const diag_dtrtri_upper_128_16_src = STRINGIFY(
+#define BLOCK_SIZE 16 \n
+#define NB 128 \n
+#define ZERO ( 0.0) \n
+#define ONE ( 1.0) \n
+#ifdef DOUBLE_PRECISION \n
+#ifdef cl_khr_fp64 \n
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n
+#else \n
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable \n
+#endif \n
+#endif \n
+__kernel void diag_dtrtri_upper_128_16_src(\n
+int isDiagUnit,\n
+__global double const * restrict A, \n
+uint offA, \n
+__global double *d_dinvA, \n
+uint lda, \n
+uint na)\n
+{ \n
+ int i, j;\n
+ double Ystx = 0;\n
+ __local double *y = 0;\n
+ double switcher;\n
+ double neg_switcher;\n
+
+ // Thread index
+ int tx = get_local_id(0);\n
+
+ // Thread index
+ int gx = get_global_id(0);\n
+
+ // Block index
+ int bx = get_group_id(0);\n
+
+ A = A + offA;\n
+
+ __global const double *Aoff = A + bx*lda*BLOCK_SIZE + bx*BLOCK_SIZE;\n
+ int NumBLperNB = NB/BLOCK_SIZE;\n
+ d_dinvA += bx/NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE);\n
+
+ __local double Bs[BLOCK_SIZE*BLOCK_SIZE];\n
+ __local double workspace[BLOCK_SIZE]; \n // workspace used to store the current working column
+
+ // load A
+ #pragma unroll \n
+ for( i=0; i < BLOCK_SIZE; i++ )\n
+ {\n
+ if(tx <= i && i+bx*BLOCK_SIZE < na )\n
+ {\n
+ Bs[i*BLOCK_SIZE+tx] = *(Aoff+i*lda+tx);\n
+ }\n
+ else\n
+ {\n
+ Bs[i*BLOCK_SIZE+tx] = ZERO;\n
+ }\n
+ }\n
+ // read in the whole square block of my A and zero out the non data triangular
+
+ // Synchronize to make sure the matrices are loaded
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE);\n
+
+ // solve the diagonals
+
+ if(isDiagUnit == 1)\n
+ {\n
+ Bs[tx*BLOCK_SIZE+tx] = ONE;\n
+ }\n
+ else\n
+ {\n
+ if( Bs[tx*BLOCK_SIZE+tx] == ZERO )\n
+ {\n
+ Bs[tx*BLOCK_SIZE+tx] = ONE; \n
+ }\n
+ else \n
+ {\n
+ Bs[tx*BLOCK_SIZE+tx] = ONE / ( Bs[tx*BLOCK_SIZE+tx]) ;\n
+ }\n
+ }\n
+
+ /* the upper case */
+ for( i=0; i < BLOCK_SIZE; i++ ) {\n
+ Ystx = ZERO;\n
+ if( tx < i)\n
+ {\n
+ switcher = ONE;\n
+ }\n
+ else\n
+ {\n
+ switcher = ZERO;\n
+ }\n
+
+ //dtrmv
+ workspace[tx] = *(Bs+i*BLOCK_SIZE+tx);\n
+ y = Bs+i*BLOCK_SIZE;\n
+
+ #pragma unroll\n
+ //for( j=tx; j < i; j++ )
+ for( j=0; j < i; j++ )\n
+ {\n
+ Ystx += switcher * (*(Bs+j*BLOCK_SIZE+tx)*workspace[j]);\n
+ }\n
+
+ //sscal
+ // if (tx != i) y[tx]=switcher*Ystx*(-Bs[i*BLOCK_SIZE+i]);
+
+ if( tx != i)\n
+ {\n
+ switcher = ONE;\n
+ neg_switcher = ZERO;\n
+ }\n
+ else\n
+ {\n
+ switcher = ZERO;\n
+ neg_switcher = ONE;\n
+ }\n
+
+ y[tx] = switcher *Ystx*(-Bs[i*BLOCK_SIZE+i])+neg_switcher*y[tx];\n
+
+ // __syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE);\n
+ }\n
+
+ // write back A
+#pragma unroll\n
+ for( i=0; i < BLOCK_SIZE; i++ )\n
+ {\n
+ *(d_dinvA+i*NB+tx) = Bs[i*BLOCK_SIZE+tx];\n
+ }\n
+
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp
new file mode 100644
index 0000000..5fdad0a
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp
@@ -0,0 +1,238 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+
+ * B21 = -inv(A11)*A12*inv(A22)
+ * 16 to 32
+
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_16_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_16_R_bin = 0;
+size_t triple_dgemm_update_128_16_R_binSize = 0;
+
+const char * const triple_dgemm_update_128_16_R_src = STRINGIFY(
+static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+ c[0] += alpha * b[0]; \n
+ c[1] += alpha * b[1]; \n
+ c[2] += alpha * b[2]; \n
+ c[3] += alpha * b[3]; \n
+ c[4] += alpha * b[4]; \n
+ c[5] += alpha * b[5]; \n
+ c[6] += alpha * b[6]; \n
+ c[7] += alpha * b[7]; \n
+ c[8] += alpha * b[8]; \n
+ c[9] += alpha * b[9]; \n
+ c[10] += alpha * b[10]; \n
+ c[11] += alpha * b[11]; \n
+ c[12] += alpha * b[12]; \n
+ c[13] += alpha * b[13]; \n
+ c[14] += alpha * b[14]; \n
+ c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+ __kernel void TRIPLE_DGEMM_UPDATE_128_16_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages;\n
+//const int page = (blockIdx.y)%(npages);
+const int page = qmod(get_group_id(1), npages);\n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny*get_local_size(0); \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+//--------------------------part one---------------------------//
+{ \n
+ // A12*inv(A22) -> A12
+ // A=A12, B=inv(A22), C=A12(d_dinvA)
+ __global const double *A; \n
+ __global double *B, *C; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ int xa = page*blk * 2 + ibx + id; \n
+ int ya = page*blk * 2 + blk; \n
+ int incA = ya * lda + xa; \n
+
+ // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???)
+
+ int maxA; \n
+ if (xa < na)\n
+ maxA = lda*na; \n // macro READA will detect overflow on y dimension
+ else
+ maxA = 0; \n // there is already an overflow on xa
+
+#define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n
+
+ B = d_dinvA + blk*NB + blk; \n
+ C = d_dinvA + blk*NB; \n
+
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4]; \n
+ a[0] = READA; incA += lda; \n
+ a[1] = READA; incA += lda; \n
+ a[2] = READA; incA += lda; \n
+ a[3] = READA; incA += lda; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ bs[inx + 4][iny] = B[4 + 0 * ldb]; \n
+ bs[inx + 4][iny + 4] = B[4 + 4 * ldb]; \n
+ bs[inx + 4][iny + 8] = B[4 + 8 * ldb]; \n
+ bs[inx + 4][iny + 12] = B[4 + 12 * ldb]; \n
+ bs[inx + 8][iny] = B[8 + 0 * ldb]; \n
+ bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n
+ bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n
+ bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n
+ bs[inx + 12][iny] = B[12 + 0 * ldb]; \n
+ bs[inx + 12][iny + 4] = B[12 + 4 * ldb]; \n
+ bs[inx + 12][iny + 8] = B[12 + 8 * ldb]; \n
+ bs[inx + 12][iny + 12] = B[12 + 12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ daxpy(a[0], &bs[0][0], c); a[0] = READA; incA += lda; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = READA; incA += lda; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = READA; incA += lda; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = READA; incA += lda; \n
+
+ daxpy(a[0], &bs[4][0], c); a[0] = READA; incA += lda; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = READA; incA += lda; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = READA; incA += lda; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = READA; incA += lda; \n
+
+ daxpy(a[0], &bs[8][0], c); a[0] = READA; incA += lda; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = READA; incA += lda; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = READA; incA += lda; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = READA; incA += lda; \n
+
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+//__syncthreads();
+barrier(CLK_LOCAL_MEM_FENCE); \n
+
+#undef READA\n
+
+//--------------------------part two---------------------------//
+{
+ // -inv(A11)*A12 -> A12
+ // A=inv(A11), B=A12, C=A12
+ __global double *A, *B, *C; \n
+ int lda = NB; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ A = d_dinvA; \n
+ B = C = d_dinvA + blk*NB; \n
+
+ A += ibx + id; \n
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {
+ double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ bs[inx + 4][iny] = B[4 + 0 * ldb]; \n
+ bs[inx + 4][iny + 4] = B[4 + 4 * ldb]; \n
+ bs[inx + 4][iny + 8] = B[4 + 8 * ldb]; \n
+ bs[inx + 4][iny + 12] = B[4 + 12 * ldb]; \n
+ bs[inx + 8][iny] = B[8 + 0 * ldb]; \n
+ bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n
+ bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n
+ bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n
+ bs[inx + 12][iny] = B[12 + 0 * ldb]; \n
+ bs[inx + 12][iny + 4] = B[12 + 4 * ldb]; \n
+ bs[inx + 12][iny + 8] = B[12 + 8 * ldb]; \n
+ bs[inx + 12][iny + 12] = B[12 + 12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = (-1)*c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp
new file mode 100644
index 0000000..a3823d0
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp
@@ -0,0 +1,150 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ * B21 = -inv(A11)*A12*inv(A22)
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_32_PART1_R_bin = 0;
+size_t triple_dgemm_update_128_32_PART1_R_binSize = 0;
+
+const char * const triple_dgemm_update_128_32_PART1_R_src = STRINGIFY(
+static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+c[0] += alpha * b[0]; \n
+c[1] += alpha * b[1]; \n
+c[2] += alpha * b[2]; \n
+c[3] += alpha * b[3]; \n
+c[4] += alpha * b[4]; \n
+c[5] += alpha * b[5]; \n
+c[6] += alpha * b[6]; \n
+c[7] += alpha * b[7]; \n
+c[8] += alpha * b[8]; \n
+c[9] += alpha * b[9]; \n
+c[10] += alpha * b[10]; \n
+c[11] += alpha * b[11]; \n
+c[12] += alpha * b[12]; \n
+c[13] += alpha * b[13]; \n
+c[14] += alpha * b[14]; \n
+c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+ __kernel void TRIPLE_DGEMM_UPDATE_128_32_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)\n
+{ \n
+const int bIdy = get_group_id(1) / npages; \n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny*get_local_size(0); \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+//--------------------------part one---------------------------//
+{
+ // A12*inv(A22) -> A21
+ // A=A12, B=inv(A22), C=A12(d_dinvA)
+ __global const double *A; \n
+ __global double *B, *C; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ int xa = page*blk * 2 + ibx + id; \n
+ int ya = page*blk * 2 + blk; \n
+ int incA = ya * lda + xa; \n
+
+ // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???)
+
+ int maxA; \n
+ if (xa < na)\n
+ maxA = lda*na; \n // macro READA will detect overflow on y dimension
+ else\n
+ maxA = 0; \n // there is already an overflow on xa
+
+#define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n
+
+ B = d_dinvA + blk*NB + blk; \n
+ C = d_dinvA + blk*NB; \n
+
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4]; \n
+ a[0] = READA; incA += lda; \n
+ a[1] = READA; incA += lda; \n
+ a[2] = READA; incA += lda; \n
+ a[3] = READA; incA += lda; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ bs[inx + 8][iny] = B[8 + 0 * ldb]; \n
+ bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n
+ bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n
+ bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ daxpy(a[0], &bs[0][0], c); a[0] = READA; incA += lda; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = READA; incA += lda; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = READA; incA += lda; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = READA; incA += lda; \n
+ \n
+ daxpy(a[0], &bs[4][0], c); a[0] = READA; incA += lda; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = READA; incA += lda; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = READA; incA += lda; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = READA; incA += lda; \n
+ \n
+ daxpy(a[0], &bs[8][0], c); a[0] = READA; incA += lda; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = READA; incA += lda; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = READA; incA += lda; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = READA; incA += lda; \n
+
+ daxpy(a[0], &bs[12][0], c);\n
+ daxpy(a[1], &bs[13][0], c);\n
+ daxpy(a[2], &bs[14][0], c);\n
+ daxpy(a[3], &bs[15][0], c);\n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+
+//__syncthreads();
+barrier(CLK_LOCAL_MEM_FENCE); \n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp
new file mode 100644
index 0000000..2fcd684
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp
@@ -0,0 +1,135 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ * B21 = -inv(A11)*A12*inv(A22)
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_32_PART2_R_bin = 0;
+size_t triple_dgemm_update_128_32_PART2_R_binSize = 0;
+
+const char * const triple_dgemm_update_128_32_PART2_R_src = STRINGIFY(
+static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+c[0] += alpha * b[0]; \n
+c[1] += alpha * b[1]; \n
+c[2] += alpha * b[2]; \n
+c[3] += alpha * b[3]; \n
+c[4] += alpha * b[4]; \n
+c[5] += alpha * b[5]; \n
+c[6] += alpha * b[6]; \n
+c[7] += alpha * b[7]; \n
+c[8] += alpha * b[8]; \n
+c[9] += alpha * b[9]; \n
+c[10] += alpha * b[10]; \n
+c[11] += alpha * b[11]; \n
+c[12] += alpha * b[12]; \n
+c[13] += alpha * b[13]; \n
+c[14] += alpha * b[14]; \n
+c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+ __kernel void TRIPLE_DGEMM_UPDATE_128_32_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages;\n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny*get_local_size(0); \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+
+//--------------------------part two---------------------------//
+{
+ // -inv(A11)*A12 -> A12
+ // A=inv(A11), B=A12, C=A12
+ __global double *A, *B, *C; \n
+ int lda = NB; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ A = d_dinvA; \n
+ B = C = d_dinvA + blk*NB; \n
+
+ A += ibx + id; \n
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ bs[inx + 8][iny] = B[8 + 0 * ldb]; \n
+ bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n
+ bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n
+ bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = (-1)*c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp
new file mode 100644
index 0000000..a5245ed
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp
@@ -0,0 +1,144 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ * B21 = -inv(A11)*A12*inv(A22)
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_64_PART1_R_bin = 0;
+size_t triple_dgemm_update_128_64_PART1_R_binSize = 0;
+
+const char * const triple_dgemm_update_128_64_PART1_R_src = STRINGIFY(
+static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+ c[0] += alpha * b[0]; \n
+ c[1] += alpha * b[1]; \n
+ c[2] += alpha * b[2]; \n
+ c[3] += alpha * b[3]; \n
+ c[4] += alpha * b[4]; \n
+ c[5] += alpha * b[5]; \n
+ c[6] += alpha * b[6]; \n
+ c[7] += alpha * b[7]; \n
+ c[8] += alpha * b[8]; \n
+ c[9] += alpha * b[9]; \n
+ c[10] += alpha * b[10]; \n
+ c[11] += alpha * b[11]; \n
+ c[12] += alpha * b[12]; \n
+ c[13] += alpha * b[13]; \n
+ c[14] += alpha * b[14]; \n
+ c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+ __kernel void TRIPLE_DGEMM_UPDATE_128_64_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages; \n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * 64; \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny * 16; \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+//--------------------------part one---------------------------//
+{
+ // A12*inv(A22) -> A12(d_dinvA)
+ // A=A12, B=inv(A22), C=A12
+ __global const double *A; \n
+ __global double *B, *C; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ int xa = page*blk * 2 + ibx + id; \n
+ int ya = page*blk * 2 + blk; \n
+ int incA = ya * lda + xa; \n
+
+ // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???)
+
+ int maxA; \n
+ if (xa < na)\n
+ maxA = lda*na; \n // macro READA will detect overflow on y dimension
+ else\n
+ maxA = 0; \n // there is already an overflow on xa
+
+#define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n
+
+ B = d_dinvA + blk*NB + blk; \n
+ C = d_dinvA + blk*NB; \n
+
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4]; \n
+ a[0] = READA; incA += lda; \n
+ a[1] = READA; incA += lda; \n
+ a[2] = READA; incA += lda; \n
+ a[3] = READA; incA += lda; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ daxpy(a[0], &bs[0][0], c); a[0] = READA; incA += lda;\n
+ daxpy(a[1], &bs[1][0], c); a[1] = READA; incA += lda;\n
+ daxpy(a[2], &bs[2][0], c); a[2] = READA; incA += lda;\n
+ daxpy(a[3], &bs[3][0], c); a[3] = READA; incA += lda;\n
+ \n
+ daxpy(a[0], &bs[4][0], c); a[0] = READA; incA += lda;\n
+ daxpy(a[1], &bs[5][0], c); a[1] = READA; incA += lda;\n
+ daxpy(a[2], &bs[6][0], c); a[2] = READA; incA += lda;\n
+ daxpy(a[3], &bs[7][0], c); a[3] = READA; incA += lda;\n
+ \n
+ daxpy(a[0], &bs[8][0], c); a[0] = READA; incA += lda;\n
+ daxpy(a[1], &bs[9][0], c); a[1] = READA; incA += lda;\n
+ daxpy(a[2], &bs[10][0], c); a[2] = READA; incA += lda; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = READA; incA += lda; \n
+
+ daxpy(a[0], &bs[12][0], c);\n
+ daxpy(a[1], &bs[13][0], c);\n
+ daxpy(a[2], &bs[14][0], c);\n
+ daxpy(a[3], &bs[15][0], c);\n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+#undef READA\n
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp
new file mode 100644
index 0000000..09e37c4
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp
@@ -0,0 +1,133 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ * B21 = -inv(A11)*A12*inv(A22)
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_64_PART2_R_bin = 0;
+size_t triple_dgemm_update_128_64_PART2_R_binSize = 0;
+
+const char * const triple_dgemm_update_128_64_PART2_R_src = STRINGIFY(
+ static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+ c[0] += alpha * b[0]; \n
+ c[1] += alpha * b[1]; \n
+ c[2] += alpha * b[2]; \n
+ c[3] += alpha * b[3]; \n
+ c[4] += alpha * b[4]; \n
+ c[5] += alpha * b[5]; \n
+ c[6] += alpha * b[6]; \n
+ c[7] += alpha * b[7]; \n
+ c[8] += alpha * b[8]; \n
+ c[9] += alpha * b[9]; \n
+ c[10] += alpha * b[10]; \n
+ c[11] += alpha * b[11]; \n
+ c[12] += alpha * b[12]; \n
+ c[13] += alpha * b[13]; \n
+ c[14] += alpha * b[14]; \n
+ c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+ __kernel void TRIPLE_DGEMM_UPDATE_128_64_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages; \n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * 64; \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny * 16; \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+
+//--------------------------part two---------------------------//
+{
+ // -inv(A11)*A12 -> A12
+ // A=inv(A11), B=A12, C=A12
+ __global const double *A; \n
+ __global double *B, *C; \n
+ int lda = NB; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ A = d_dinvA; \n
+ B = C = d_dinvA + blk*NB; \n
+
+ A += ibx + id; \n
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {
+ double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = (-1)*c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp
new file mode 100644
index 0000000..13d0233
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp
@@ -0,0 +1,143 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ * B21 = -inv(A11)*A12*inv(A22)
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_ABOVE64_PART1_R_bin = 0;
+size_t triple_dgemm_update_128_ABOVE64_PART1_R_binSize = 0;
+
+const char * const triple_dgemm_update_128_ABOVE64_PART1_R_src = STRINGIFY(
+static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+ c[0] += alpha * b[0]; \n
+ c[1] += alpha * b[1]; \n
+ c[2] += alpha * b[2]; \n
+ c[3] += alpha * b[3]; \n
+ c[4] += alpha * b[4]; \n
+ c[5] += alpha * b[5]; \n
+ c[6] += alpha * b[6]; \n
+ c[7] += alpha * b[7]; \n
+ c[8] += alpha * b[8]; \n
+ c[9] += alpha * b[9]; \n
+ c[10] += alpha * b[10]; \n
+ c[11] += alpha * b[11]; \n
+ c[12] += alpha * b[12]; \n
+ c[13] += alpha * b[13]; \n
+ c[14] += alpha * b[14]; \n
+ c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+__kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages; \n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * 64; \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny * 16; \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+//--------------------------part one---------------------------//
+{
+ // A12*inv(A22) -> A12(d_dinvA)
+ // A=A12, B=inv(A22), C=A12
+ __global const double *A; \n
+ __global double *B, *C; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ int xa = page*blk * 2 + ibx + id; \n
+ int ya = page*blk * 2 + blk; \n
+ int incA = ya * lda + xa; \n
+
+ // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???)
+
+ int maxA; \n
+ if (xa < na)\n
+ maxA = lda*na; \n // macro READA will detect overflow on y dimension
+ else\n
+ maxA = 0; \n // there is already an overflow on xa
+
+#define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n
+
+ B = d_dinvA + blk*NB + blk; \n
+ C = d_dinvA + blk*NB; \n
+
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4]; \n
+ a[0] = READA; incA += lda; \n
+ a[1] = READA; incA += lda; \n
+ a[2] = READA; incA += lda; \n
+ a[3] = READA; incA += lda; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ daxpy(a[0], &bs[0][0], c); a[0] = READA; incA += lda; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = READA; incA += lda; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = READA; incA += lda; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = READA; incA += lda; \n
+
+ daxpy(a[0], &bs[4][0], c); a[0] = READA; incA += lda; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = READA; incA += lda; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = READA; incA += lda; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = READA; incA += lda; \n
+
+ daxpy(a[0], &bs[8][0], c); a[0] = READA; incA += lda; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = READA; incA += lda; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = READA; incA += lda; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = READA; incA += lda; \n
+
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = c[i]; \n
+ C += ldc; \n
+ }\n
+ }\n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp
new file mode 100644
index 0000000..b485b50
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp
@@ -0,0 +1,134 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ * B21 = -inv(A11)*A12*inv(A22)
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_ABOVE64_PART2_R_bin = 0;
+size_t triple_dgemm_update_128_ABOVE64_PART2_R_binSize = 0;
+
+const char * const triple_dgemm_update_128_ABOVE64_PART2_R_src = STRINGIFY(
+ static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+ c[0] += alpha * b[0]; \n
+ c[1] += alpha * b[1]; \n
+ c[2] += alpha * b[2]; \n
+ c[3] += alpha * b[3]; \n
+ c[4] += alpha * b[4]; \n
+ c[5] += alpha * b[5]; \n
+ c[6] += alpha * b[6]; \n
+ c[7] += alpha * b[7]; \n
+ c[8] += alpha * b[8]; \n
+ c[9] += alpha * b[9]; \n
+ c[10] += alpha * b[10]; \n
+ c[11] += alpha * b[11]; \n
+ c[12] += alpha * b[12]; \n
+ c[13] += alpha * b[13]; \n
+ c[14] += alpha * b[14]; \n
+ c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+__kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages; \n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * 64; \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny * 16; \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+
+//--------------------------part two---------------------------//
+{ \n
+ // -inv(A11)*A12 -> A12
+ // A=inv(A11), B=A12, C=A12
+ __global const double *A; \n
+ __global double *B, *C; \n
+ int lda = NB; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ A = d_dinvA; \n
+ B = d_dinvA + blk*NB; \n
+ C = d_dinvA + blk; \n
+
+ A += ibx + id; \n
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = (-1)*c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp
new file mode 100644
index 0000000..7d09937
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp
@@ -0,0 +1,91 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ *
+ * part 3, copy data into position
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_ABOVE64_PART3_R_bin = 0;
+size_t triple_dgemm_update_128_ABOVE64_PART3_R_binSize = 0;
+
+const char * const triple_dgemm_update_128_ABOVE64_PART3_R_src = STRINGIFY(
+ static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+ c[0] += alpha * b[0]; \n
+ c[1] += alpha * b[1]; \n
+ c[2] += alpha * b[2]; \n
+ c[3] += alpha * b[3]; \n
+ c[4] += alpha * b[4]; \n
+ c[5] += alpha * b[5]; \n
+ c[6] += alpha * b[6]; \n
+ c[7] += alpha * b[7]; \n
+ c[8] += alpha * b[8]; \n
+ c[9] += alpha * b[9]; \n
+ c[10] += alpha * b[10]; \n
+ c[11] += alpha * b[11]; \n
+ c[12] += alpha * b[12]; \n
+ c[13] += alpha * b[13]; \n
+ c[14] += alpha * b[14]; \n
+ c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+__kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages;\n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * 64; \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny * 16; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+
+//--------------------------part two---------------------------//
+{
+ // -inv(A11)*A12 -> A12
+ // A=inv(A11), B=A12, C=A12
+ __global double *C_temp, *C_real; \n
+ int ldc = NB; \n
+
+ C_temp = d_dinvA + NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2)
+ + blk; \n
+
+ C_real = d_dinvA + NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + blk*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ C_temp += ibx + id + __mul(iby, ldc); \n
+ C_real += ibx + id + __mul(iby, ldc); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C_real[0] = C_temp[0]; \n
+ C_temp[0] = ZERO; \n
+ C_temp += ldc; \n
+ C_real += ldc; \n
+ }\n
+}\n
+
+}\n
+);
+#endif
diff --git a/src/library/blas/xtrsm.cc b/src/library/blas/xtrsm.cc
index 4fc408f..73f13a1 100644
--- a/src/library/blas/xtrsm.cc
+++ b/src/library/blas/xtrsm.cc
@@ -27,6 +27,7 @@
#include "TrtriClKernels.h"
#include "TrtriKernelSourceIncludes.h"
+#include <iostream>
// Transform a trsm in clblasRowMajor into a trsm in clblasColumnMajor:
@@ -164,6 +165,7 @@ static void makeKernel(
CL_CHECK(err)
}
else {
+ //std::cout << kernelSource << std::endl;
clProgram = clCreateProgramWithSource(
clContext,
1, &kernelSource,
@@ -305,12 +307,12 @@ cl_int call_kernel_triple_update192(
kernelBinary,
kernelBinarySize,
binaryBuildOptions);
-
+ /*
if (err != CL_SUCCESS) {
//printf( "create kernel %s failed with %d\n", kernel_name, err );
return err;
}
-
+ */
clSetKernelArg(*kernel, 0, sizeof(cl_mem), &A);
clSetKernelArg(*kernel, 1, sizeof(unsigned int), &offA);
clSetKernelArg(*kernel, 2, sizeof(cl_mem), &d_dinvA);
@@ -414,7 +416,7 @@ cl_int diag_dtrtri192(
switch (i) {
case 12:
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_12_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
- err = call_kernel_triple_update192(&triple_dgemm_update_192_12_R,
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_12_R_clKernel,
triple_dgemm_update_192_12_R_src,
TrtriBuildOptions,
(const unsigned char **)&triple_dgemm_update_192_12_R_bin,
@@ -422,12 +424,13 @@ cl_int diag_dtrtri192(
TrtribinBuildOptions,
queue,
A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
break;
case 24:
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
- err = call_kernel_triple_update192(&triple_dgemm_update_192_24_PART1_R,
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_24_PART1_R_clKernel,
triple_dgemm_update_192_24_PART1_R_src,
TrtriBuildOptions,
(const unsigned char **)&triple_dgemm_update_192_24_PART1_R_bin,
@@ -436,7 +439,7 @@ cl_int diag_dtrtri192(
queue,
A, offA, d_dinvA, i, lda, M, event);
CL_CHECK(err);
- err = call_kernel_triple_update192(&triple_dgemm_update_192_24_PART2_R,
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_24_PART2_R_clKernel,
triple_dgemm_update_192_24_PART2_R_src,
TrtriBuildOptions,
(const unsigned char **)&triple_dgemm_update_192_24_PART2_R_bin,
@@ -450,7 +453,7 @@ cl_int diag_dtrtri192(
case 48:
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_48_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_48_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
- err = call_kernel_triple_update192(&triple_dgemm_update_192_48_PART1_R,
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_48_PART1_R_clKernel,
triple_dgemm_update_192_48_PART1_R_src,
TrtriBuildOptions,
(const unsigned char **)&triple_dgemm_update_192_48_PART1_R_bin,
@@ -459,7 +462,7 @@ cl_int diag_dtrtri192(
queue,
A, offA, d_dinvA, i, lda, M, event);
CL_CHECK(err);
- err = call_kernel_triple_update192(&triple_dgemm_update_192_48_PART2_R,
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_48_PART2_R_clKernel,
triple_dgemm_update_192_48_PART2_R_src,
TrtriBuildOptions,
(const unsigned char **)&triple_dgemm_update_192_48_PART2_R_bin,
@@ -473,7 +476,7 @@ cl_int diag_dtrtri192(
case 96:
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_96_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_96_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
- err = call_kernel_triple_update192(&triple_dgemm_update_192_96_PART1_R,
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_96_PART1_R_clKernel,
triple_dgemm_update_192_96_PART1_R_src,
TrtriBuildOptions,
(const unsigned char **)&triple_dgemm_update_192_96_PART1_R_bin,
@@ -482,7 +485,7 @@ cl_int diag_dtrtri192(
queue,
A, offA, d_dinvA, i, lda, M, event);
CL_CHECK(err);
- err = call_kernel_triple_update192(&triple_dgemm_update_192_96_PART2_R,
+ err = call_kernel_triple_update192(&triple_dgemm_update_192_96_PART2_R_clKernel,
triple_dgemm_update_192_96_PART2_R_src,
TrtriBuildOptions,
(const unsigned char **)&triple_dgemm_update_192_96_PART2_R_bin,
@@ -677,6 +680,322 @@ static clblasStatus gpu_dtrsm192(
return clblasNotImplemented;
}
+cl_int call_kernel_triple_update128(
+ cl_kernel *kernel,
+ const char *kernelSource,
+ const char *sourceBuildOptions,
+ const unsigned char **kernelBinary,
+ size_t *kernelBinarySize,
+ const char *binaryBuildOptions,
+ const cl_command_queue queue,
+ cl_mem A,
+ unsigned int offA,
+ cl_mem d_dinvA,
+ int i,
+ unsigned int lda,
+ int M,
+ cl_event *event)
+{
+ cl_int err = 0;
+
+ unsigned int m = M;
+
+ int npages = M / (i * 2) + (M % (i * 2) != 0);
+ size_t globalLocal[2] = { (i <= 32) ? (i / 4) : 16, 4 };
+ size_t globalThreads[2] = { (i / (globalLocal[0] * globalLocal[1]))* globalLocal[0],
+ npages*(i / 16) * globalLocal[1] };
+
+ makeKernel(kernel,
+ queue,
+ kernelSource,
+ sourceBuildOptions,
+ kernelBinary,
+ kernelBinarySize,
+ binaryBuildOptions);
+ /*
+ if (err != CL_SUCCESS) {
+ //printf( "create kernel %s failed with %d\n", kernel_name, err );
+ return err;
+ }
+ */
+
+ clSetKernelArg(*kernel, 0, sizeof(cl_mem), &A);
+ clSetKernelArg(*kernel, 1, sizeof(unsigned int), &offA);
+ clSetKernelArg(*kernel, 2, sizeof(cl_mem), &d_dinvA);
+ clSetKernelArg(*kernel, 3, sizeof(int), &i);
+ clSetKernelArg(*kernel, 4, sizeof(unsigned int), &lda);
+ clSetKernelArg(*kernel, 5, sizeof(int), &npages);
+ clSetKernelArg(*kernel, 6, sizeof(unsigned int), &m);
+
+ err = clEnqueueNDRangeKernel(queue, *kernel, 2, NULL,
+ globalThreads, globalLocal,
+ 0, NULL, event);
+
+
+ return err;
+}
+
+
+cl_int diag_dtrtri128(
+ cl_command_queue queue,
+ int M,
+ clblasUplo uplo,
+ clblasDiag diag,
+ cl_mem A,
+ size_t offA,
+ cl_mem d_dinvA,
+ size_t lda,
+ int inner_block_size,
+ int outer_block_size,
+ cl_event *event)
+{
+ std::cout << "enter diag_dtrtri128 " << std::endl;
+ const char *diag_dtrtri_kernel_upper_KernelSource = NULL;
+ cl_kernel *diag_dtrtri_kernel_upper_ClKernel = NULL;
+ size_t diag_dtrtri_kernel_upper_KernelBinarySize = 0;
+ const unsigned char *diag_dtrtri_kernel_upper_KernelBinary = NULL;
+
+ const char *diag_dtrtri_kernel_lower_KernelSource = NULL;
+ cl_kernel *diag_dtrtri_kernel_lower_ClKernel = NULL;
+ size_t diag_dtrtri_kernel_lower_KernelBinarySize = 0;
+ const unsigned char *diag_dtrtri_kernel_lower_KernelBinary = NULL;
+
+ cl_int err = 0;
+
+ /*
+ This routine is used in dtrsm
+ */
+
+
+ int nthreads = (M / inner_block_size + (M % inner_block_size != 0)) * inner_block_size;
+ unsigned int m = M;
+
+ if (uplo == clblasLower) {
+
+ /*
+ cl_kernel diag_dtrtri_kernel_lower = clCreateKernel(prg, "DIAG_DTRTRI_KERNEL_LOWER", &err);
+ if (err != CL_SUCCESS) {
+ //printf( "create kernel -diag_dtrtri_kernel_lower- failed with %d\n", err );
+ return err;
+ }
+
+ int isDiagUnit = (diag == clblasUnit);
+ clSetKernelArg(diag_dtrtri_kernel_lower, 0, sizeof(int), &isDiagUnit);
+ clSetKernelArg(diag_dtrtri_kernel_lower, 1, sizeof(cl_mem), &A);
+ clSetKernelArg(diag_dtrtri_kernel_lower, 2, sizeof(unsigned int), &offA);
+ clSetKernelArg(diag_dtrtri_kernel_lower, 3, sizeof(cl_mem), &d_dinvA);
+ clSetKernelArg(diag_dtrtri_kernel_lower, 4, sizeof(unsigned int), &lda);
+ clSetKernelArg(diag_dtrtri_kernel_lower, 5, sizeof(unsigned int), &m);
+
+
+ size_t globalThreads[1] = { nthreads };
+ size_t globalLocal[1] = { BLOCK_SIZE };
+
+ err = clEnqueueNDRangeKernel(queue, diag_dtrtri_kernel_lower, 1, NULL,
+ globalThreads, globalLocal,
+ 0, NULL, event);
+
+ if (err != CL_SUCCESS) {
+ //printf( "kernel -diag_dtrtri_kernel_lower- failed with %d\n", err );
+ return err;
+ }
+
+ err = clReleaseKernel(diag_dtrtri_kernel_lower);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+
+ // update the inverse up to the size of BLOCK_SIZE
+ for (int i = BLOCK_SIZE; i < NB; i *= 2) {
+
+ switch (i) {
+ case 16:
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ break;
+
+ case 32:
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ break;
+
+ case 64:
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ break;
+
+ default:
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART3_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ break;
+
+ }
+ if (i * 2 >= M) break;
+ }
+ */
+ }
+ else {
+
+ diag_dtrtri_kernel_upper_KernelSource = diag_dtrtri_upper_128_16_src;
+ diag_dtrtri_kernel_upper_ClKernel = &diag_dtrtri_upper_128_16_clKernel;
+ diag_dtrtri_kernel_upper_KernelBinary = diag_dtrtri_upper_128_16_bin;
+ diag_dtrtri_kernel_upper_KernelBinarySize = diag_dtrtri_upper_128_16_binSize;
+
+ makeKernel(diag_dtrtri_kernel_upper_ClKernel,
+ queue,
+ diag_dtrtri_kernel_upper_KernelSource,
+ TrtriBuildOptions,
+ &diag_dtrtri_kernel_upper_KernelBinary,
+ &diag_dtrtri_kernel_upper_KernelBinarySize,
+ TrtribinBuildOptions);
+
+ int isDiagUnit = (diag == clblasUnit);
+ err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 0, sizeof(int), &isDiagUnit);
+ CL_CHECK(err);
+ err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 1, sizeof(cl_mem), &A);
+ CL_CHECK(err);
+ err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 2, sizeof(unsigned int), &offA);
+ CL_CHECK(err);
+ err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 3, sizeof(cl_mem), &d_dinvA);
+ CL_CHECK(err);
+ err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 4, sizeof(unsigned int), &lda);
+ CL_CHECK(err);
+ err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 5, sizeof(unsigned int), &m);
+ CL_CHECK(err);
+
+ size_t globalThreads[1] = { nthreads };
+ size_t globalLocal[1] = { inner_block_size };
+
+ err = clEnqueueNDRangeKernel(queue, *diag_dtrtri_kernel_upper_ClKernel, 1, NULL,
+ globalThreads, globalLocal,
+ 0, NULL, NULL);
+
+ if (err != CL_SUCCESS) {
+ //printf( "kernel -diag_dtrtri_kernel_upper- failed with %d\n", err );
+ return err;
+ }
+
+ //clReleaseKernel(diag_dtrtri_kernel_upper);
+ //if (err != CL_SUCCESS) {
+ // return err;
+ //}
+
+ // update the inverse up to the size of BLOCK_SIZE
+
+ for (int i = inner_block_size; i < outer_block_size; i *= 2) {
+
+ switch (i) {
+ case 16:
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_16_R_clKernel,
+ triple_dgemm_update_128_16_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_16_R_bin,
+ &triple_dgemm_update_128_16_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+
+ break;
+
+ case 32:
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_32_PART1_R_clKernel,
+ triple_dgemm_update_128_32_PART1_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_32_PART1_R_bin,
+ &triple_dgemm_update_128_32_PART1_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_32_PART2_R_clKernel,
+ triple_dgemm_update_128_32_PART2_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_32_PART2_R_bin,
+ &triple_dgemm_update_128_32_PART2_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+
+ break;
+
+ case 64:
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_64_PART1_R_clKernel,
+ triple_dgemm_update_128_64_PART1_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_64_PART1_R_bin,
+ &triple_dgemm_update_128_64_PART1_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_64_PART2_R_clKernel,
+ triple_dgemm_update_128_64_PART2_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_64_PART2_R_bin,
+ &triple_dgemm_update_128_64_PART2_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+
+ break;
+
+ default:
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART3_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_ABOVE64_PART1_R_clKernel,
+ triple_dgemm_update_128_ABOVE64_PART1_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_ABOVE64_PART1_R_bin,
+ &triple_dgemm_update_128_ABOVE64_PART1_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_ABOVE64_PART2_R_clKernel,
+ triple_dgemm_update_128_ABOVE64_PART2_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_ABOVE64_PART2_R_bin,
+ &triple_dgemm_update_128_ABOVE64_PART2_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_ABOVE64_PART3_R_clKernel,
+ triple_dgemm_update_128_ABOVE64_PART3_R_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_ABOVE64_PART3_R_bin,
+ &triple_dgemm_update_128_ABOVE64_PART3_R_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ break;
+ }
+
+ if (i * 2 >= M) break;
+ }
+
+ }
+
+ return err;
+
+}
+
static clblasStatus gpu_dtrsm128(
clblasOrder order,
clblasSide side,
@@ -688,10 +1007,10 @@ static clblasStatus gpu_dtrsm128(
double alpha,
const cl_mem A,
size_t offA,
- size_t lda,
+ size_t ldA,
cl_mem B,
size_t offB,
- size_t ldb,
+ size_t ldB,
cl_uint numCommandQueues,
cl_command_queue *commandQueues,
cl_uint numEventsInWaitList,
@@ -699,7 +1018,233 @@ static clblasStatus gpu_dtrsm128(
cl_event *events,
bool &specialCaseHandled)
{
+ if (order != clblasColumnMajor)
+ return clblasNotImplemented;
+ if (M < 16 || N < 16)
+ return clblasNotImplemented;
+
+ //for now
+ if (side == clblasRight)
+ return clblasNotImplemented;
+ if (uplo == clblasLower)
+ return clblasNotImplemented;
+
+ int inner_block_size = 16; // inner blocking size, <=32
+ int outer_block_size = 128;// outer blocking size, >BLOCK_SIZE
+
+ cl_int err = 0;
+
+ int i;
+ cl_context context;
+ err = getQueueContext(commandQueues[0], &context);
+ CL_CHECK(err);
+
+ /* quick return on wrong size */
+ if (M <= 0 || N <= 0)
+ return clblasInvalidDim;
+
+ double neg_one = -1.0;
+ double one = 1.0;
+ double zero = 0.0;
+ // Helper to compute pass the 3 arguments describing a (sub)-matrix to clblasDgemm
+#define _(M,i,j) M , (off##M + ((i)+(j)*ld##M) ) , ld##M
+
+ cl_mem InvA = 0;
+ cl_mem X = 0;
+ // X of size mxn will contain the result
+ size_t ldX = M;
+ size_t offX = 0; //must be 0: needed by the _(X,i,j) macro
+ size_t size_X = N*ldX * sizeof(double);
+ X = clCreateBuffer(context, CL_MEM_READ_WRITE, size_X, NULL, &err);
+ CL_CHECK(err);
+ err = clearBuffer(commandQueues[0], X, size_X);
+ CL_CHECK(err);
+
+ if (side == clblasLeft)
+ {
+ // side=L
+ /* invert the diagonals
+ * Allocate device memory for the inverted diagonal blocks, size=m*nb
+ */
+ size_t ldInvA = outer_block_size;
+ size_t offInvA = 0; //must be 0: needed by the _(X,i,j) macro
+ size_t size_InvA = ldInvA * BLOCKS(M, outer_block_size) * outer_block_size *sizeof(double);
+ InvA = clCreateBuffer(context, CL_MEM_READ_WRITE, size_InvA, NULL, &err);
+
+ CL_CHECK(err);
+ err = clearBuffer(commandQueues[0], InvA, size_InvA);
+ CL_CHECK(err);
+
+ err = diag_dtrtri128(commandQueues[0], N, uplo, diag, A, offA, InvA, ldA, inner_block_size, outer_block_size, events);
+ CL_CHECK(err);
+
+ //
+ // Helper for C = alpha * transp(A) * B + beta * C
+ //
+ // In the calls below:
+ // - the 1st matrix shall be either A or InvA transposed according to transA.
+ // - the 2nd and 3rd matrices are either B and X
+ //
+#define DGEMM_LEFT(m, n, k, alpha, A, B, beta, C) \
+ do { \
+ err = clblasDgemm(clblasColumnMajor, transA, clblasNoTrans , m, n, k, alpha, A, B, beta, C , 1, commandQueues, 0, NULL, events ) ; \
+ CL_CHECK(err); \
+ } while(0)
+
+
+ if (transA == clblasNoTrans)
+ {
+ /* the non-transpose case */
+ if (uplo == clblasLower)
+ {
+
+ /* the lower case */
+ /* handle the first block seperately with alpha */
+
+ //return for now
+ clReleaseMemObject(InvA);
+ clReleaseMemObject(X);
+ return clblasNotImplemented;
+
+ int mm = min(outer_block_size, (int)M);
+ DGEMM_LEFT(mm, N, mm, alpha, _(InvA, 0, 0), _(B, 0, 0), zero, _(X, 0, 0));
+
+ if (outer_block_size < M)
+ {
+ DGEMM_LEFT(M - outer_block_size, N, outer_block_size, neg_one, _(A, outer_block_size, 0), _(X, 0, 0), alpha, _(B, outer_block_size, 0));
+
+ /* the rest blocks */
+ for (i = outer_block_size; i < M; i += outer_block_size) {
+ mm = min((int)M - i, outer_block_size);
+ DGEMM_LEFT(mm, N, mm, one, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0));
+
+ if (i + outer_block_size >= M)
+ break;
+
+ DGEMM_LEFT(M - i - outer_block_size, N, outer_block_size, neg_one, _(A, i + outer_block_size, i), _(X, i, 0), one, _(B, i + outer_block_size, 0));
+ }
+
+ //check_last_error() ;
+ }
+
+
+ }
+ else // if ( uplo == clblasUpper)
+ {
+ /* the upper case */
+ /* handle the first block seperately with alpha */
+ std::cout << "dtrtri trsm " << std::endl;
+ int mm = (M % outer_block_size == 0) ? outer_block_size : (M % outer_block_size);
+ i = M - mm;
+ //DGEMM_LEFT(mm, N, mm, alpha, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0));
+
+ if (i - outer_block_size >= 0)
+ {
+ //DGEMM_LEFT(i, N, mm, neg_one, _(A, 0, i), _(X, i, 0), alpha, _(B, 0, 0));
+
+ /* the rest blocks */
+ for (i = M - mm - outer_block_size; i >= 0; i -= outer_block_size) {
+ //DGEMM_LEFT(outer_block_size, N, outer_block_size, one, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0));
+
+ if (i - outer_block_size < 0)
+ break;
+
+ //DGEMM_LEFT(i, N, outer_block_size, neg_one, _(A, 0, i), _(X, i, 0), one, _(B, 0, 0));
+ }
+ }
+ }
+ }
+ else
+ {
+ /* the transpose case */
+ if (uplo == clblasLower)
+ {
+ /* the lower case */
+ /* handle the first block seperately with alpha */
+
+ //return for now
+ clReleaseMemObject(InvA);
+ clReleaseMemObject(X);
+ return clblasNotImplemented;
+
+ int mm = (M % outer_block_size == 0) ? outer_block_size : (M % outer_block_size);
+ i = M - mm;
+ DGEMM_LEFT(mm, N, mm, alpha, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0));
+
+ if (i - outer_block_size >= 0)
+ {
+ DGEMM_LEFT(i, N, mm, neg_one, _(A, i, 0), _(X, i, 0), alpha, _(B, 0, 0));
+
+ /* the rest blocks */
+ for (i = M - mm - outer_block_size; i >= 0; i -= outer_block_size) {
+ DGEMM_LEFT(outer_block_size, N, outer_block_size, one, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0));
+
+ if (i - outer_block_size < 0)
+ break;
+
+ DGEMM_LEFT(i, N, outer_block_size, neg_one, _(A, i, 0), _(X, i, 0), one, _(B, 0, 0));
+ }
+ }
+ }
+ else
+ {
+ /* the upper case */
+ /* handle the first block seperately with alpha */
+ std::cout << "dtrtri trsm " << std::endl;
+ int mm = min(outer_block_size, (int)M);
+ DGEMM_LEFT(mm, N, mm, alpha, _(InvA, 0, 0), _(B, 0, 0), zero, _(X, 0, 0));
+
+ if (outer_block_size < M)
+ {
+
+ DGEMM_LEFT(M - outer_block_size, N, outer_block_size, neg_one, _(A, 0, outer_block_size), _(X, 0, 0), alpha, _(B, outer_block_size, 0));
+
+ /* the rest blocks */
+ for (i = outer_block_size; i < M; i += outer_block_size) {
+ mm = min((int)M - i, outer_block_size);
+ DGEMM_LEFT(mm, N, mm, one, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0));
+
+ if (i + outer_block_size >= M)
+ break;
+
+ DGEMM_LEFT(M - i - outer_block_size, N, outer_block_size, neg_one, _(A, i, i + outer_block_size), _(X, i, 0), one, _(B, i + outer_block_size, 0));
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ clReleaseMemObject(X);
+ return clblasNotImplemented;
+ }
+
+ // Copy X(m,n) to B(m,n)
+ {
+ size_t src_origin[3] = { 0, 0, 0 };
+ size_t dst_origin[3] = { offB*sizeof(double), 0, 0 };
+ size_t region[3] = { M*sizeof(double), N, 1 };
+
+
+ err = clEnqueueCopyBufferRect(commandQueues[0],
+ X,
+ B,
+ src_origin,
+ dst_origin,
+ region,
+ ldX*sizeof(double), 0,
+ ldB*sizeof(double), 0,
+ 0, NULL,
+ events);
+ CL_CHECK(err);
+
+ clReleaseMemObject(InvA);
+ clReleaseMemObject(X);
+
+ specialCaseHandled = true;
+ return clblasSuccess;
+ }
return clblasNotImplemented;
}
@@ -919,7 +1464,10 @@ clblasDtrsm(
*/
bool specialCaseHandled = false;
- clblasStatus SpecialCaseStatus = gpu_dtrsm192(order,
+ //outer block size = 192
+ //inner block size = 12
+ clblasStatus SpecialCaseStatus;
+ SpecialCaseStatus = gpu_dtrsm192(order,
side,
uplo,
transA,
@@ -936,7 +1484,25 @@ clblasDtrsm(
if (specialCaseHandled)
return SpecialCaseStatus;
+
+ SpecialCaseStatus = gpu_dtrsm128(order,
+ side,
+ uplo,
+ transA,
+ diag,
+ M, N,
+ alpha,
+ A, offA, lda,
+ B, offB, ldb,
+ numCommandQueues, commandQueues,
+ numEventsInWaitList,
+ eventWaitList,
+ events,
+ specialCaseHandled);
+ if (specialCaseHandled)
+ return SpecialCaseStatus;
+
CLBlasKargs kargs;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits
mailing list