[clblas] 15/61: improve big sgemm column NN perf. improve small sgemm NN perf.

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Fri Jul 24 22:49:44 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clblas.

commit 39b324d184bb23b5769767316a9b7e3e40e62439
Author: Timmy <timmy.liu at amd.com>
Date:   Tue Apr 21 17:41:11 2015 -0500

    improve big sgemm column NN perf. improve small sgemm NN perf.
---
 src/library/blas/functor/hawaii.cc                 |   6 +-
 .../blas/functor/hawaii_sgemmBranchKernel.cc       |  24 +++-
 .../gens/clTemplates/sgemm_gcn_SmallMatrices.cl    | 125 +++++++++++++++++++++
 .../gens/clTemplates/sgemm_hawaiiSplitKernel.cl    |   3 +-
 4 files changed, 153 insertions(+), 5 deletions(-)

diff --git a/src/library/blas/functor/hawaii.cc b/src/library/blas/functor/hawaii.cc
index d7b32de..7f0e58c 100644
--- a/src/library/blas/functor/hawaii.cc
+++ b/src/library/blas/functor/hawaii.cc
@@ -101,7 +101,7 @@ clblasSgemmFunctor * FunctorSelectorHawaii::select_sgemm_specific(clblasSgemmFun
   //TODO: the logic below is complicated; Needs cleanup;
   clblasSgemmFunctor * functor;
   bool Not_TT = ((args.transA==clblasNoTrans && args.transB==clblasTrans ) || ( args.transA==clblasNoTrans && args.transB==clblasNoTrans ) || ( args.transA==clblasTrans && args.transB==clblasNoTrans ));
-  bool SmallMatrices = args.M/6*args.N/6<200*200 || ((args.M%64!=0 && args.N%64!=0 && args.M<1900 &&args.N<1900 ) && (args.M%96!=0 && args.N%96!=0 && args.M<1900 &&args.N<1900 ));
+  bool SmallMatrices = args.M/6*args.N/6<150*150 || ((args.M%64!=0 && args.N%64!=0 && args.M<1900 &&args.N<1900 ) && (args.M%96!=0 && args.N%96!=0 && args.M<1900 &&args.N<1900 ));
   bool SmallMatricesMod32= (SmallMatrices && (args.M%32==0&&args.N%32==0)) ;
   SmallMatricesMod32 = SmallMatricesMod32&&Not_TT&&args.K % 16 == 0;
   //SmallMatrices= false;
@@ -109,8 +109,8 @@ clblasSgemmFunctor * FunctorSelectorHawaii::select_sgemm_specific(clblasSgemmFun
   bool useSpliKernel=((args.M%96==0 && args.N%96==0) || !(args.M%64==0 && args.N%64==0&& args.M<4000 &&args.N<4000)) /*&&args.K%16==0*/;
   useSpliKernel=useSpliKernel&&Not_TT;
   
-  //the English translation of below is: if small matrix that is not mod32 and NT and K has to be mod 16
-  if (SmallMatrices && (!SmallMatricesMod32) && (args.transA == clblasNoTrans && args.transB == clblasTrans) && (args.K%16 == 0))
+  //the English translation of below is: if small matrix that is (not mod32) and (NT or NN) and K has to be mod 16
+  if (SmallMatrices && (!SmallMatricesMod32) && (args.transA == clblasNoTrans) && (args.K%16 == 0))
   {
 	  functor = clBlashawaiiSgemmBranchKernelFunctor::provide(args, "Hawaii");
 	  if (functor)
diff --git a/src/library/blas/functor/hawaii_sgemmBranchKernel.cc b/src/library/blas/functor/hawaii_sgemmBranchKernel.cc
index 8a93bf1..34c5f33 100644
--- a/src/library/blas/functor/hawaii_sgemmBranchKernel.cc
+++ b/src/library/blas/functor/hawaii_sgemmBranchKernel.cc
@@ -122,8 +122,30 @@ static const Variant * select_variant_BranchKernel(clblasSgemmFunctor::Args & ar
 		{
 
 			// ===== sgemm NN ======
-			// currently not supported
+			// sgemm_NN_32_32_16_16x16_2x2__ALPHABETA_BRANCH
+			const char* KName_NT = SGEMM_KERNEL_NAME(N, N, 32, 32, 16, 16, 16, 2, 2, __ALPHABETA, BRANCH);
+			const char* KBin_NN64;
+			size_t KBin_NNSize64 = 0;
+#if BUILD_KERNEL_FROM_STRING
+			//currently not supported
 			return NULL;
+#else
+			if (!strcmp(DevName, "Hawaii"))
+			{
+				//KBin_NT64             = SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHABETA,  64, HAWAII) ;
+				KBin_NN64 = sgemm_NN_32_32_16_16x16_2x2__ALPHABETA_BRANCH_64_bin_Hawaii;
+				KBin_NNSize64 = sizeof(sgemm_NN_32_32_16_16x16_2x2__ALPHABETA_BRANCH_64_bin_Hawaii);
+
+			}
+#endif
+			static const Variant variant = SGEMM_VARIANT_OBJ(N, N, 16, 16, 16, 2, 2, 64, __ALPHABETA,
+				KName_NT,
+				NULL,
+				NULL,
+				KBin_NN64,
+				KBin_NNSize64);
+
+			return &variant;
 		}
 		if (args.transB == clblasTrans)
 		{
diff --git a/src/library/blas/gens/clTemplates/sgemm_gcn_SmallMatrices.cl b/src/library/blas/gens/clTemplates/sgemm_gcn_SmallMatrices.cl
index 9999282..17aa291 100644
--- a/src/library/blas/gens/clTemplates/sgemm_gcn_SmallMatrices.cl
+++ b/src/library/blas/gens/clTemplates/sgemm_gcn_SmallMatrices.cl
@@ -567,6 +567,131 @@ __kernel void sgemm_NN_32_32_16_16x16_2x2__ALPHA( __global float const * restric
 
 ";
 
+static const char * sgemm_NN_32_32_16_16x16_2x2__ALPHABETA_BRANCH = "
+
+#define  M2x2 \
+            rA[0][0] = lA[offA + 0];				  \
+            rA[0][1] = lA[offA + 16];				  \
+            rB[0][0] = lB[offB + 0];				  \
+            rB[0][1] = lB[offB + 16];				  \
+            offA += 33;								  \
+            offB += 33;								  \
+            rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \
+            rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \
+            rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \
+            rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \
+			mem_fence(CLK_LOCAL_MEM_FENCE);
+
+__attribute__((reqd_work_group_size(16,16,1)))
+
+__kernel void sgemm_NN_32_32_16_16x16_2x2__ALPHABETA_BRANCH( __global float const * restrict A,
+  __global float const * restrict B,
+  __global float * C,
+  uint const M,
+  uint const N,
+  uint const K,
+  float const alpha,
+  float const beta,
+  uint lda,
+  uint ldb,
+  uint ldc,
+  uint offsetA,
+  uint offsetB,
+  uint offsetC)
+{
+    float rC[2][2]  = {(float)0};
+    float rA[1][2];
+    float rB[1][2];
+    
+
+    
+    A += offsetA;
+    B += offsetB;
+    C+=offsetC;
+    
+    __local float lA[528];//16*32+16
+    __local float lB[528];
+    
+    uint gidx = get_group_id(0);
+    uint gidy = get_group_id(1);
+    uint idx = get_local_id(0);
+    uint idy = get_local_id(1);
+	
+	int CurrentOffSetA = gidx*32+ idx;
+	int CurrentOffSetB = gidy*32+ idy;
+
+    A +=  gidx*32+ idx + idy*lda;
+    B +=  gidy*32*ldb+ idx + idy*ldb;
+    
+   
+    uint block_k = K >> 4;
+    do 
+	{
+        __local float* plA = lA + idy*33+idx;
+        __local float* plB = lB + idx*33+idy;
+        barrier(CLK_LOCAL_MEM_FENCE);
+		
+        plB[0]  = CurrentOffSetB>=N?0.0:B[0];
+        plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
+	   
+	    plA[0]  = CurrentOffSetA>=M?0.0:A[0];
+        plA[16] = CurrentOffSetA+16>=M?0.0:A[16];
+
+        
+        barrier(CLK_LOCAL_MEM_FENCE);
+        uint offA = idx;
+        uint offB = idy;
+		
+        M2x2
+		M2x2
+		M2x2
+		M2x2
+		M2x2
+		M2x2
+		M2x2
+		M2x2
+		M2x2
+		M2x2
+		M2x2
+		M2x2
+		M2x2
+		M2x2
+		M2x2
+		M2x2
+
+        A += lda<<4;
+        B += 16;
+    //}
+	} while (--block_k > 0);
+
+	int offset_x = gidx*32+idx;
+    int offset_y = gidy*32+ idy;
+    if(offset_x>=M || offset_y>=N )
+      return;
+	
+    C+=offset_x+offset_y*ldc;
+	
+    
+	int i = 0;
+    do 
+	{
+	  C[0     ] = mad(alpha, rC[i][0], beta*C[0]);
+	  if(offset_y+16<N)
+        C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
+      
+	  C+=16;
+	  offset_x+=16;
+	  if(offset_x>=M )
+        return;
+
+	    
+	}
+    while (++i < 2);
+   
+}
+
+";
+
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 static const char * sgemm_TN_32_32_16_16x16_2x2__ALPHABETA = "
 
diff --git a/src/library/blas/gens/clTemplates/sgemm_hawaiiSplitKernel.cl b/src/library/blas/gens/clTemplates/sgemm_hawaiiSplitKernel.cl
index 1158598..47884c5 100644
--- a/src/library/blas/gens/clTemplates/sgemm_hawaiiSplitKernel.cl
+++ b/src/library/blas/gens/clTemplates/sgemm_hawaiiSplitKernel.cl
@@ -2546,7 +2546,7 @@ static const char * sgemm_NN_16_SPLIT__ALPHABETA = "
             rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
             rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
             rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
-			barrier(CLK_LOCAL_MEM_FENCE);
+			mem_fence(CLK_LOCAL_MEM_FENCE);
 
 __attribute__((reqd_work_group_size(16,16,1)))
 __kernel void sgemm_NN_96_96_16_16x16_6x6__ALPHABETA_SPLIT_MAIN( __global float const * restrict A,
@@ -2591,6 +2591,7 @@ __kernel void sgemm_NN_96_96_16_16x16_6x6__ALPHABETA_SPLIT_MAIN( __global float
 	{
         __local float* plA = lA + idy*97+idx;
         __local float* plB = lB + idx*97+idy;
+		barrier(CLK_LOCAL_MEM_FENCE);
         plB[0] = B[0];
         plB[16] = B[16*ldb];
         plB[32] = B[32*ldb];

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git



More information about the debian-science-commits mailing list