[clblas] 15/75: Fixing issues for when Beta == 0 in sgemm special cases

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Tue Jan 24 23:30:31 UTC 2017


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/master
in repository clblas.

commit 9c66a77e1d51db8dd5a4f2039f9d64236d50edf6
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date:   Mon Jan 18 13:53:16 2016 -0500

    Fixing issues for when Beta == 0 in sgemm special cases
---
 ...sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp | 70 ++++++++++++----------
 ...sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp | 68 ++++++++++++---------
 ...sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp | 33 +++++-----
 .../sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp   | 20 ++++++-
 .../sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp   | 18 ++++--
 ...sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp | 58 +++++++++++-------
 6 files changed, 162 insertions(+), 105 deletions(-)

diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp
index 831b5dc..cc90ff9 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp
@@ -57,46 +57,46 @@ __kernel void sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH (
     float rC[2][2]  = { {(float)0} };
     float rA[1][2];
     float rB[1][2];
-    
 
-    
+
+
     A += offsetA;
     B += offsetB;
     C+=offsetC;
-    
+
     __local float lA[528];//16*32+16
     __local float lB[528];
-    
+
     uint gidx = get_group_id(0);
     uint gidy = get_group_id(1);
     uint idx = get_local_id(0);
     uint idy = get_local_id(1);
-	
+
 	int CurrentOffSetA = gidx*32+ idx;
 	int CurrentOffSetB = gidy*32+ idy;
 
     A +=  gidx*32+ idx + idy*lda;
     B +=  gidy*32*ldb+ idx + idy*ldb;
-    
-   
+
+
     uint block_k = K >> 4;
-    do 
+    do
 	{
         __local float* plA = lA + idy*33+idx;
         __local float* plB = lB + idx*33+idy;
         barrier(CLK_LOCAL_MEM_FENCE);
-		
+
         plB[0]  = CurrentOffSetB>=N?0.0:B[0];
         plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
-	   
+
 	    plA[0]  = CurrentOffSetA>=M?0.0:A[0];
         plA[16] = CurrentOffSetA+16>=M?0.0:A[16];
 
-        
+
         barrier(CLK_LOCAL_MEM_FENCE);
         uint offA = idx;
         uint offB = idy;
-		
+
         M2x2
 		M2x2
 		M2x2
@@ -123,26 +123,36 @@ __kernel void sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH (
     int offset_y = gidy*32+ idy;
     if(offset_x>=M || offset_y>=N )
       return;
-	
+
     C+=offset_x+offset_y*ldc;
-	
-    
+
+
 	int i = 0;
-    do 
-	{
-	  C[0     ] = mad(alpha, rC[i][0], beta*C[0]);
-	  if(offset_y+16<N)
-        C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
-      
-	  C+=16;
-	  offset_x+=16;
-	  if(offset_x>=M )
-        return;
-
-	    
-	}
-    while (++i < 2);
-   
+    if (beta != 0) {
+      do
+      {
+        C[0     ] = mad(alpha, rC[i][0], beta*C[0]);
+        if(offset_y+16<N)
+          C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
+          C+=16;
+          offset_x+=16;
+          if(offset_x>=M )
+            return;
+      }
+      while (++i < 2);
+    } else {
+      do
+      {
+        C[0     ] = alpha * rC[i][0];
+        if(offset_y+16<N)
+          C[16*ldc] = alpha * rC[i][1];
+        C+=16;
+        offset_x+=16;
+        if(offset_x>=M )
+          return;
+      }
+      while (++i < 2);
+    }
 }
 
 );
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp
index f50b580..f26ddec 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp
@@ -57,41 +57,41 @@ __kernel void sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH (
     float rC[2][2]  = { {(float)0} };
     float rA[1][2];
     float rB[1][2];
-    
-    
+
+
     A += offsetA;
     B += offsetB;
     C+=offsetC;
-    
+
     __local float lA[528];//16*32+16
     __local float lB[528];
-    
+
     uint gidx = get_group_id(0);
     uint gidy = get_group_id(1);
     uint idx = get_local_id(0);
     uint idy = get_local_id(1);
-    
+
 	int CurrentOffSetA = gidx*32+ idx;
 	int CurrentOffSetB = gidy*32+ idx;
-    
+
     A +=  gidx*32+ idx + idy*lda;
     B +=  gidy*32+ idx + idy*ldb;
-    
-   
+
+
     uint block_k = K >> 4;
-    do 
+    do
 	{
         __local float* plA = lA + idy*33+idx;
         __local float* plB = lB + idy*33+idx;
         barrier(CLK_LOCAL_MEM_FENCE);
-		
+
         plB[0]  = CurrentOffSetB>=N?0.0:B[0];
         plB[16] = CurrentOffSetB+16>=N?0.0:B[16];
-	   
+
 	    plA[0]  = CurrentOffSetA>=M?0.0:A[0];
         plA[16] = CurrentOffSetA+16>=M?0.0:A[16];
 
-        
+
         barrier(CLK_LOCAL_MEM_FENCE);
         uint offA = idx;
         uint offB = idy;
@@ -126,23 +126,35 @@ __kernel void sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH (
       return;
 
     C+=offset_x+offset_y*ldc;
-    
-	int i = 0;
-    do 
-	{
-	  C[0     ] = mad(alpha, rC[i][0], beta*C[0]);
-	  if(offset_y+16<N)
-        C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
-      
-	  C+=16;
-	  offset_x+=16;
-	  if(offset_x>=M )
-        return;
-
-	    
-	}
-    while (++i < 2);
 
+	int i = 0;
+    if (beta !=0 ) {
+      do
+      {
+        C[0     ] = mad(alpha, rC[i][0], beta*C[0]);
+        if(offset_y+16<N)
+          C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
+
+        C+=16;
+        offset_x+=16;
+        if(offset_x>=M )
+          return;
+      }
+      while (++i < 2);
+    } else {
+      do
+      {
+        C[0     ] = alpha * rC[i][0];
+        if(offset_y+16<N)
+          C[16*ldc] = alpha * rC[i][1];
+
+        C+=16;
+        offset_x+=16;
+        if(offset_x>=M )
+          return;
+      }
+      while (++i < 2);
+    }
 }
 );
 #endif
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp
index be5b219..a01958f 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp
@@ -133,25 +133,26 @@ __kernel void sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE (
 	int offset_x = gidx * 64 + idx;
 	int offset_y = gidy * 64 + idy;
 
-	//if(offset_x>=M || offset_y>=N )
-	//  return;
-
 	C += offset_x + offset_y*ldc;
 
 	int i = 0;
-	do
-	{
-		C[0] = mad(alpha, rC[i][0], beta*C[0]);
-		C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]);
-
-
-		C += 16;
-		offset_x += 16;
-		//if(offset_x>=M )
-		//  return;
-
-
-	} while (++i < 2);
+    if (beta != 0) {
+      do
+      {
+        C[0] = mad(alpha, rC[i][0], beta*C[0]);
+        C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]);
+        C += 16;
+        offset_x += 16;
+      } while (++i < 2);
+    } else {
+      do
+      {
+        C[0] = alpha * rC[i][0];
+        C[16 * ldc] = alpha * rC[i][1];
+        C += 16;
+        offset_x += 16;
+      } while (++i < 2);
+    }
 
 }
 );
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp
index 5c41406..ae477cb 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp
@@ -145,8 +145,9 @@ __kernel void sgemm_Col_NT_B1_MX032_NX064_KX16_ROW (
 	C += offset_x + offset_y*ldc;
 
 	int i = 0;
-	do
-	{
+    if (beta != 0) {
+      do
+      {
 		C[0] = mad(alpha, rC[i][0], beta*C[0]);
 		C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]);
 		C[32 * ldc] = mad(alpha, rC[i][2], beta*C[32 * ldc]);
@@ -155,7 +156,20 @@ __kernel void sgemm_Col_NT_B1_MX032_NX064_KX16_ROW (
 		offset_x += 16;
 		//if(offset_x>=M )
 		//  return;
-	} while (++i < 2);
+      } while (++i < 2);
+    } else {
+      do
+      {
+        C[0] = alpha * rC[i][0];
+		C[16 * ldc] = alpha * rC[i][1];
+		C[32 * ldc] = alpha * rC[i][2];
+		C[48 * ldc] = alpha * rC[i][3];
+		C += 16;
+		offset_x += 16;
+		//if(offset_x>=M )
+		//  return;
+      } while (++i < 2);
+    }
 }
 );
 #endif
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp
index 2c9e9ff..3d39977 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp
@@ -143,15 +143,21 @@ __kernel void sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN (
 	C += offset_x + offset_y*ldc;
 
 	int i = 0;
-	do
-	{
+    if (beta != 0) {
+  	  do
+      {
 		C[0] = mad(alpha, rC[i][0], beta*C[0]);
 		C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]);
-
 		C += 16;
-
-	} while (++i < 4);
-
+      } while (++i < 4);
+    } else {
+      do
+      {
+        C[0] = alpha * rC[i][0];
+		C[16 * ldc] = alpha * rC[i][1];
+		C += 16;
+      } while (++i < 4);
+    }
 }
 );
 #endif
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp
index 370ca88..a41a09e 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp
@@ -57,34 +57,34 @@ __kernel void sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src (
     float rC[2][2]  = { {(float)0} };
     float rA[1][2];
     float rB[1][2];
-    
-    
+
+
     A += offsetA;
     B += offsetB;
     C+=offsetC;
-    
+
     __local float lA[528];//16*32+16
     __local float lB[528];
-    
+
     uint gidx = get_group_id(0);
     uint gidy = get_group_id(1);
     uint idx = get_local_id(0);
     uint idy = get_local_id(1);
-    
+
     int CurrentOffSetA = gidx*32+ idy;
     int CurrentOffSetB = gidy*32+ idy;
 
     A +=  (gidx*32+idy)*lda + idx;
     B +=  (gidy*32+idy)*ldb + idx;
-    
-   
+
+
     uint block_k = K >> 4;
-    do 
+    do
     {
       __local float* plA = lA + idx*33+idy;
       __local float* plB = lB + idx*33+idy;
       barrier(CLK_LOCAL_MEM_FENCE);
-  
+
       plB[0]  = CurrentOffSetB>=N?0.0:B[0];
       plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
 
@@ -127,21 +127,35 @@ __kernel void sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src (
 
     C+=offset_x+offset_y*ldc;
 
-    int i = 0;
-    do 
-    {
-      C[0     ] = mad(alpha, rC[i][0], beta*C[0]);
-      if(offset_y+16<N)
-        C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
-
-      C+=16;
-      offset_x+=16;
-      if(offset_x>=M )
-        return;
-
 
+    int i = 0;
+    if (beta != 0) {
+      do
+      {
+        C[0   ] = mad(alpha, rC[i][0], beta*C[0]);
+        if(offset_y+16<N)
+          C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
+
+        C+=16;
+        offset_x+=16;
+        if(offset_x>=M )
+          return;
+      }
+      while (++i < 2);
+    } else {
+      do
+      {
+        C[0   ] = alpha * rC[i][0];
+        if(offset_y+16<N)
+          C[16*ldc] = alpha * rC[i][1];
+
+        C+=16;
+        offset_x+=16;
+        if(offset_x>=M )
+          return;
+      }
+      while (++i < 2);
     }
-    while (++i < 2);
 }
 
 );

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git



More information about the debian-science-commits mailing list