[clblas] 14/75: Fixing issue with beta == 0 in UserGemm kernels

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Tue Jan 24 23:30:31 UTC 2017


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/master
in repository clblas.

commit 627c6545591d16da159bc6f479070441916f827c
Author: Shehzan Mohammed <shehzan at arrayfire.com>
Date:   Fri Jan 15 20:28:13 2016 -0500

    Fixing issue with beta == 0 in UserGemm kernels
    
    Related to 1af16a8bdf3ef3b356054a9038afd2f0b94b0627
---
 .../sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp       | 72 +++++++++++-----------
 .../sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp       | 72 +++++++++++-----------
 .../sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp       | 72 +++++++++++-----------
 3 files changed, 108 insertions(+), 108 deletions(-)

diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp
index 5151bde..c1f9256 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp
@@ -161,47 +161,47 @@ __kernel void sgemm_Col_NN_B0_MX096_NX096_KX16 (
     C+= gidy*96*ldc;
     C+= idy*ldc;
 
-    C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc];
-    C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc];
-    C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc];
-    C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc];
-    C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc];
-    C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc];
+    C[0 *ldc] = alpha*rC[0][0];
+    C[16*ldc] = alpha*rC[0][1];
+    C[32*ldc] = alpha*rC[0][2];
+    C[48*ldc] = alpha*rC[0][3];
+    C[64*ldc] = alpha*rC[0][4];
+    C[80*ldc] = alpha*rC[0][5];
     C+=16;
-    C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc];
-    C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc];
-    C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc];
-    C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc];
-    C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc];
-    C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc];
+    C[0 *ldc] = alpha*rC[1][0];
+    C[16*ldc] = alpha*rC[1][1];
+    C[32*ldc] = alpha*rC[1][2];
+    C[48*ldc] = alpha*rC[1][3];
+    C[64*ldc] = alpha*rC[1][4];
+    C[80*ldc] = alpha*rC[1][5];
     C+=16;
-    C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc];
-    C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc];
-    C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc];
-    C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc];
-    C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc];
-    C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc];
+    C[0 *ldc] = alpha*rC[2][0];
+    C[16*ldc] = alpha*rC[2][1];
+    C[32*ldc] = alpha*rC[2][2];
+    C[48*ldc] = alpha*rC[2][3];
+    C[64*ldc] = alpha*rC[2][4];
+    C[80*ldc] = alpha*rC[2][5];
     C+=16;
-    C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc];
-    C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc];
-    C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc];
-    C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc];
-    C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc];
-    C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc];
+    C[0 *ldc] = alpha*rC[3][0];
+    C[16*ldc] = alpha*rC[3][1];
+    C[32*ldc] = alpha*rC[3][2];
+    C[48*ldc] = alpha*rC[3][3];
+    C[64*ldc] = alpha*rC[3][4];
+    C[80*ldc] = alpha*rC[3][5];
     C+=16;
-    C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc];
-    C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc];
-    C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc];
-    C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc];
-    C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc];
-    C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc];
+    C[0 *ldc] = alpha*rC[4][0];
+    C[16*ldc] = alpha*rC[4][1];
+    C[32*ldc] = alpha*rC[4][2];
+    C[48*ldc] = alpha*rC[4][3];
+    C[64*ldc] = alpha*rC[4][4];
+    C[80*ldc] = alpha*rC[4][5];
     C+=16;
-    C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc];
-    C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc];
-    C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc];
-    C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc];
-    C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc];
-    C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc];
+    C[0 *ldc] = alpha*rC[5][0];
+    C[16*ldc] = alpha*rC[5][1];
+    C[32*ldc] = alpha*rC[5][2];
+    C[48*ldc] = alpha*rC[5][3];
+    C[64*ldc] = alpha*rC[5][4];
+    C[80*ldc] = alpha*rC[5][5];
 
 }
 );
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp
index d22eca6..a8d0fec 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp
@@ -163,47 +163,47 @@ __kernel void sgemm_Col_NT_B0_MX096_NX096_KX16 (
     C+= gidy*96*ldc;
     C+= idy*ldc;
 
-	  C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc];
-    C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc];
-    C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc];
-    C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc];
-    C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc];
-    C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc];
+    C[0*ldc] = alpha*rC[0][0];
+    C[16*ldc] = alpha*rC[0][1];
+    C[32*ldc] = alpha*rC[0][2];
+    C[48*ldc] = alpha*rC[0][3];
+    C[64*ldc] = alpha*rC[0][4];
+    C[80*ldc] = alpha*rC[0][5];
     C+=16;
-    C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc];
-    C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc];
-    C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc];
-    C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc];
-    C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc];
-    C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc];
+    C[0*ldc] = alpha*rC[1][0];
+    C[16*ldc] = alpha*rC[1][1];
+    C[32*ldc] = alpha*rC[1][2];
+    C[48*ldc] = alpha*rC[1][3];
+    C[64*ldc] = alpha*rC[1][4];
+    C[80*ldc] = alpha*rC[1][5];
     C+=16;
-    C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc];
-    C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc];
-    C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc];
-    C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc];
-    C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc];
-    C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc];
+    C[0*ldc] = alpha*rC[2][0];
+    C[16*ldc] = alpha*rC[2][1];
+    C[32*ldc] = alpha*rC[2][2];
+    C[48*ldc] = alpha*rC[2][3];
+    C[64*ldc] = alpha*rC[2][4];
+    C[80*ldc] = alpha*rC[2][5];
     C+=16;
-    C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc];
-    C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc];
-    C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc];
-    C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc];
-    C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc];
-    C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc];
+    C[0*ldc] = alpha*rC[3][0];
+    C[16*ldc] = alpha*rC[3][1];
+    C[32*ldc] = alpha*rC[3][2];
+    C[48*ldc] = alpha*rC[3][3];
+    C[64*ldc] = alpha*rC[3][4];
+    C[80*ldc] = alpha*rC[3][5];
     C+=16;
-    C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc];
-    C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc];
-    C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc];
-    C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc];
-    C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc];
-    C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc];
+    C[0*ldc] = alpha*rC[4][0];
+    C[16*ldc] = alpha*rC[4][1];
+    C[32*ldc] = alpha*rC[4][2];
+    C[48*ldc] = alpha*rC[4][3];
+    C[64*ldc] = alpha*rC[4][4];
+    C[80*ldc] = alpha*rC[4][5];
     C+=16;
-    C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc];
-    C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc];
-    C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc];
-    C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc];
-    C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc];
-    C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc];
+    C[0*ldc] = alpha*rC[5][0];
+    C[16*ldc] = alpha*rC[5][1];
+    C[32*ldc] = alpha*rC[5][2];
+    C[48*ldc] = alpha*rC[5][3];
+    C[64*ldc] = alpha*rC[5][4];
+    C[80*ldc] = alpha*rC[5][5];
 
 }
 );
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp
index 2668bfa..48323fc 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp
@@ -162,47 +162,47 @@ __kernel void sgemm_Col_TN_B0_MX096_NX096_KX16 (
   C+= gidy*96*ldc;
   C+= idy*ldc;
 
-  C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc];
-  C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc];
-  C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc];
-  C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc];
-  C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc];
-  C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc];
+  C[0*ldc] = alpha*rC[0][0];
+  C[16*ldc] = alpha*rC[0][1];
+  C[32*ldc] = alpha*rC[0][2];
+  C[48*ldc] = alpha*rC[0][3];
+  C[64*ldc] = alpha*rC[0][4];
+  C[80*ldc] = alpha*rC[0][5];
   C+=16;
-  C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc];
-  C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc];
-  C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc];
-  C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc];
-  C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc];
-  C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc];
+  C[0*ldc] = alpha*rC[1][0];
+  C[16*ldc] = alpha*rC[1][1];
+  C[32*ldc] = alpha*rC[1][2];
+  C[48*ldc] = alpha*rC[1][3];
+  C[64*ldc] = alpha*rC[1][4];
+  C[80*ldc] = alpha*rC[1][5];
   C+=16;
-  C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc];
-  C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc];
-  C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc];
-  C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc];
-  C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc];
-  C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc];
+  C[0*ldc] = alpha*rC[2][0];
+  C[16*ldc] = alpha*rC[2][1];
+  C[32*ldc] = alpha*rC[2][2];
+  C[48*ldc] = alpha*rC[2][3];
+  C[64*ldc] = alpha*rC[2][4];
+  C[80*ldc] = alpha*rC[2][5];
   C+=16;
-  C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc];
-  C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc];
-  C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc];
-  C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc];
-  C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc];
-  C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc];
+  C[0*ldc] = alpha*rC[3][0];
+  C[16*ldc] = alpha*rC[3][1];
+  C[32*ldc] = alpha*rC[3][2];
+  C[48*ldc] = alpha*rC[3][3];
+  C[64*ldc] = alpha*rC[3][4];
+  C[80*ldc] = alpha*rC[3][5];
   C+=16;
-  C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc];
-  C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc];
-  C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc];
-  C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc];
-  C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc];
-  C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc];
+  C[0*ldc] = alpha*rC[4][0];
+  C[16*ldc] = alpha*rC[4][1];
+  C[32*ldc] = alpha*rC[4][2];
+  C[48*ldc] = alpha*rC[4][3];
+  C[64*ldc] = alpha*rC[4][4];
+  C[80*ldc] = alpha*rC[4][5];
   C+=16;
-  C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc];
-  C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc];
-  C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc];
-  C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc];
-  C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc];
-  C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc];
+  C[0*ldc] = alpha*rC[5][0];
+  C[16*ldc] = alpha*rC[5][1];
+  C[32*ldc] = alpha*rC[5][2];
+  C[48*ldc] = alpha*rC[5][3];
+  C[64*ldc] = alpha*rC[5][4];
+  C[80*ldc] = alpha*rC[5][5];
 
 }
 );

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git



More information about the debian-science-commits mailing list