[clblas] 15/75: Fixing issues for when Beta == 0 in sgemm special cases
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Jan 24 23:30:31 UTC 2017
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/master in repository clblas.
commit 9c66a77e1d51db8dd5a4f2039f9d64236d50edf6
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date: Mon Jan 18 13:53:16 2016 -0500
Fixing issues for when Beta == 0 in sgemm special cases
---
...sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp | 70 ++++++++++++----------
...sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp | 68 ++++++++++++---------
...sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp | 33 +++++-----
.../sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp | 20 ++++++-
.../sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp | 18 ++++--
...sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp | 58 +++++++++++-------
6 files changed, 162 insertions(+), 105 deletions(-)
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp
index 831b5dc..cc90ff9 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp
@@ -57,46 +57,46 @@ __kernel void sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH (
float rC[2][2] = { {(float)0} };
float rA[1][2];
float rB[1][2];
-
-
+
+
A += offsetA;
B += offsetB;
C+=offsetC;
-
+
__local float lA[528];//16*32+16
__local float lB[528];
-
+
uint gidx = get_group_id(0);
uint gidy = get_group_id(1);
uint idx = get_local_id(0);
uint idy = get_local_id(1);
-
+
int CurrentOffSetA = gidx*32+ idx;
int CurrentOffSetB = gidy*32+ idy;
A += gidx*32+ idx + idy*lda;
B += gidy*32*ldb+ idx + idy*ldb;
-
-
+
+
uint block_k = K >> 4;
- do
+ do
{
__local float* plA = lA + idy*33+idx;
__local float* plB = lB + idx*33+idy;
barrier(CLK_LOCAL_MEM_FENCE);
-
+
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
-
+
plA[0] = CurrentOffSetA>=M?0.0:A[0];
plA[16] = CurrentOffSetA+16>=M?0.0:A[16];
-
+
barrier(CLK_LOCAL_MEM_FENCE);
uint offA = idx;
uint offB = idy;
-
+
M2x2
M2x2
M2x2
@@ -123,26 +123,36 @@ __kernel void sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH (
int offset_y = gidy*32+ idy;
if(offset_x>=M || offset_y>=N )
return;
-
+
C+=offset_x+offset_y*ldc;
-
-
+
+
int i = 0;
- do
- {
- C[0 ] = mad(alpha, rC[i][0], beta*C[0]);
- if(offset_y+16<N)
- C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
-
- C+=16;
- offset_x+=16;
- if(offset_x>=M )
- return;
-
-
- }
- while (++i < 2);
-
+ if (beta != 0) {
+ do
+ {
+ C[0 ] = mad(alpha, rC[i][0], beta*C[0]);
+ if(offset_y+16<N)
+ C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
+ C+=16;
+ offset_x+=16;
+ if(offset_x>=M )
+ return;
+ }
+ while (++i < 2);
+ } else {
+ do
+ {
+ C[0 ] = alpha * rC[i][0];
+ if(offset_y+16<N)
+ C[16*ldc] = alpha * rC[i][1];
+ C+=16;
+ offset_x+=16;
+ if(offset_x>=M )
+ return;
+ }
+ while (++i < 2);
+ }
}
);
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp
index f50b580..f26ddec 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp
@@ -57,41 +57,41 @@ __kernel void sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH (
float rC[2][2] = { {(float)0} };
float rA[1][2];
float rB[1][2];
-
-
+
+
A += offsetA;
B += offsetB;
C+=offsetC;
-
+
__local float lA[528];//16*32+16
__local float lB[528];
-
+
uint gidx = get_group_id(0);
uint gidy = get_group_id(1);
uint idx = get_local_id(0);
uint idy = get_local_id(1);
-
+
int CurrentOffSetA = gidx*32+ idx;
int CurrentOffSetB = gidy*32+ idx;
-
+
A += gidx*32+ idx + idy*lda;
B += gidy*32+ idx + idy*ldb;
-
-
+
+
uint block_k = K >> 4;
- do
+ do
{
__local float* plA = lA + idy*33+idx;
__local float* plB = lB + idy*33+idx;
barrier(CLK_LOCAL_MEM_FENCE);
-
+
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16];
-
+
plA[0] = CurrentOffSetA>=M?0.0:A[0];
plA[16] = CurrentOffSetA+16>=M?0.0:A[16];
-
+
barrier(CLK_LOCAL_MEM_FENCE);
uint offA = idx;
uint offB = idy;
@@ -126,23 +126,35 @@ __kernel void sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH (
return;
C+=offset_x+offset_y*ldc;
-
- int i = 0;
- do
- {
- C[0 ] = mad(alpha, rC[i][0], beta*C[0]);
- if(offset_y+16<N)
- C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
-
- C+=16;
- offset_x+=16;
- if(offset_x>=M )
- return;
-
-
- }
- while (++i < 2);
+ int i = 0;
+ if (beta !=0 ) {
+ do
+ {
+ C[0 ] = mad(alpha, rC[i][0], beta*C[0]);
+ if(offset_y+16<N)
+ C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
+
+ C+=16;
+ offset_x+=16;
+ if(offset_x>=M )
+ return;
+ }
+ while (++i < 2);
+ } else {
+ do
+ {
+ C[0 ] = alpha * rC[i][0];
+ if(offset_y+16<N)
+ C[16*ldc] = alpha * rC[i][1];
+
+ C+=16;
+ offset_x+=16;
+ if(offset_x>=M )
+ return;
+ }
+ while (++i < 2);
+ }
}
);
#endif
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp
index be5b219..a01958f 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp
@@ -133,25 +133,26 @@ __kernel void sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE (
int offset_x = gidx * 64 + idx;
int offset_y = gidy * 64 + idy;
- //if(offset_x>=M || offset_y>=N )
- // return;
-
C += offset_x + offset_y*ldc;
int i = 0;
- do
- {
- C[0] = mad(alpha, rC[i][0], beta*C[0]);
- C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]);
-
-
- C += 16;
- offset_x += 16;
- //if(offset_x>=M )
- // return;
-
-
- } while (++i < 2);
+ if (beta != 0) {
+ do
+ {
+ C[0] = mad(alpha, rC[i][0], beta*C[0]);
+ C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]);
+ C += 16;
+ offset_x += 16;
+ } while (++i < 2);
+ } else {
+ do
+ {
+ C[0] = alpha * rC[i][0];
+ C[16 * ldc] = alpha * rC[i][1];
+ C += 16;
+ offset_x += 16;
+ } while (++i < 2);
+ }
}
);
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp
index 5c41406..ae477cb 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp
@@ -145,8 +145,9 @@ __kernel void sgemm_Col_NT_B1_MX032_NX064_KX16_ROW (
C += offset_x + offset_y*ldc;
int i = 0;
- do
- {
+ if (beta != 0) {
+ do
+ {
C[0] = mad(alpha, rC[i][0], beta*C[0]);
C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]);
C[32 * ldc] = mad(alpha, rC[i][2], beta*C[32 * ldc]);
@@ -155,7 +156,20 @@ __kernel void sgemm_Col_NT_B1_MX032_NX064_KX16_ROW (
offset_x += 16;
//if(offset_x>=M )
// return;
- } while (++i < 2);
+ } while (++i < 2);
+ } else {
+ do
+ {
+ C[0] = alpha * rC[i][0];
+ C[16 * ldc] = alpha * rC[i][1];
+ C[32 * ldc] = alpha * rC[i][2];
+ C[48 * ldc] = alpha * rC[i][3];
+ C += 16;
+ offset_x += 16;
+ //if(offset_x>=M )
+ // return;
+ } while (++i < 2);
+ }
}
);
#endif
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp
index 2c9e9ff..3d39977 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp
@@ -143,15 +143,21 @@ __kernel void sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN (
C += offset_x + offset_y*ldc;
int i = 0;
- do
- {
+ if (beta != 0) {
+ do
+ {
C[0] = mad(alpha, rC[i][0], beta*C[0]);
C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]);
-
C += 16;
-
- } while (++i < 4);
-
+ } while (++i < 4);
+ } else {
+ do
+ {
+ C[0] = alpha * rC[i][0];
+ C[16 * ldc] = alpha * rC[i][1];
+ C += 16;
+ } while (++i < 4);
+ }
}
);
#endif
diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp
index 370ca88..a41a09e 100644
--- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp
+++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp
@@ -57,34 +57,34 @@ __kernel void sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src (
float rC[2][2] = { {(float)0} };
float rA[1][2];
float rB[1][2];
-
-
+
+
A += offsetA;
B += offsetB;
C+=offsetC;
-
+
__local float lA[528];//16*32+16
__local float lB[528];
-
+
uint gidx = get_group_id(0);
uint gidy = get_group_id(1);
uint idx = get_local_id(0);
uint idy = get_local_id(1);
-
+
int CurrentOffSetA = gidx*32+ idy;
int CurrentOffSetB = gidy*32+ idy;
A += (gidx*32+idy)*lda + idx;
B += (gidy*32+idy)*ldb + idx;
-
-
+
+
uint block_k = K >> 4;
- do
+ do
{
__local float* plA = lA + idx*33+idy;
__local float* plB = lB + idx*33+idy;
barrier(CLK_LOCAL_MEM_FENCE);
-
+
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -127,21 +127,35 @@ __kernel void sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src (
C+=offset_x+offset_y*ldc;
- int i = 0;
- do
- {
- C[0 ] = mad(alpha, rC[i][0], beta*C[0]);
- if(offset_y+16<N)
- C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
-
- C+=16;
- offset_x+=16;
- if(offset_x>=M )
- return;
-
+ int i = 0;
+ if (beta != 0) {
+ do
+ {
+ C[0 ] = mad(alpha, rC[i][0], beta*C[0]);
+ if(offset_y+16<N)
+ C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);
+
+ C+=16;
+ offset_x+=16;
+ if(offset_x>=M )
+ return;
+ }
+ while (++i < 2);
+ } else {
+ do
+ {
+ C[0 ] = alpha * rC[i][0];
+ if(offset_y+16<N)
+ C[16*ldc] = alpha * rC[i][1];
+
+ C+=16;
+ offset_x+=16;
+ if(offset_x>=M )
+ return;
+ }
+ while (++i < 2);
}
- while (++i < 2);
}
);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits mailing list