[clblas] 19/61: some static kernel code clean up
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Fri Jul 24 22:49:44 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master in repository clblas.
commit 8ef0a432fb2fdf50edd1e36e33140f5ebe05cc31
Author: Timmy <timmy.liu at amd.com>
Date: Wed Apr 29 10:26:02 2015 -0500
some static kernel code clean up
---
.../gens/clTemplates/sgemm_hawaiiSplitKernel.cl | 42 ++++++++++++++++------
1 file changed, 31 insertions(+), 11 deletions(-)
diff --git a/src/library/blas/gens/clTemplates/sgemm_hawaiiSplitKernel.cl b/src/library/blas/gens/clTemplates/sgemm_hawaiiSplitKernel.cl
index 47884c5..8eb6749 100644
--- a/src/library/blas/gens/clTemplates/sgemm_hawaiiSplitKernel.cl
+++ b/src/library/blas/gens/clTemplates/sgemm_hawaiiSplitKernel.cl
@@ -238,6 +238,7 @@ __kernel void sgemm_NT_1_96_16_16x16_6x6__ALPHABETA_SPLIT_ROW( __global float co
{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idy*97+idx;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = B[0+0*ldb];
plB[16] = B[16+0*ldb];
@@ -355,6 +356,7 @@ __kernel void sgemm_NT_96_1_16_16x16_6x6__ALPHABETA_SPLIT_COLUMN( __global float
{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idy*97+idx;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16];
@@ -472,6 +474,7 @@ __kernel void sgemm_NT_1_1_16_16x16_6x6__ALPHABETA_SPLIT_SINGLE( __global float
{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idy*97+idx;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16];
@@ -602,7 +605,7 @@ static const char * sgemm_NT_16_SPLIT__ALPHA = "
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
- barrier(CLK_LOCAL_MEM_FENCE);
+ mem_fence(CLK_LOCAL_MEM_FENCE);
__attribute__((reqd_work_group_size(16,16,1)))
__kernel void sgemm_NT_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float const * restrict A,
@@ -648,7 +651,7 @@ __kernel void sgemm_NT_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float cons
//{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idy*97+idx;
- // barrier(CLK_LOCAL_MEM_FENCE);
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = B[0+0*ldb];
plB[16] = B[16+0*ldb];
plB[32] = B[32+0*ldb];
@@ -787,6 +790,7 @@ __kernel void sgemm_NT_1_96_16_16x16_6x6__ALPHA_SPLIT_ROW( __global float const
{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idy*97+idx;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = B[0+0*ldb];
plB[16] = B[16+0*ldb];
@@ -903,6 +907,7 @@ __kernel void sgemm_NT_96_1_16_16x16_6x6__ALPHA_SPLIT_COLUMN( __global float con
{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idy*97+idx;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16];
@@ -1020,6 +1025,7 @@ __kernel void sgemm_NT_1_1_16_16x16_6x6__ALPHA_SPLIT_SINGLE( __global float cons
{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idy*97+idx;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16];
@@ -1830,7 +1836,7 @@ static const char * sgemm_NT_1_SPLIT__ALPHA = "
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
- barrier(CLK_LOCAL_MEM_FENCE);
+ mem_fence(CLK_LOCAL_MEM_FENCE);
__attribute__((reqd_work_group_size(16,16,1)))
__kernel void sgemm_NT_96_96_1_16x16_6x6__ALPHA_SPLIT_MAIN( __global float const * restrict A,
@@ -2727,6 +2733,7 @@ __kernel void sgemm_NN_1_96_16_16x16_6x6__ALPHABETA_SPLIT_ROW( __global float co
{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idx*97+idy;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = B[0];
plB[16] = B[16*ldb];
@@ -2844,6 +2851,7 @@ __kernel void sgemm_NN_96_1_16_16x16_6x6__ALPHABETA_SPLIT_COLUMN( __global float
{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idx*97+idy;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -2962,6 +2970,7 @@ __kernel void sgemm_NN_1_1_16_16x16_6x6__ALPHABETA_SPLIT_SINGLE( __global float
{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idx*97+idy;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -3095,7 +3104,7 @@ static const char * sgemm_NN_16_SPLIT__ALPHA = "
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
- barrier(CLK_LOCAL_MEM_FENCE);
+ mem_fence(CLK_LOCAL_MEM_FENCE);
__attribute__((reqd_work_group_size(16,16,1)))
__kernel void sgemm_NN_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float const * restrict A,
@@ -3141,7 +3150,7 @@ __kernel void sgemm_NN_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float cons
//{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idx*97+idy;
- // barrier(CLK_LOCAL_MEM_FENCE);
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = B[0];
plB[16] = B[16*ldb];
plB[32] = B[32*ldb];
@@ -3280,7 +3289,8 @@ __kernel void sgemm_NN_1_96_16_16x16_6x6__ALPHA_SPLIT_ROW( __global float const
{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idx*97+idy;
-
+ barrier(CLK_LOCAL_MEM_FENCE);
+
plB[0] = B[0];
plB[16] = B[16*ldb];
plB[32] = B[32*ldb];
@@ -3396,6 +3406,7 @@ __kernel void sgemm_NN_96_1_16_16x16_6x6__ALPHA_SPLIT_COLUMN( __global float con
{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idx*97+idy;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -3513,7 +3524,8 @@ __kernel void sgemm_NN_1_1_16_16x16_6x6__ALPHA_SPLIT_SINGLE( __global float cons
{
__local float* plA = lA + idy*97+idx;
__local float* plB = lB + idx*97+idy;
-
+ barrier(CLK_LOCAL_MEM_FENCE);
+
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
plB[32] = CurrentOffSetB+32>=N?0.0:B[32*ldb];
@@ -3667,7 +3679,7 @@ static const char * sgemm_NN_1_SPLIT__ALPHABETA = "
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
- barrier(CLK_LOCAL_MEM_FENCE);
+ mem_fence(CLK_LOCAL_MEM_FENCE);
__attribute__((reqd_work_group_size(16,16,1)))
__kernel void sgemm_NN_96_96_1_16x16_6x6__ALPHABETA_SPLIT_MAIN( __global float const * restrict A,
@@ -4400,7 +4412,7 @@ static const char * sgemm_NN_1_SPLIT__ALPHA = "
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
- barrier(CLK_LOCAL_MEM_FENCE);
+ mem_fence(CLK_LOCAL_MEM_FENCE);
__attribute__((reqd_work_group_size(16,16,1)))
__kernel void sgemm_NN_96_96_1_16x16_6x6__ALPHA_SPLIT_MAIN( __global float const * restrict A,
@@ -5122,7 +5134,7 @@ static const char * sgemm_TN_16_SPLIT__ALPHABETA = "
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
- barrier(CLK_LOCAL_MEM_FENCE);
+ mem_fence(CLK_LOCAL_MEM_FENCE);
__attribute__((reqd_work_group_size(16,16,1)))
__kernel void sgemm_TN_96_96_16_16x16_6x6__ALPHABETA_SPLIT_MAIN( __global float const * restrict A,
@@ -5167,6 +5179,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
{
__local float* plA = lA + idx*97+idy;
__local float* plB = lB + idx*97+idy;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = B[0];
plB[16] = B[16*ldb];
@@ -5302,6 +5315,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
{
__local float* plA = lA + idx*97+idy;
__local float* plB = lB + idx*97+idy;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = B[0];
plB[16] = B[16*ldb];
@@ -5419,6 +5433,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
{
__local float* plA = lA + idx*97+idy;
__local float* plB = lB + idx*97+idy;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -5537,6 +5552,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
{
__local float* plA = lA + idx*97+idy;
__local float* plB = lB + idx*97+idy;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -5668,7 +5684,7 @@ static const char * sgemm_TN_16_SPLIT__ALPHA = "
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
- barrier(CLK_LOCAL_MEM_FENCE);
+ mem_fence(CLK_LOCAL_MEM_FENCE);
__attribute__((reqd_work_group_size(16,16,1)))
__kernel void sgemm_TN_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float const * restrict A,
@@ -5712,6 +5728,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
{
__local float* plA = lA + idx*97+idy;
__local float* plB = lB + idx*97+idy;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = B[0];
plB[16] = B[16*ldb];
@@ -5846,6 +5863,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
{
__local float* plA = lA + idx*97+idy;
__local float* plB = lB + idx*97+idy;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = B[0];
plB[16] = B[16*ldb];
@@ -5962,6 +5980,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
{
__local float* plA = lA + idx*97+idy;
__local float* plB = lB + idx*97+idy;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -6079,6 +6098,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
{
__local float* plA = lA + idx*97+idy;
__local float* plB = lB + idx*97+idy;
+ barrier(CLK_LOCAL_MEM_FENCE);
plB[0] = CurrentOffSetB>=N?0.0:B[0];
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits mailing list