[clblas] 99/125: enable sgemm column major TN case to take advantage of faster sgemm NN kernel by doing transpose separately
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Fri May 29 06:57:26 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clblas.
commit 37aeff54c03d566154a46054a946e785596fd5b8
Author: Timmy <timmy.liu at amd.com>
Date: Thu Oct 30 17:02:46 2014 -0500
enable sgemm column major TN case to take advantage of faster sgemm NN kernel by doing transpose separately
---
src/library/blas/include/blas_funcs.h | 1 +
src/library/blas/xgemm.c | 526 ++++++++++++++++++++++++++++-
src/library/tools/tune/subdim.c | 4 +-
src/tests/correctness/tcase-filter.cpp | 4 +-
src/tests/correctness/test-correctness.cpp | 5 +
5 files changed, 536 insertions(+), 4 deletions(-)
diff --git a/src/library/blas/include/blas_funcs.h b/src/library/blas/include/blas_funcs.h
index 5536f9c..f460eb0 100644
--- a/src/library/blas/include/blas_funcs.h
+++ b/src/library/blas/include/blas_funcs.h
@@ -78,6 +78,7 @@ typedef enum BlasFunctionID {
CLBLAS_iAMAX,
CLBLAS_NRM2,
CLBLAS_ASUM,
+ CLBLAS_TRANSPOSE,
/* ! Must be the last */
BLAS_FUNCTIONS_NUMBER
diff --git a/src/library/blas/xgemm.c b/src/library/blas/xgemm.c
index 895c99d..c5d7209 100644
--- a/src/library/blas/xgemm.c
+++ b/src/library/blas/xgemm.c
@@ -17,6 +17,7 @@
#include <string.h>
#include <clBLAS.h>
+#include <stdlib.h>
#include <devinfo.h>
#include "clblas-internal.h"
@@ -110,6 +111,465 @@ doGemm(
return (clblasStatus)err;
}
+
+static ssize_t
+TransposeKernel(
+ char *buf,
+ size_t buflen,
+ const struct SubproblemDim *subdims,
+ void *extra)
+{
+/*
+ *Transpose kernel generator
+ *a typical kernel for mod4 sizes in both direction looks like below
+
+ "// micro tile size is 4 x 4 \n"
+ "// matrix are of column major \n"
+ "#pragma OPENCL EXTENSION cl_amd_printf : enable \n"
+ "void __kernel \n"
+ "transpose( uint X, \n"
+ "uint Y, \n"
+ "uint ld, \n"
+ "uint offset, \n"
+ "const __global float *restrict mat, \n"
+ "__global float *transposed_mat) \n"
+ "{ \n"
+ "transposed_mat += offset; \n"
+ "mat += offset; \n"
+ "transposed_mat += ( (uint)get_global_id(1) * Y + (uint)get_global_id(0) ) << 2; \n"
+ "mat += ( (uint)get_global_id(0) * ld + (uint)get_global_id(1) ) << 2; \n"
+ "//transpose inside the block \n"
+ "transposed_mat[0] = mat[0]; \n"
+ "transposed_mat[1] = mat[ld]; \n"
+ "transposed_mat[2] = mat[ld*2]; \n"
+ "transposed_mat[3] = mat[ld*3]; \n"
+ "\n"
+ "transposed_mat[Y] = mat[1]; \n"
+ "transposed_mat[Y+1] = mat[1+ld]; \n"
+ "transposed_mat[Y+2] = mat[1+ld*2]; \n"
+ "transposed_mat[Y+3] = mat[1+ld*3]; \n"
+ "\n"
+ "transposed_mat[2*Y] = mat[2]; \n"
+ "transposed_mat[2*Y+1] = mat[2+ld]; \n"
+ "transposed_mat[2*Y+2] = mat[2+ld*2]; \n"
+ "transposed_mat[2*Y+3] = mat[2+ld*3]; \n"
+ "\n"
+ "transposed_mat[3*Y] = mat[3]; \n"
+ "transposed_mat[3*Y+1] = mat[3+ld]; \n"
+ "transposed_mat[3*Y+2] = mat[3+ld*2]; \n"
+ "transposed_mat[3*Y+3] = mat[3+ld*3]; \n"
+ "}";
+*/
+ struct KgenContext *ctx;
+ ssize_t ret = 0;
+ char tmp[2048];
+ int modX = subdims->x;
+ int modY = subdims->y;
+
+ ctx = createKgenContext(buf, buflen, true);
+
+ sprintf(tmp, "// micro tile size is 4 x 4 \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "// matrix are of column major \n");
+ kgenAddStmt(ctx, tmp);
+
+ //kernel declartion
+ sprintf(tmp, "void __kernel \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "transpose( uint X, \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "uint Y, \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "uint ld, \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "uint offset, \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "const __global float *restrict mat, \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "__global float *transposed_mat) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+
+ //kernel body
+ sprintf(tmp, "uint global_id_0 = (uint)get_global_id(0); \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "uint global_id_1 = (uint)get_global_id(1); \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "uint global_size_0 = (uint)get_global_size(0); \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "uint global_size_1 = (uint)get_global_size(1); \n");
+ kgenAddStmt(ctx, tmp);
+
+ sprintf(tmp, "transposed_mat += offset; \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "mat += offset; \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "transposed_mat += ( global_id_1 * Y + global_id_0 ) << 2; \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "mat += ( global_id_0 * ld + global_id_1 ) << 2; \n");
+ kgenAddStmt(ctx, tmp);
+
+ sprintf(tmp, "//transpose inside the block \n");
+ kgenAddStmt(ctx, tmp);
+ //first block
+ sprintf(tmp, "transposed_mat[0] = mat[0]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY == 1)
+ {
+ sprintf(tmp, "if( global_id_0 < global_size_0 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[1] = mat[ld]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY == 1)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ if(modY ==1 || modY == 2 )
+ {
+ sprintf(tmp, "if( global_id_0 < global_size_0 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[2] = mat[ld*2]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY ==1 || modY == 2)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ if(modY ==1 || modY == 2 || modY == 3)
+ {
+ sprintf(tmp, "if( global_id_0 < global_size_0 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[3] = mat[ld*3]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY ==1 || modY == 2 || modY == 3)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "\n");
+ kgenAddStmt(ctx, tmp);
+
+ //second block
+ if(modX == 1)
+ {
+ sprintf(tmp, "if( global_id_1 < global_size_1 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[Y] = mat[1]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY == 1)
+ {
+ sprintf(tmp, "if( global_id_0 < global_size_0 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[Y+1] = mat[1+ld]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY == 1)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ if(modY ==1 || modY == 2)
+ {
+ sprintf(tmp, "if( global_id_0 < global_size_0 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[Y+2] = mat[1+ld*2]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY ==1 || modY == 2)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ if(modY ==1 || modY == 2 || modY == 3)
+ {
+ sprintf(tmp, "if( global_id_0 < global_size_0 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[Y+3] = mat[1+ld*3]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY ==1 || modY == 2 || modY == 3)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "\n");
+ kgenAddStmt(ctx, tmp);
+ if(modX == 1)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+
+ //third block
+ if(modX == 1 || modX == 2)
+ {
+ sprintf(tmp, "if( global_id_1 < global_size_1 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[2*Y] = mat[2]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY == 1)
+ {
+ sprintf(tmp, "if( global_id_0 < global_size_0 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[2*Y+1] = mat[2+ld]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY == 1)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ if(modY ==1 || modY == 2)
+ {
+ sprintf(tmp, "if( global_id_0 < global_size_0 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[2*Y+2] = mat[2+ld*2]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY ==1 || modY == 2)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ if(modY ==1 || modY == 2 || modY == 3)
+ {
+ sprintf(tmp, "if( global_id_0 < global_size_0 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[2*Y+3] = mat[2+ld*3]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY ==1 || modY == 2 || modY == 3)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "\n");
+ kgenAddStmt(ctx, tmp);
+ if(modX == 1 || modX == 2)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+
+ //fourth block
+ if(modX == 1 || modX == 2 || modX == 3)
+ {
+ sprintf(tmp, "if( global_id_1 < global_size_1 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[3*Y] = mat[3]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY == 1)
+ {
+ sprintf(tmp, "if( global_id_0 < global_size_0 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[3*Y+1] = mat[3+ld]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY == 1)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ if(modY ==1 || modY == 2)
+ {
+ sprintf(tmp, "if( global_id_0 < global_size_0 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[3*Y+2] = mat[3+ld*2]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY ==1 || modY == 2)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ if(modY ==1 || modY == 2 || modY == 3)
+ {
+ sprintf(tmp, "if( global_id_0 < global_size_0 - 1 ) \n");
+ kgenAddStmt(ctx, tmp);
+ sprintf(tmp, "{ \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ sprintf(tmp, "transposed_mat[3*Y+3] = mat[3+ld*3]; \n");
+ kgenAddStmt(ctx, tmp);
+ if(modY ==1 || modY == 2 || modY == 3)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+ if(modX == 1 || modX == 2 || modX == 3)
+ {
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+ }
+
+
+ sprintf(tmp, "} \n");
+ kgenAddStmt(ctx, tmp);
+
+ if (!ret) {
+ ret = (ssize_t)kgenSourceSize(ctx) + 1;
+ }
+ destroyKgenContext(ctx);
+ return ret;
+}
+void
+transposeMemObject(
+ clblasOrder order,
+ size_t X,
+ size_t Y,
+ size_t ld,
+ size_t offset,
+ const cl_mem src,
+ cl_mem dst,
+ cl_context context,
+ cl_uint numCommandQueues,
+ cl_command_queue *commandQueues,
+ cl_uint numEventsInWaitList,
+ const cl_event *eventWaitList,
+ cl_event *events)
+{
+/**
+ *transposition of a 2D MemObject
+ * @param[in] order Row/column order.
+ * @param[in] X Number of columns in transposed matrix / rows in input matrix.
+ * @param[in] Y Number of rows in transposed matrix / columns in input matrix.
+ * @param[in] ld Leading dimension of input matrix.
+ * @param[in] offset offset size.
+ * @param[in] src Input matrix of the transposition.
+ * @param[in] dst Output matrix of the transposition.
+ * @param[in] numCommandQueues Number of OpenCL command queues in which the
+ * task is to be performed.
+ * @param[in] commandQueues OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList Event wait list.
+ * @param[in] events Event objects per each command queue that identify
+ * a particular kernel execution instance.
+ */
+
+
+ char *source;
+ cl_int err;
+ cl_kernel kernel;
+ Kernel *transpose_kernel;
+ solver_id_t sid;
+ KernelKey key;
+ CLBLASKernExtra extra;
+ BlasFunctionID funcID = CLBLAS_TRANSPOSE;
+ char *log;
+ const cl_uint workDim = 2;
+ const size_t localWorkSize[2] = { 8, 8 };
+ size_t globalWorkSize[2];
+ ssize_t size;
+
+ sid = makeSolverID(funcID, 1);
+ memset(key.subdims, 0, sizeof(key.subdims));
+ key.nrDims = 2;
+ key.subdims[0].x = X%4;
+ key.subdims[0].y = Y%4;
+ key.subdims[0].bwidth = 1;
+ key.subdims[0].itemX = 4;
+ key.subdims[0].itemY = 4;
+ memset(&extra, 0, sizeof(extra));
+
+ err = getQueueDevice(*commandQueues, &key.device);
+ err = getQueueContext(*commandQueues, &key.context);
+
+ //look for the kernel from cache first
+ if (areKernelsCacheable())
+ {
+ transpose_kernel = findKernel(clblasKernelCache, sid, &key, &extra);
+ }
+
+ // if transpose_kernel was not found from cache, create the kernel
+ if (transpose_kernel == NULL)
+ {
+ transpose_kernel = allocKernel();
+ log = malloc(65536);
+ if (log) {
+ log[0] = '\0';
+ }
+ //kernel source auto generation
+ //call size = TransposeKernel(NULL, 0, ...)
+ //then allocate buffer and call TransposeKernel again
+ size = TransposeKernel(NULL, 0, &key.subdims[0], &extra);
+ source = calloc(1, size);
+ TransposeKernel(source, size, &key.subdims[0], &extra);
+ //printf("transpose source: %s\n", source);
+ transpose_kernel->program = buildClProgram(source, NULL, key.context, key.device,
+ log, 65536, &err);
+ transpose_kernel->extraSize = sizeof(CLBLASKernExtra);
+ transpose_kernel->extra = calloc(1, transpose_kernel->extraSize);// memory freed by clblasTeardown
+ *(CLBLASKernExtra*)(transpose_kernel->extra) = extra;
+
+ //save the kernel in cache
+ getKernel(transpose_kernel);
+ if (addKernelToCache(clblasKernelCache, sid, transpose_kernel, &key,
+ clblasKernelExtraCmp))
+ {
+ putKernel(clblasKernelCache, transpose_kernel);
+ }
+ free(log);
+ free(source);
+ }
+
+ //launch the kernel
+ err = clCreateKernelsInProgram(transpose_kernel->program, 1, &kernel, NULL);
+
+ err = clSetKernelArg(kernel, 0, sizeof(cl_uint), &X);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_uint), &Y);
+ err = clSetKernelArg(kernel, 2, sizeof(cl_uint), &ld);
+ err = clSetKernelArg(kernel, 3, sizeof(cl_uint), &offset);
+ err = clSetKernelArg(kernel, 4, sizeof(cl_mem), &src);
+ err = clSetKernelArg(kernel, 5, sizeof(cl_mem), &dst);
+
+ globalWorkSize[0] = (Y+4-1)/4;
+ globalWorkSize[1] = (X+4-1)/4;
+
+ err = clEnqueueNDRangeKernel(*commandQueues, kernel, workDim, NULL,
+ globalWorkSize, localWorkSize, 0, NULL, NULL);
+ clFinish(*commandQueues);
+
+ clReleaseKernel(kernel);
+
+
+}
+
clblasStatus
clblasSgemm(
clblasOrder order,
@@ -136,15 +596,79 @@ clblasSgemm(
cl_event *events)
{
CLBlasKargs kargs;
+ cl_context context;
+ cl_device_id device;
+ cl_int err;
+ cl_mem transposed_A;
+ clblasStatus status;
+ float *transposed_A_host;
+ size_t device_size;
+ char* device_name;
+ int fast_sgemmtn = 0;
+
memset(&kargs, 0, sizeof(kargs));
kargs.dtype = TYPE_FLOAT;
kargs.alpha.argFloat = alpha;
kargs.beta.argFloat = beta;
- return doGemm(&kargs, order, transA, transB, M, N, K, A, offA, lda,
+ err = clGetCommandQueueInfo( *commandQueues, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL);
+ if( err < 0)
+ return clblasInvalidCommandQueue;
+
+ err = clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &device_size);
+ device_name = (char*)malloc(device_size * sizeof(char));
+ err = clGetDeviceInfo(device, CL_DEVICE_NAME, device_size, device_name, NULL);
+ if( err < 0)
+ return clblasInvalidDevice;
+
+ if(getenv("CLBLAS_FAST_SGEMM_TN") != NULL)
+ fast_sgemmtn = *getenv("CLBLAS_FAST_SGEMM_TN") - '0';
+
+
+ //if the env CLBLAS_FAST_SGEMMTN is set to 1
+ //and if transA = T, transB = N and order = clblasColumnMajor
+ //and if the devices are Spectre, Hawaii or Tahiti
+ //do the transpose first
+ //and then call the NN sgemm is a fater apporach.
+ //the cost of this approach is the use of an extra cl_mem object
+ if( ( fast_sgemmtn == 1 ) && ( strcmp(device_name, "Spectre") || strcmp(device_name, "Hawaii") || strcmp(device_name, "Tahiti") ) && (transA == clblasTrans && transB == clblasNoTrans && order == clblasColumnMajor) )
+ {
+ //do the transpose on A
+ //only transpose the leading part of the matrix
+ //update ldb and transB would be necessary
+ free(device_name);
+ err = clGetCommandQueueInfo( *commandQueues, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL);
+
+ transposed_A = clCreateBuffer(context, CL_MEM_READ_WRITE, (M * K + offA) * sizeof(float), NULL, &err);
+ if( err < 0 )
+ return clblasOutOfResources;
+
+ transposeMemObject(order, K, M, lda, offA, A, transposed_A, context, numCommandQueues, commandQueues,
+ numEventsInWaitList, eventWaitList, events);
+
+ //transA should be reset to clblasNoTrans
+ transA = clblasNoTrans;
+ //update lda to the minimal size
+ lda = M;
+ //now call doGemm with transposed A, updated lda and updated transA
+ status = doGemm(&kargs, order, transA, transB, M, N, K, transposed_A, offA, lda,
+ B, offB, ldb, C, offC, ldc, numCommandQueues, commandQueues,
+ numEventsInWaitList, eventWaitList, events);
+
+ clReleaseMemObject( transposed_A );
+
+ return status;
+ }
+ else
+ {
+ free(device_name);
+ return doGemm(&kargs, order, transA, transB, M, N, K, A, offA, lda,
B, offB, ldb, C, offC, ldc, numCommandQueues, commandQueues,
numEventsInWaitList, eventWaitList, events);
+ }
+
+
}
clblasStatus
diff --git a/src/library/tools/tune/subdim.c b/src/library/tools/tune/subdim.c
index 6510593..12b3f84 100644
--- a/src/library/tools/tune/subdim.c
+++ b/src/library/tools/tune/subdim.c
@@ -213,8 +213,8 @@ initVector(SubDimInfo* sd)
setVariable(sd, V_L1_BW, 6, &dim[0]);
}
else {
- setVariable(sd, V_L0_X, 4, &dim[5]);
- setVariable(sd, V_L0_Y, 4, &dim[5]);
+ setVariable(sd, V_L0_X, 4, &dim[4]);
+ setVariable(sd, V_L0_Y, 4, &dim[4]);
setVariable(sd, V_L0_BW, 6, &dim[0]);
setVariable(sd, V_L1_X, 6, &dim[0]);
setVariable(sd, V_L1_Y, 6, &dim[0]);
diff --git a/src/tests/correctness/tcase-filter.cpp b/src/tests/correctness/tcase-filter.cpp
index a389ffd..35a892b 100644
--- a/src/tests/correctness/tcase-filter.cpp
+++ b/src/tests/correctness/tcase-filter.cpp
@@ -99,6 +99,7 @@ canCaseBeSkipped(const TestParams *params, bool isComplex)
/*
* filter BigLDA cases
*/
+ /*
s = nonZeroSize(params->lda, params->ldb, params->ldc);
lda = selectSize(params->lda, s);
ldb = selectSize(params->ldb, s);
@@ -106,10 +107,11 @@ canCaseBeSkipped(const TestParams *params, bool isComplex)
if (sizeEquCount(lda, ldb, ldc) < 3) {
return true;
}
+
if (!isEquToAny(lda, m, n, k)) {
return true;
}
-
+ */
return false;
}
diff --git a/src/tests/correctness/test-correctness.cpp b/src/tests/correctness/test-correctness.cpp
index 7a1a084..75da51a 100644
--- a/src/tests/correctness/test-correctness.cpp
+++ b/src/tests/correctness/test-correctness.cpp
@@ -610,6 +610,11 @@ INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, GEMM, Combine(
ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA_OffSet, GEMM, Combine(
+ Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet),
+ ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+ Values(clMath::ExtraTestSizes(500, 501, 502, 1, 0, 0)), Values(1)));
+
// Cases for extended versions with offsets
#if defined(SHORT_TESTS) || defined(MEDIUM_TESTS)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits
mailing list