[clblas] 32/54: Add caching mechanism based on context and device for gemm and trsm
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Jan 14 20:07:36 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository clblas.
commit 843bff5d0f7569f287640dcf0de4924ede2fe634
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date: Tue Dec 15 16:00:55 2015 -0500
Add caching mechanism based on context and device for gemm and trsm
---
src/library/blas/xgemm.cc | 142 ++++++++------
src/library/blas/xtrsm.cc | 464 +++++++++++++++++++++++++---------------------
2 files changed, 336 insertions(+), 270 deletions(-)
diff --git a/src/library/blas/xgemm.cc b/src/library/blas/xgemm.cc
index 5acdeeb..6ea99bb 100644
--- a/src/library/blas/xgemm.cc
+++ b/src/library/blas/xgemm.cc
@@ -14,6 +14,9 @@
* limitations under the License.
* ************************************************************************/
+#include <map>
+#include <string>
+#include <sstream>
#include <stdio.h>
#include <string.h>
#include <clBLAS.h>
@@ -86,11 +89,35 @@ bool isZero<DoubleComplex>( DoubleComplex value ) {
return CREAL(value) == 0 && CIMAG(value) == 0;
};
+static char *getKernelName(cl_kernel clKernel)
+{
+ cl_int err;
+ // get kernel name
+ size_t kernelNameLength;
+ err = clGetKernelInfo(
+ clKernel,
+ CL_KERNEL_FUNCTION_NAME,
+ sizeof(kernelNameLength),
+ NULL,
+ &kernelNameLength);
+ CL_CHECK(err)
+ char *kernelName = new char[kernelNameLength];
+ err = clGetKernelInfo(
+ clKernel,
+ CL_KERNEL_FUNCTION_NAME,
+ kernelNameLength*sizeof(char),
+ kernelName,
+ NULL );
+ CL_CHECK(err)
+
+ return kernelName;
+}
/******************************************************************************
* Make Gemm Kernel
*****************************************************************************/
+//FIXME: This function should be returning an error.
void makeGemmKernel(
cl_kernel *clKernel,
cl_command_queue clQueue,
@@ -100,39 +127,47 @@ void makeGemmKernel(
size_t *kernelBinarySize,
const char *binaryBuildOptions)
{
+ //TODO: This will need to be converted to thread local when making clBLAS thread safe
+ typedef std::map<std::string, cl_kernel> kernel_map_t;
+ static kernel_map_t kernel_map;
+
+ cl_context clContext;
+ cl_device_id clDevice;
cl_int err;
+
+ err = clGetCommandQueueInfo( clQueue, CL_QUEUE_CONTEXT, sizeof(clContext), &clContext, NULL);
+ CL_CHECK(err)
+ err = clGetCommandQueueInfo( clQueue, CL_QUEUE_DEVICE, sizeof(clDevice), &clDevice, NULL);
+ CL_CHECK(err)
+
+ std::stringstream ss;
+ ss << clDevice << "_" << clContext;
+ std::string prefix = ss.str();
+
if (*clKernel) {
+ char *kernelName = getKernelName(*clKernel);
// kernel has already been built, return
#ifdef AUTOGEMM_PRINT_DEBUG
- // get kernel name
- size_t kernelNameLength;
- err = clGetKernelInfo(
- *clKernel,
- CL_KERNEL_FUNCTION_NAME,
- sizeof(kernelNameLength),
- NULL,
- &kernelNameLength );
- CL_CHECK(err)
- char *kernelName = new char[kernelNameLength];
- err = clGetKernelInfo(
- *clKernel,
- CL_KERNEL_FUNCTION_NAME,
- kernelNameLength*sizeof(char),
- kernelName,
- NULL );
- CL_CHECK(err)
printf("makeGemmKernel: \"%s\" already built; returning.\n", kernelName);
- delete[] kernelName;
#endif
- return;
- } else {
+
+ // Check if kernel exists for this device
+ std::string key = prefix + "_" + kernelName;
+ kernel_map_t::iterator idx = kernel_map.find(key);
+
+
+ // If kernel not found for this device, set to NULL
+ if (idx == kernel_map.end()) {
+ *clKernel = NULL;
+ } else {
+ *clKernel = idx->second;
+ }
+
+ delete[] kernelName;
+ }
+
+ if (!*clKernel) {
// kernel has not been built, so build it (from binary, preferably)
- cl_context clContext;
- cl_device_id clDevice;
- err = clGetCommandQueueInfo( clQueue, CL_QUEUE_CONTEXT, sizeof(clContext), &clContext, NULL);
- CL_CHECK(err)
- err = clGetCommandQueueInfo( clQueue, CL_QUEUE_DEVICE, sizeof(clDevice), &clDevice, NULL);
- CL_CHECK(err)
cl_program clProgram;
cl_int clBinaryStatus;
if (*kernelBinary) {
@@ -151,6 +186,9 @@ void makeGemmKernel(
binaryBuildOptions, NULL, NULL );
CL_CHECK(err)
} else {
+#ifdef AUTOGEMM_PRINT_DEBUG
+ printf("makeGemmKernel: Creating program from source\n", *kernelBinarySize);
+#endif
clProgram = clCreateProgramWithSource(
clContext,
1, &kernelSource,
@@ -178,6 +216,7 @@ void makeGemmKernel(
printf("%s\n", buildLog);
//printf("\n\nKernel String:\n\n");
//printf("%s\n", kernelSource);
+ //FIXME: The function should be exiting at this point
}
err = clCreateKernelsInProgram(
@@ -187,32 +226,21 @@ void makeGemmKernel(
CL_CHECK(err)
err = clReleaseProgram(clProgram);
CL_CHECK(err)
-
+
+ char *kernelName = getKernelName(*clKernel);
+
#ifdef AUTOGEMM_PRINT_DEBUG
- // get kernel name
- size_t kernelNameLength;
- err = clGetKernelInfo(
- *clKernel,
- CL_KERNEL_FUNCTION_NAME,
- sizeof(kernelNameLength),
- NULL,
- &kernelNameLength );
- CL_CHECK(err)
- char *kernelName = new char[kernelNameLength];
- err = clGetKernelInfo(
- *clKernel,
- CL_KERNEL_FUNCTION_NAME,
- kernelNameLength*sizeof(char),
- kernelName,
- NULL );
- CL_CHECK(err)
printf("makeGemmKernel: \"%s\" now built; returning.\n", kernelName);
- delete[] kernelName;
#endif
+
+ std::string key = prefix + "_" + kernelName;
+ kernel_map[key] = *clKernel;
+ delete[] kernelName;
}
+
+ return;
}
-
/******************************************************************************
* Enqueue Gemm Kernel
*****************************************************************************/
@@ -266,7 +294,7 @@ template<> clblasTranspose correctTranspose<DoubleComplex>( clblasTranspose tran
* templated Gemm
*****************************************************************************/
template<typename Precision>
-clblasStatus
+clblasStatus
clblasGemm(
clblasOrder order,
clblasTranspose transA,
@@ -308,7 +336,7 @@ clblasGemm(
M, N, offA, offB, lda, ldb, A, B );
-
+
/******************************************************************************
* Handle Special Cases
*
@@ -318,7 +346,7 @@ clblasGemm(
* and are mod32 but not mod96 or mod64
*
*****************************************************************************/
-
+
bool specialCaseHandled = false;
clblasStatus SpecialCaseStatus = GemmSpecialCases<Precision>(order,
@@ -339,8 +367,8 @@ clblasGemm(
if (specialCaseHandled)
return SpecialCaseStatus;
-
-
+
+
/******************************************************************************
* Optimal num elements per thread
*****************************************************************************/
@@ -512,7 +540,7 @@ clblasGemm(
gemmKernelArgs[11] = &offA; gemmKernelArgSizes[11] = sizeof(cl_uint);
gemmKernelArgs[12] = &offB; gemmKernelArgSizes[12] = sizeof(cl_uint);
gemmKernelArgs[13] = &offC; gemmKernelArgSizes[13] = sizeof(cl_uint);
-
+
/******************************************************************************
* Enqueue Tile kernel
@@ -577,8 +605,8 @@ clblasGemm(
/******************************************************************************
* SGEMM API call
*****************************************************************************/
-extern "C"
-clblasStatus
+extern "C"
+clblasStatus
clblasSgemm(
clblasOrder order,
clblasTranspose transA,
@@ -615,7 +643,7 @@ clblasSgemm(
/******************************************************************************
* DGEMM API call
*****************************************************************************/
-extern "C"
+extern "C"
clblasStatus
clblasDgemm( clblasOrder order,
clblasTranspose transA,
@@ -652,7 +680,7 @@ clblasDgemm( clblasOrder order,
/******************************************************************************
* CGEMM API call
*****************************************************************************/
-extern "C"
+extern "C"
clblasStatus
clblasCgemm(
clblasOrder order,
@@ -690,7 +718,7 @@ clblasCgemm(
/******************************************************************************
* ZGEMM API
*****************************************************************************/
-extern "C"
+extern "C"
clblasStatus
clblasZgemm(
clblasOrder order,
diff --git a/src/library/blas/xtrsm.cc b/src/library/blas/xtrsm.cc
index 167b41c..505ed12 100644
--- a/src/library/blas/xtrsm.cc
+++ b/src/library/blas/xtrsm.cc
@@ -14,6 +14,9 @@
* limitations under the License.
* ************************************************************************/
+#include <map>
+#include <string>
+#include <sstream>
#include <stdlib.h>
#include <string.h>
#include <clBLAS.h>
@@ -34,7 +37,7 @@
//
// The idea is basically that
// B = A*X
-// can be computed as
+// can be computed as
// B' = (A*X)'
// = X'*A'
// And since changing the order is basically a transpose on each matrix,
@@ -42,7 +45,7 @@
// B = X*A (so only the side, the uplo must be changed and the M and N sizes swapped)
//
// When enabled, only the ColumnMajor kernels need to be implemented
-// for all TRSM
+// for all TRSM
//
#define FORCE_COLUMN_MAJOR 1
@@ -64,35 +67,35 @@ static void force_trsm_column_major(Args & args)
//
// This file provide the public clBLAS API for
//
-// clblasStrsm()
-// clblasDtrsm()
-// clblasCtrsm()
-// clblasZtrsm()
+// clblasStrsm()
+// clblasDtrsm()
+// clblasCtrsm()
+// clblasZtrsm()
//
-// using functors
-//
-// Potential optimizations:
+// using functors
+//
+// Potential optimizations:
////
-// - Get rid of the 'order' argument assuming that
+// - Get rid of the 'order' argument assuming that
// row-major is equivalent to the transpose of column-major.
-// That is
+// That is
//
// B = A*X
//
-// is equivalent to
+// is equivalent to
//
// B' = X'*A'
//
-// and, when considering the opposite order, is equivalent to
+// and, when considering the opposite order, is equivalent to
//
// B = X*A (with A swap between upper and lower)
//
-// By applying that transformation early, the functors implementing
-// the TRSMs only have to consider one of the two cases.
+// By applying that transformation early, the functors implementing
+// the TRSMs only have to consider one of the two cases.
//
//
-// Common part of all XTRSM implementations using the old Solver infrastructure
+// Common part of all XTRSM implementations using the old Solver infrastructure
//
#define CL_CHECK(RET) \
@@ -103,118 +106,153 @@ static void force_trsm_column_major(Args & args)
#define min(x,y) ((x)<(y)?(x):(y))
-static void makeKernel(
- cl_kernel *clKernel,
- cl_command_queue clQueue,
- const char *kernelSource,
- const char *sourceBuildOptions,
- const unsigned char **kernelBinary,
- size_t *kernelBinarySize,
- const char *binaryBuildOptions)
+static char *getKernelName(cl_kernel clKernel)
{
- cl_int err;
- if (*clKernel) {
- // kernel has already been built, return
+ cl_int err;
+ // get kernel name
+ size_t kernelNameLength;
+ err = clGetKernelInfo(
+ clKernel,
+ CL_KERNEL_FUNCTION_NAME,
+ sizeof(kernelNameLength),
+ NULL,
+ &kernelNameLength);
+ CL_CHECK(err)
+
+ char *kernelName = new char[kernelNameLength];
+ err = clGetKernelInfo(
+ clKernel,
+ CL_KERNEL_FUNCTION_NAME,
+ kernelNameLength*sizeof(char),
+ kernelName,
+ NULL );
+ CL_CHECK(err)
+
+ return kernelName;
+}
+
+//FIXME: This function should be returning an error.
+void makeKernel(
+ cl_kernel *clKernel,
+ cl_command_queue clQueue,
+ const char *kernelSource,
+ const char *sourceBuildOptions,
+ const unsigned char **kernelBinary,
+ size_t *kernelBinarySize,
+ const char *binaryBuildOptions)
+{
+ //TODO: This will need to be converted to thread local when making clBLAS thread safe
+ typedef std::map<std::string, cl_kernel> kernel_map_t;
+ static kernel_map_t kernel_map;
+
+ cl_context clContext;
+ cl_device_id clDevice;
+ cl_int err;
+
+ err = clGetCommandQueueInfo( clQueue, CL_QUEUE_CONTEXT, sizeof(clContext), &clContext, NULL);
+ CL_CHECK(err)
+ err = clGetCommandQueueInfo( clQueue, CL_QUEUE_DEVICE, sizeof(clDevice), &clDevice, NULL);
+ CL_CHECK(err)
+
+ std::stringstream ss;
+ ss << clDevice << "_" << clContext;
+ std::string prefix = ss.str();
+
+ if (*clKernel) {
+ char *kernelName = getKernelName(*clKernel);
+ // kernel has already been built, return
#ifdef AUTOGEMM_PRINT_DEBUG
- // get kernel name
- size_t kernelNameLength;
- err = clGetKernelInfo(
- *clKernel,
- CL_KERNEL_FUNCTION_NAME,
- sizeof(kernelNameLength),
- NULL,
- &kernelNameLength);
- CL_CHECK(err)
- char *kernelName = new char[kernelNameLength];
- err = clGetKernelInfo(
- *clKernel,
- CL_KERNEL_FUNCTION_NAME,
- kernelNameLength*sizeof(char),
- kernelName,
- NULL);
- CL_CHECK(err)
- printf("makeGemmKernel: \"%s\" already built; returning.\n", kernelName);
- delete[] kernelName;
+ printf("makeKernel: \"%s\" already built; returning.\n", kernelName);
#endif
- return;
- }
- else {
- // kernel has not been built, so build it (from binary, preferably)
- cl_context clContext;
- cl_device_id clDevice;
- err = clGetCommandQueueInfo(clQueue, CL_QUEUE_CONTEXT, sizeof(clContext), &clContext, NULL);
- CL_CHECK(err)
- err = clGetCommandQueueInfo(clQueue, CL_QUEUE_DEVICE, sizeof(clDevice), &clDevice, NULL);
- CL_CHECK(err)
- cl_program clProgram;
- cl_int clBinaryStatus;
- if (*kernelBinary) {
+
+ // Check if kernel exists for this device
+ std::string key = prefix + "_" + kernelName;
+ kernel_map_t::iterator idx = kernel_map.find(key);
+
+ // If kernel not found for this device, set to NULL
+ if (idx == kernel_map.end()) {
+ *clKernel = NULL;
+ } else {
+ *clKernel = idx->second;
+ }
+
+ delete[] kernelName;
+ }
+
+ if (!*clKernel) {
+ // kernel has not been built, so build it (from binary, preferably)
+ cl_program clProgram;
+ cl_int clBinaryStatus;
+ if (*kernelBinary) {
#ifdef AUTOGEMM_PRINT_DEBUG
- printf("makeGemmKernel: pre-compiled binary found: %llu bytes\n", *kernelBinarySize);
+ printf("makeKernel: pre-compiled binary found: %llu bytes\n", *kernelBinarySize);
#endif
- clProgram = clCreateProgramWithBinary(
- clContext,
- 1, &clDevice,
- kernelBinarySize, kernelBinary,
- &clBinaryStatus, &err);
- CL_CHECK(err)
- err = clBuildProgram(
- clProgram,
- 1, &clDevice,
- binaryBuildOptions, NULL, NULL);
- CL_CHECK(err)
- }
- else {
- clProgram = clCreateProgramWithSource(
- clContext,
- 1, &kernelSource,
- NULL, &err);
- CL_CHECK(err)
- err = clBuildProgram(
- clProgram,
- 1, &clDevice,
- sourceBuildOptions, NULL, NULL);
- if (err != CL_SUCCESS) {
- size_t logSize = 0;
- char* log;
- clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
- log = (char*)calloc(1, logSize + 1);
- clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
- printf("=== Build log ===\n%s\n", log);
- free(log);
- clReleaseProgram(clProgram);
- }
- CL_CHECK(err)
- }
- err = clCreateKernelsInProgram(
- clProgram,
- 1, clKernel,
- NULL);
- CL_CHECK(err)
- err = clReleaseProgram(clProgram);
- CL_CHECK(err)
+ clProgram = clCreateProgramWithBinary(
+ clContext,
+ 1, &clDevice,
+ kernelBinarySize, kernelBinary,
+ &clBinaryStatus, &err );
+ CL_CHECK(err)
+ err = clBuildProgram(
+ clProgram,
+ 1, &clDevice,
+ binaryBuildOptions, NULL, NULL );
+ CL_CHECK(err)
+ } else {
#ifdef AUTOGEMM_PRINT_DEBUG
- // get kernel name
- size_t kernelNameLength;
- err = clGetKernelInfo(
- *clKernel,
- CL_KERNEL_FUNCTION_NAME,
- sizeof(kernelNameLength),
- NULL,
- &kernelNameLength);
- CL_CHECK(err)
- char *kernelName = new char[kernelNameLength];
- err = clGetKernelInfo(
- *clKernel,
- CL_KERNEL_FUNCTION_NAME,
- kernelNameLength*sizeof(char),
- kernelName,
- NULL);
- CL_CHECK(err)
- printf("makeGemmKernel: \"%s\" now built; returning.\n", kernelName);
- delete[] kernelName;
+ printf("makeKernel: Creating program from source\n", *kernelBinarySize);
#endif
- }
+ clProgram = clCreateProgramWithSource(
+ clContext,
+ 1, &kernelSource,
+ NULL, &err );
+ CL_CHECK(err)
+ err = clBuildProgram(
+ clProgram,
+ 1, &clDevice,
+ sourceBuildOptions, NULL, NULL );
+ CL_CHECK(err)
+ }
+
+ // print build failure
+ if (err != CL_SUCCESS) {
+ printf("clBuildProgram Failed\n");
+ printf("err = %d\n", err);
+
+ size_t len = 0;
+ clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
+ char* buildLog = new char[len];
+
+ printf("Error: Failed to build program executable!\n");
+ clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, len*sizeof(char), buildLog, 0);
+ printf("\nBuild Log:\n\n");
+ printf("%s\n", buildLog);
+ //printf("\n\nKernel String:\n\n");
+ //printf("%s\n", kernelSource);
+ //FIXME: The function should be exiting at this point
+ }
+
+ err = clCreateKernelsInProgram(
+ clProgram,
+ 1, clKernel,
+ NULL );
+ CL_CHECK(err)
+ err = clReleaseProgram(clProgram);
+ CL_CHECK(err)
+
+ char *kernelName = getKernelName(*clKernel);
+
+#ifdef AUTOGEMM_PRINT_DEBUG
+ printf("makeKernel: \"%s\" now built; returning.\n", kernelName);
+#endif
+
+ std::string key = prefix + "_" + kernelName;
+ kernel_map[key] = *clKernel;
+
+ delete[] kernelName;
+ }
+
+ return;
}
static cl_int clearBuffer(cl_command_queue queue,
@@ -224,13 +262,13 @@ static cl_int clearBuffer(cl_command_queue queue,
cl_int err = 0;
cl_event event;
- // Hummm clEnqueueFillBuffer is OpenCL 1.2 !!!
+ // Hummm clEnqueueFillBuffer is OpenCL 1.2 !!!
double zero = 0.0;
err = clEnqueueFillBuffer(queue,
buffer,
&zero,
sizeof(double),
- 0, // offset
+ 0, // offset
buffer_size,
0,
NULL,
@@ -240,7 +278,7 @@ static cl_int clearBuffer(cl_command_queue queue,
return err;
}
-// Compute the number of blocks of the specified 'size' to fully cover 'n'
+// Compute the number of blocks of the specified 'size' to fully cover 'n'
// Simply speaking, this is n/size rounded up.
#define BLOCKS(n,size) ( ( (n) / size ) + ( (n) % (size) != 0 ) )
@@ -369,10 +407,10 @@ cl_int diag_dtrtri192(
//cl_kernel diag_dtrtri_kernel_upper = clCreateKernel(prg, "DIAG_DTRTRI_KERNEL_UPPER", &err);
makeKernel(diag_dtrtri_kernel_upper_ClKernel,
- queue,
- diag_dtrtri_kernel_upper_KernelSource,
+ queue,
+ diag_dtrtri_kernel_upper_KernelSource,
TrtriBuildOptions,
- &diag_dtrtri_kernel_upper_KernelBinary,
+ &diag_dtrtri_kernel_upper_KernelBinary,
&diag_dtrtri_kernel_upper_KernelBinarySize,
TrtribinBuildOptions);
@@ -409,7 +447,7 @@ cl_int diag_dtrtri192(
}
// update the inverse up to the size of BLOCK_SIZE
-
+
for (int i = inner_block_size; i < outer_block_size; i *= 2) {
switch (i) {
@@ -425,7 +463,7 @@ cl_int diag_dtrtri192(
A, offA, d_dinvA, i, lda, M, event);
CL_CHECK(err);
break;
-
+
case 24:
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
@@ -494,7 +532,7 @@ cl_int diag_dtrtri192(
A, offA, d_dinvA, i, lda, M, event);
CL_CHECK(err);
break;
-
+
default:
break;
@@ -503,7 +541,7 @@ cl_int diag_dtrtri192(
if (i * 2 >= M) break;
}
-
+
}
return err;
@@ -537,7 +575,7 @@ static clblasStatus gpu_dtrsm192(
if ((M % 192 == 0) && (N % 192 == 0))
{
- //TODO: the implementation of sub block being 192 only supports
+ //TODO: the implementation of sub block being 192 only supports
//side == right
//uplo == upper
//trans == notrans
@@ -566,7 +604,7 @@ static clblasStatus gpu_dtrsm192(
cl_mem InvA = 0;
cl_mem X = 0;
- // X of size mxn will contain the result
+ // X of size mxn will contain the result
size_t ldX = M;
size_t offX = 0; //must be 0: needed by the _(X,i,j) macro
size_t size_X = N*ldX * sizeof(double);
@@ -725,11 +763,11 @@ cl_int call_kernel_triple_update128(
clSetKernelArg(*kernel, 4, sizeof(unsigned int), &lda);
clSetKernelArg(*kernel, 5, sizeof(int), &npages);
clSetKernelArg(*kernel, 6, sizeof(unsigned int), &m);
-
+
err = clEnqueueNDRangeKernel(queue, *kernel, 2, NULL,
globalThreads, globalLocal,
0, NULL, event);
-
+
return err;
}
@@ -770,7 +808,7 @@ cl_int diag_dtrtri128(
if (uplo == clblasLower) {
-
+
diag_dtrtri_kernel_lower_KernelSource = diag_dtrtri_lower_128_16_src;
diag_dtrtri_kernel_lower_ClKernel = &diag_dtrtri_lower_128_16_clKernel;
diag_dtrtri_kernel_lower_KernelBinary = diag_dtrtri_lower_128_16_bin;
@@ -916,10 +954,10 @@ cl_int diag_dtrtri128(
}
if (i * 2 >= M) break;
}
-
+
}
else {
-
+
diag_dtrtri_kernel_upper_KernelSource = diag_dtrtri_upper_128_16_src;
diag_dtrtri_kernel_upper_ClKernel = &diag_dtrtri_upper_128_16_clKernel;
diag_dtrtri_kernel_upper_KernelBinary = diag_dtrtri_upper_128_16_bin;
@@ -949,14 +987,14 @@ cl_int diag_dtrtri128(
size_t globalThreads[1] = { nthreads };
size_t globalLocal[1] = { inner_block_size };
-
+
err = clEnqueueNDRangeKernel(queue, *diag_dtrtri_kernel_upper_ClKernel, 1, NULL,
globalThreads, globalLocal,
0, NULL, NULL);
CL_CHECK(err);
//err = clFinish(queue);
//CL_CHECK(err);
-
+
if (err != CL_SUCCESS) {
//printf( "kernel -diag_dtrtri_kernel_upper- failed with %d\n", err );
return err;
@@ -968,13 +1006,13 @@ cl_int diag_dtrtri128(
//}
// update the inverse up to the size of BLOCK_SIZE
-
+
for (int i = inner_block_size; i < outer_block_size; i *= 2) {
switch (i) {
case 16:
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
-
+
err = call_kernel_triple_update128(&triple_dgemm_update_128_16_R_clKernel,
triple_dgemm_update_128_16_R_src,
TrtriBuildOptions,
@@ -991,7 +1029,7 @@ cl_int diag_dtrtri128(
case 32:
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
-
+
err = call_kernel_triple_update128(&triple_dgemm_update_128_32_PART1_R_clKernel,
triple_dgemm_update_128_32_PART1_R_src,
TrtriBuildOptions,
@@ -1010,13 +1048,13 @@ cl_int diag_dtrtri128(
queue,
A, offA, d_dinvA, i, lda, M, event);
CL_CHECK(err);
-
+
break;
case 64:
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
//CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event);
-
+
err = call_kernel_triple_update128(&triple_dgemm_update_128_64_PART1_R_clKernel,
triple_dgemm_update_128_64_PART1_R_src,
TrtriBuildOptions,
@@ -1035,7 +1073,7 @@ cl_int diag_dtrtri128(
queue,
A, offA, d_dinvA, i, lda, M, event);
CL_CHECK(err);
-
+
break;
default:
@@ -1125,12 +1163,12 @@ static clblasStatus gpu_dtrsm128(
double one = 1.0;
double zero = 0.0;
- // Helper to compute pass the 3 arguments describing a (sub)-matrix to clblasDgemm
-#define _(M,i,j) M , (off##M + ((i)+(j)*ld##M) ) , ld##M
+ // Helper to compute pass the 3 arguments describing a (sub)-matrix to clblasDgemm
+#define _(M,i,j) M , (off##M + ((i)+(j)*ld##M) ) , ld##M
cl_mem InvA = 0;
cl_mem X = 0;
- // X of size mxn will contain the result
+ // X of size mxn will contain the result
size_t ldX = M;
size_t offX = 0; //must be 0: needed by the _(X,i,j) macro
size_t size_X = N*ldX * sizeof(double);
@@ -1158,17 +1196,17 @@ static clblasStatus gpu_dtrsm128(
CL_CHECK(err);
//
- // Helper for C = alpha * transp(A) * B + beta * C
+ // Helper for C = alpha * transp(A) * B + beta * C
//
// In the calls below:
// - the 1st matrix shall be either A or InvA transposed according to transA.
- // - the 2nd and 3rd matrices are either B and X
- //
+ // - the 2nd and 3rd matrices are either B and X
+ //
#define DGEMM_LEFT(m, n, k, alpha, A, B, beta, C) \
do { \
err = clblasDgemm(clblasColumnMajor, transA, clblasNoTrans , m, n, k, alpha, A, B, beta, C , 1, commandQueues, 0, NULL, events ) ; \
CL_CHECK(err); \
- } while(0)
+ } while(0)
if (transA == clblasNoTrans)
@@ -1198,12 +1236,12 @@ static clblasStatus gpu_dtrsm128(
DGEMM_LEFT(M - i - outer_block_size, N, outer_block_size, neg_one, _(A, i + outer_block_size, i), _(X, i, 0), one, _(B, i + outer_block_size, 0));
}
- //check_last_error() ;
+ //check_last_error() ;
}
}
- else // if ( uplo == clblasUpper)
+ else // if ( uplo == clblasUpper)
{
/* the upper case */
/* handle the first block seperately with alpha */
@@ -1283,7 +1321,7 @@ static clblasStatus gpu_dtrsm128(
else
{
//
- // Helper for C = alpha * B * A + beta * C
+ // Helper for C = alpha * B * A + beta * C
//
// In the calls below
// - the 2nd matrix shall be either A or InvA transposed according to transA
@@ -1293,7 +1331,7 @@ static clblasStatus gpu_dtrsm128(
do { \
err = clblasDgemm(clblasColumnMajor, clblasNoTrans, transA , m, n, k, alpha, B, A, beta, C , 1, commandQueues, 0, NULL, events ) ; \
CL_CHECK(err); \
- } while(0)
+ } while(0)
// side=R
@@ -1545,8 +1583,8 @@ cl_event *events)
return (clblasStatus)err;
}
-extern "C"
-clblasStatus
+extern "C"
+clblasStatus
clblasStrsm(
clblasOrder order,
clblasSide side,
@@ -1571,25 +1609,25 @@ clblasStrsm(
CHECK_QUEUES(numCommandQueues, commandQueues);
CHECK_EVENTS(numEventsInWaitList, eventWaitList);
- if ( numCommandQueues>1 )
+ if ( numCommandQueues>1 )
{
numCommandQueues = 1 ; // No support for multi-device (yet)
}
- cl_command_queue queue = commandQueues[0];
+ cl_command_queue queue = commandQueues[0];
- clblasStrsmFunctor::Args args(order,
- side,
- uplo,
- transA,
- diag,
- M, N,
+ clblasStrsmFunctor::Args args(order,
+ side,
+ uplo,
+ transA,
+ diag,
+ M, N,
alpha,
A, offA, lda,
- B, offB, ldb,
+ B, offB, ldb,
queue,
- numEventsInWaitList,
- eventWaitList,
+ numEventsInWaitList,
+ eventWaitList,
events);
@@ -1608,8 +1646,8 @@ clblasStrsm(
return res;
}
-extern "C"
-clblasStatus
+extern "C"
+clblasStatus
clblasDtrsm(
clblasOrder order,
clblasSide side,
@@ -1635,25 +1673,25 @@ clblasDtrsm(
CHECK_QUEUES(numCommandQueues, commandQueues);
CHECK_EVENTS(numEventsInWaitList, eventWaitList);
- if ( numCommandQueues>1 )
+ if ( numCommandQueues>1 )
{
numCommandQueues = 1 ; // No support for multi-device (yet)
}
- cl_command_queue queue = commandQueues[0];
+ cl_command_queue queue = commandQueues[0];
- clblasDtrsmFunctor::Args args(order,
- side,
- uplo,
- transA,
- diag,
- M, N,
+ clblasDtrsmFunctor::Args args(order,
+ side,
+ uplo,
+ transA,
+ diag,
+ M, N,
alpha,
A, offA, lda,
- B, offB, ldb,
+ B, offB, ldb,
queue,
- numEventsInWaitList,
- eventWaitList,
+ numEventsInWaitList,
+ eventWaitList,
events);
@@ -1689,7 +1727,7 @@ clblasDtrsm(
if (specialCaseHandled)
return SpecialCaseStatus;
-
+
SpecialCaseStatus = gpu_dtrsm128(order,
side,
uplo,
@@ -1707,14 +1745,14 @@ clblasDtrsm(
if (specialCaseHandled)
return SpecialCaseStatus;
-
+
CLBlasKargs kargs;
memset(&kargs, 0, sizeof(kargs));
kargs.dtype = TYPE_DOUBLE;
kargs.alpha.argDouble = alpha;
-
+
return doTrsm(&kargs,
order,
side,
@@ -1728,11 +1766,11 @@ clblasDtrsm(
numEventsInWaitList,
eventWaitList,
events);
-
+
}
-extern "C"
-clblasStatus
+extern "C"
+clblasStatus
clblasCtrsm(
clblasOrder order,
clblasSide side,
@@ -1757,25 +1795,25 @@ clblasCtrsm(
CHECK_QUEUES(numCommandQueues, commandQueues);
CHECK_EVENTS(numEventsInWaitList, eventWaitList);
- if ( numCommandQueues>1 )
+ if ( numCommandQueues>1 )
{
numCommandQueues = 1 ; // No support for multi-device (yet)
}
- cl_command_queue queue = commandQueues[0];
+ cl_command_queue queue = commandQueues[0];
- clblasCtrsmFunctor::Args args(order,
- side,
- uplo,
- transA,
- diag,
- M, N,
+ clblasCtrsmFunctor::Args args(order,
+ side,
+ uplo,
+ transA,
+ diag,
+ M, N,
alpha,
A, offA, lda,
- B, offB, ldb,
+ B, offB, ldb,
queue,
- numEventsInWaitList,
- eventWaitList,
+ numEventsInWaitList,
+ eventWaitList,
events);
@@ -1794,8 +1832,8 @@ clblasCtrsm(
return res;
}
-extern "C"
-clblasStatus
+extern "C"
+clblasStatus
clblasZtrsm(
clblasOrder order,
clblasSide side,
@@ -1820,25 +1858,25 @@ clblasZtrsm(
CHECK_QUEUES(numCommandQueues, commandQueues);
CHECK_EVENTS(numEventsInWaitList, eventWaitList);
- if ( numCommandQueues>1 )
+ if ( numCommandQueues>1 )
{
numCommandQueues = 1 ; // No support for multi-device (yet)
}
- cl_command_queue queue = commandQueues[0];
+ cl_command_queue queue = commandQueues[0];
- clblasZtrsmFunctor::Args args(order,
- side,
- uplo,
- transA,
- diag,
- M, N,
+ clblasZtrsmFunctor::Args args(order,
+ side,
+ uplo,
+ transA,
+ diag,
+ M, N,
alpha,
A, offA, lda,
- B, offB, ldb,
+ B, offB, ldb,
queue,
- numEventsInWaitList,
- eventWaitList,
+ numEventsInWaitList,
+ eventWaitList,
events);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits
mailing list