[clblas] 12/67: fix sgemm NT perf drop when fix sgemm NT perf drop when lda=ldb=4096 or 5120 and k>lda/4
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Oct 27 08:02:09 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clblas.
commit 458c9dafd37fae8679fdc6bbaf4bf086d5bcea0b
Author: Timmy <timmy.liu at amd.com>
Date: Sat Aug 15 04:33:53 2015 -0500
fix sgemm NT perf drop when fix sgemm NT perf drop when lda=ldb=4096 or 5120 and k>lda/4
---
src/library/CMakeLists.txt | 2 +
src/library/blas/functor/hawaii.cc | 17 +
.../blas/functor/hawaii_sgemmBig1024Kernel.cc | 388 +++++++++++++++++++++
.../functor/include/hawaii_sgemmBig1024Kernel.h | 48 +++
4 files changed, 455 insertions(+)
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 49e9b7a..fa195bc 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -81,6 +81,7 @@ set(SRC_BLAS
blas/functor/gcn_dgemmSmallMatrices.cc
blas/functor/gcn_sgemmSmallMatrices.cc
blas/functor/hawaii_sgemmBranchKernel.cc
+ blas/functor/hawaii_sgemmBig1024Kernel.cc
)
set(SRC_BLAS_HEADERS
@@ -114,6 +115,7 @@ set(SRC_BLAS_HEADERS
blas/functor/include/gcn_dgemmSmallMatrices.h
blas/functor/include/gcn_sgemmSmallMatrices.h
blas/functor/include/hawaii_sgemmBranchKernel.h
+ blas/functor/include/hawaii_sgemmBig1024Kernel.h
)
set(SRC_BLAS_GENERIC
diff --git a/src/library/blas/functor/hawaii.cc b/src/library/blas/functor/hawaii.cc
index 62f6666..63fe71d 100644
--- a/src/library/blas/functor/hawaii.cc
+++ b/src/library/blas/functor/hawaii.cc
@@ -27,6 +27,7 @@
#include "hawaii_sgemmSplit64_32.h"
#include "gcn_zgemm.h"
#include "gpu_dtrsm192.h"
+#include "hawaii_sgemmBig1024Kernel.h"
FunctorSelectorHawaii FunctorSelectorHawaii::instance ;
@@ -116,6 +117,22 @@ clblasSgemmFunctor * FunctorSelectorHawaii::select_sgemm_specific(clblasSgemmFun
//if (functor)
// return functor;
+ if ((args.lda % 1024 == 0) && (args.ldb % 1024 == 0) && (args.K > args.lda / 4))
+ {
+ if ((args.lda == args.ldb) && (args.lda >= 4096) && (args.lda <= 8192)) // between 4096 and 8192 for now
+ {
+ if (args.lda != 6144)// 6144 is handled by a special case split
+ {
+ if (args.M % 128 == 0 && args.N % 128 == 0 && args.K % 64 == 0)
+ {
+ functor = clBlashawaiiSgemmBig1024KernelFunctor::provide(args, "Hawaii");
+ if (functor)
+ return functor;
+ }
+ }
+ }
+ }
+
if ((args.M >= 1184 && args.N >= 1184) && (args.M <= 3872 && args.N <= 3872) && (args.M % 64 != 0 && args.N % 64 != 0) && (args.M % 96 != 0 && args.N % 96 != 0) && (args.K % 16 == 0))
{
//all the mod32 sizes that is not mod64 or mod96 ranging from 1184 to 3872
diff --git a/src/library/blas/functor/hawaii_sgemmBig1024Kernel.cc b/src/library/blas/functor/hawaii_sgemmBig1024Kernel.cc
new file mode 100644
index 0000000..87819ba
--- /dev/null
+++ b/src/library/blas/functor/hawaii_sgemmBig1024Kernel.cc
@@ -0,0 +1,388 @@
+#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+#include <functor.h>
+#include <binary_lookup.h>
+#include <iostream>
+
+#include <functor_xgemm.h>
+#include <tahiti.h>
+#include <hawaii.h>
+
+#include "BinaryBuild.h"
+#include "hawaii_sgemmBig1024Kernel.h"
+
+//only non-multiples of 32 is implemented right now, which is a small matrix.
+#if BUILD_KERNEL_FROM_STRING
+#include "sgemm_gcn_BigMatrices.clT"
+#else
+#include "sgemm_gcn_BigMatrices.clHawaii_64.bin.clT"
+#include "sgemm_gcn_BigMatrices.clBonaire_64.bin.clT"
+#endif
+
+// Just because the full name is too long
+typedef clBlashawaiiSgemmBig1024KernelFunctor::Variant Variant;
+
+//define the string name of the soure/binary code
+#define SGEMM_SRC_NAME(TA,TB, DIVK, MULT) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT
+#define SGEMM_SRC_NAME_HAWAII(TA,TB, DIVK, MULT, BITS) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT##_##BITS##_bin_Hawaii
+#define SGEMM_SRC_NAME_BONAIRE(TA,TB, DIVK, MULT, BITS) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT##_##BITS##_bin_Bonaire
+#define SGEMM_SRC_NAME_BIN(TA,TB, DIVK, MULT, BITS, DEVICE) SGEMM_SRC_NAME##_##DEVICE(TA,TB, DIVK, MULT, BITS)
+
+
+//variant name used to differentiate the different ones
+#define SGEMM_VARIANT_NAME(TA,TB, DIVK, MULT) "sgemm_" #TA #TB "_" #DIVK "_SPLIT" #MULT
+//SGEMM_VARIANT_NAME(TA, TB, DIVM , DIVN, DIVK, GREATER48M, GREATER48N, NBKERNEL),
+
+#define SGEMM_KERNEL_NAME(TA,TB,DIVM,DIVN,DIVK,BS0,BS1,NV0,NV1,MULT) "sgemm_" #TA #TB "_" #DIVM "_" #DIVN "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT
+
+
+#define trans_N clblasNoTrans
+#define trans_T clblasTrans
+
+// Fill a variant descriptor using OpenCL source
+#define SGEMM_VARIANT_OBJ(TA,TB,DIVK,BS0,BS1,NV0,NV1, BITS, MULT, \
+ KERNEL_NAME_MAIN, \
+ KERNELS_SRC, \
+ KERNEL_BUILD_OPTIONS, \
+ KERNELS_BIN, \
+ KERNEL_BIN_SIZE) { \
+ SGEMM_VARIANT_NAME(TA,TB, DIVK, MULT), \
+{ KERNEL_NAME_MAIN } , \
+ KERNELS_SRC, \
+ KERNEL_BUILD_OPTIONS, \
+ KERNELS_BIN, \
+ KERNEL_BIN_SIZE, \
+ trans_##TA, trans_##TB, \
+ DIVK , \
+{ BS0, BS1 } , \
+{ NV0, NV1 } , \
+#MULT \
+}
+
+typedef clblasFunctorCache<clBlashawaiiSgemmBig1024KernelFunctor, const Variant *> CacheBig1024;
+static CacheBig1024 cachebig1024;
+
+// Make it 1 to enable additional debug 'print'
+#define VERB 0
+
+
+
+//static bool applicable( const Variant & var, clblasSgemmFunctor::Args & args, int RefMultiple )
+//{
+//#if 0
+// // Transpose values are tested in select_variant
+// if ( args.transA != var.transA ) return false ;
+// if ( args.transB != var.transB ) return false ;
+//#endif
+//
+// //if (args.N>=var.divN && args.N % var.divN != 0 )
+// if ( args.N % var.divN != 0 )
+// return false ;
+// if ( args.M % var.divM != 0 )
+// return false ;
+// if(var.Greater[0]?args.M<RefMultiple:args.M>=RefMultiple)
+// return false;
+// if(var.Greater[1]?args.N<RefMultiple:args.N>=RefMultiple)
+// return false;
+// if ( args.beta==0 && var.mult.compare("__ALPHA")!=0)
+// return false ;
+// return true ;
+//}
+
+static void to_upper(char* input)
+{
+ while(*input)
+ {
+ *input=toupper(*input);
+ input++;
+ }
+}
+
+
+static const Variant * select_variant_Big1024Kernel(clblasSgemmFunctor::Args & args, const char* DevName, cl_uint _64BitsUse)
+{
+ if (_64BitsUse != 64)
+ {
+ std::cout << "we don't support clblas on 32 bits" << std::endl;
+ assert(1);
+ return NULL;
+ }
+
+
+ if (args.transA == clblasNoTrans)
+ {
+ if (args.transB == clblasNoTrans)
+ {
+
+ // ===== sgemm NN ======
+ // sgemm NN does not have big 1024 perf drop problem
+
+
+ return NULL;
+ }
+ if (args.transB == clblasTrans)
+ {
+
+
+ // ===== SGEMM NT ======
+ //sgemm_NT_128_128_16_16x16_8x8__ALPHABETA
+ const char* KName_NT = SGEMM_KERNEL_NAME(N, T, 128, 128, 16, 16, 16, 8, 8, __ALPHABETA);
+
+
+ const char* KBin_NT64;
+ size_t KBin_NTSize64 = 0;
+
+
+#if BUILD_KERNEL_FROM_STRING
+ //currently not supported
+ return NULL;
+#else
+ if (!strcmp(DevName, "Hawaii"))
+ {
+ //KBin_NT64 = SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHABETA, 64, HAWAII) ;
+ KBin_NT64 = sgemm_NT_128_128_16_16x16_8x8__ALPHABETA_64_bin_Hawaii;
+ KBin_NTSize64 = sizeof(sgemm_NT_128_128_16_16x16_8x8__ALPHABETA_64_bin_Hawaii);
+
+ }
+
+#endif
+
+ // ===== SGEMM NT ======
+ static const Variant variant = SGEMM_VARIANT_OBJ(N, T, 16, 16, 16, 8, 8, 64, __ALPHABETA,
+ KName_NT,
+ NULL,
+ NULL,
+ KBin_NT64,
+ KBin_NTSize64);
+
+ return &variant;
+ }
+ }
+ else
+ {
+ if (args.transB == clblasNoTrans)
+ {
+
+ // ===== sgemm TN ======
+ //sgemm TN does not have big 1024 perf drop problem
+ return NULL;
+ }
+ return NULL;
+ }
+
+ return NULL;
+}
+
+
+clBlashawaiiSgemmBig1024KernelFunctor::clBlashawaiiSgemmBig1024KernelFunctor(Args & args, const Variant * variant, cl_int & err)
+{
+
+ cl_device_id device;
+ cl_context context;
+ m_program=NULL;
+ m_variantBig1024 = variant;
+
+
+ cl_command_queue queue = args.queue;
+ err = getDeviceAndContext(queue, device, context);
+ if( err != CL_SUCCESS )
+ {
+ return;
+ }
+
+ if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variantBig1024->variantName);
+
+ //Ben do I use the correct "kernel_name"?
+ BinaryLookup bl(context, device, "clBlashawaiiSgemmBig1024KernelFunctor");
+
+ bl.variantRaw(this->m_variantBig1024->variantName, strlen(this->m_variantBig1024->variantName) + 1);
+
+ if ( !bl.found() ) // may create empty file or may wait until file is ready
+ {
+ if (this->m_variantBig1024->bin != NULL)
+ {
+ // build from a pre-compiled version of the kernel (SPIR or cl binaries)
+ //only 1 binary containing all the kernel
+ err = bl.buildFromBinary(this->m_variantBig1024->bin, this->m_variantBig1024->bin_size, "-cl-std=2.0");
+ }
+ else
+ {
+ //// directly build from a char*
+ if (VERB) printf(" ===> BUILD PROBLEM WE DON'T SUPPORT SOURCE BUILD FOR big 1024 SGEMM\n") ;
+ return;
+ }
+
+ if ( err != CL_SUCCESS )
+ {
+ if (VERB) printf(" ===> BUILD PROBLEM\n") ;
+
+ return;
+ }
+ }
+
+ this->m_program = bl.getProgram();
+}
+
+
+
+clBlashawaiiSgemmBig1024KernelFunctor *
+clBlashawaiiSgemmBig1024KernelFunctor::provide(clblasSgemmFunctor::Args & args, char* DevName)
+{
+
+ if ( args.order == clblasRowMajor )
+ return NULL ; // The RowMajor case shall never occur.
+
+ cl_device_id dev;
+ cl_context ctxt;
+
+ cl_int err = getDeviceAndContext(args.queue, dev, ctxt);
+ if (err != CL_SUCCESS)
+ {
+ return NULL;
+ }
+ cl_uint bitness = getAddressBits(dev);
+
+ int major;
+ int minor;
+
+ getCLVersion(dev, major, minor);
+
+ //if (major<2)
+ // return NULL;
+
+ // to_upper( DevName);
+ const Variant * variant = select_variant_Big1024Kernel(args, DevName, bitness);
+ if ( variant == NULL )
+ return NULL ;
+
+
+
+ CacheBig1024::Lookup lookup(cachebig1024, ctxt, dev, variant) ;
+
+
+ if ( lookup.ok() )
+ {
+ clBlashawaiiSgemmBig1024KernelFunctor * functor = lookup.get();
+ functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used
+ return functor;
+ }
+
+ clBlashawaiiSgemmBig1024KernelFunctor * functor = new clBlashawaiiSgemmBig1024KernelFunctor(args, variant, err);
+ if (err != CL_SUCCESS)
+ {
+ return NULL;
+ }
+
+ lookup.set(functor) ;
+
+ return functor;
+
+}
+
+
+cl_int clBlashawaiiSgemmBig1024KernelFunctor::KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[1], Args &args)
+{
+ //((Mvalue - 1) / 128 + 1) * 16
+ size_t GlobalX = ((args.M-1) / 128 + 1)*16 ;
+
+ //
+
+ size_t GlobalY = ((args.N - 1) / 128 + 1) * 16;
+
+
+ std::size_t gs[2] = {GlobalX, GlobalY};
+ cl_int error = 0;
+
+
+ //if (VERB) printf(" ===> EXECUTE KERNEL 0 \n") ;
+ error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantBig1024->ls, args.numEventsInWaitList, args.eventWaitList,args.events);
+ return error;
+
+
+ return clblasNotImplemented;
+}
+
+
+
+clblasStatus clBlashawaiiSgemmBig1024KernelFunctor::execute(Args &args)
+{
+ cl_int err;
+ cl_command_queue queue = args.queue;
+
+ if (VERB) printf(" ===> EXECUTE KERNEL %s, alpha =%f ,beta = %f\n", this->m_variantBig1024->kernel_name, args.alpha, args.beta);
+
+ cl_kernel kernel[1];
+ int NBKernel = 0;
+
+
+ if (this->m_variantBig1024->kernel_name[0])
+ {
+ kernel[0] = clCreateKernel(this->m_program, this->m_variantBig1024->kernel_name[0], &err);
+ if (err != CL_SUCCESS)
+ return clblasStatus(err) ;
+ NBKernel++;
+ }
+
+
+ if (NBKernel != 1) return clblasStatus(clblasBuildProgramFailure) ;
+
+ if (VERB)
+ {
+ for (int i=0; i<NBKernel; i++)
+ printf(" ===> FOUND %s\n", this->m_variantBig1024->kernel_name[i]);
+ }
+
+ int M = args.M, N = args.N, K = args.K;
+ int lda = args.lda, ldb = args.ldb, ldc = args.ldc;
+
+ int offsetA = args.offA;
+ int offsetB = args.offB;
+ int offsetC = args.offC;
+
+ int arg[4]={0, 0, 0, 0} ;
+
+ //// All sgemm kernels shall have the same arguments: (A,B,C,M,N,K,alpha,beta,lda,ldb,ldc,offa,offb,offc)
+
+ for (int i=0; i<NBKernel; i++)
+ {
+ setKernelArg<cl_mem>(kernel[i], arg[i]++, args.A);
+ setKernelArg<cl_mem>(kernel[i], arg[i]++, args.B);
+ setKernelArg<cl_mem>(kernel[i], arg[i]++, args.C);
+
+ setKernelArg<int>(kernel[i], arg[i]++, M);
+ setKernelArg<int>(kernel[i], arg[i]++, N);
+ setKernelArg<int>(kernel[i], arg[i]++, K);
+
+ setKernelArg<cl_float>(kernel[i], arg[i]++, args.alpha);
+ setKernelArg<cl_float>(kernel[i], arg[i]++, args.beta);
+
+ setKernelArg<int>(kernel[i], arg[i]++, lda);
+ setKernelArg<int>(kernel[i], arg[i]++, ldb);
+ setKernelArg<int>(kernel[i], arg[i]++, ldc);
+
+ setKernelArg<int>(kernel[i], arg[i]++, offsetA);
+ setKernelArg<int>(kernel[i], arg[i]++, offsetB);
+ setKernelArg<int>(kernel[i], arg[i]++, offsetC);
+ }
+
+ err = KernelsLaunch(queue, kernel, args);
+
+
+
+ for (int i = 0; i<NBKernel; i++)
+ clReleaseKernel(kernel[i]) ;
+
+ if (VERB) printf(" ===> ERR=%d \n",(int)err) ;
+
+ // err= clFinish(queue);
+ return clblasStatus(err) ;
+
+}
+#endif
diff --git a/src/library/blas/functor/include/hawaii_sgemmBig1024Kernel.h b/src/library/blas/functor/include/hawaii_sgemmBig1024Kernel.h
new file mode 100644
index 0000000..fb70ae0
--- /dev/null
+++ b/src/library/blas/functor/include/hawaii_sgemmBig1024Kernel.h
@@ -0,0 +1,48 @@
+/*
+Handles lda=ldb=4096, 5120, 7168, 8192
+lda=ldb=6144 should be handled by a special case in hawaii_sgemmSplitKernel
+*/
+#ifndef HAWAII_SGEMMBIG1024KERNEL
+#define HAWAII_SGEMMBIG1024KERNEL
+
+#include "gcn_sgemm.h"
+
+
+class clBlashawaiiSgemmBig1024KernelFunctor : public clblasSgemmFunctorGCN
+{
+public:
+ struct Variant
+ {
+ const char * variantName;
+ const char * kernel_name[1] ; //just one kernel here
+ const char * source ; // the kernel source (shall be unique)
+ const char * build_options;
+ const char * bin ;
+ size_t bin_size ;
+ clblasTranspose transA ; //
+ clblasTranspose transB ; //
+ unsigned divK ; // Required divisor of N (use 1 when N can be of any value)
+ size_t ls[2] ; // Local size (the work-group size)
+ size_t bwi[2] ; // Block size work-item: Number of elements calculated by each work items
+ // So basically each kernel is computing a block of
+ // (ls[0]*bwi[0]) x (ls[1]*bwi[1])
+ // elements of C.
+ std::string mult;
+
+ } ;
+
+
+ private: // Constructor & Destructor
+
+ clBlashawaiiSgemmBig1024KernelFunctor(Args & args, const Variant * variant, cl_int & err);
+ cl_int KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[1], Args &args);
+ const Variant * m_variantBig1024 ; // Pointer to a 'const static' object describing the kernel variant.
+
+public:
+
+ static clBlashawaiiSgemmBig1024KernelFunctor * provide(clblasSgemmFunctor::Args & args, char* DevName);
+ virtual clblasStatus execute(Args &args) ;
+
+};
+
+#endif
\ No newline at end of file
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits
mailing list