[clblas] 46/67: finished dtrsm offline compile dev
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Oct 27 08:02:14 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clblas.
commit 4f204b2b70fb508ff94d017818c62bb413e57027
Author: Timmy <timmy.liu at amd.com>
Date: Fri Oct 2 14:58:50 2015 -0500
finished dtrsm offline compile dev
---
src/library/CMakeLists.txt | 57 ++++
src/library/OCLBinaryGenerator.cmake | 86 +++++
.../AutoGemmTools/AutoGemmPreCompileKernels.cpp | 5 +-
.../blas/trtri/TrtriKernelSourceIncludes.cpp | 38 +++
.../blas/trtri/diag_dtrtri_lower_128_16.cpp | 3 +
.../blas/trtri/diag_dtrtri_upper_128_16.cpp | 1 +
.../blas/trtri/diag_dtrtri_upper_192_12.cpp | 1 +
.../trtri/triple_dgemm_update_128_16_PART1_L.cpp | 1 +
.../trtri/triple_dgemm_update_128_16_PART2_L.cpp | 1 +
.../blas/trtri/triple_dgemm_update_128_16_R.cpp | 1 +
.../trtri/triple_dgemm_update_128_32_PART1_L.cpp | 1 +
.../trtri/triple_dgemm_update_128_32_PART1_R.cpp | 1 +
.../trtri/triple_dgemm_update_128_32_PART2_L.cpp | 1 +
.../trtri/triple_dgemm_update_128_32_PART2_R.cpp | 1 +
.../trtri/triple_dgemm_update_128_64_PART1_L.cpp | 1 +
.../trtri/triple_dgemm_update_128_64_PART1_R.cpp | 1 +
.../trtri/triple_dgemm_update_128_64_PART2_L.cpp | 1 +
.../trtri/triple_dgemm_update_128_64_PART2_R.cpp | 1 +
.../triple_dgemm_update_128_ABOVE64_PART1_L.cpp | 1 +
.../triple_dgemm_update_128_ABOVE64_PART1_R.cpp | 1 +
.../triple_dgemm_update_128_ABOVE64_PART2_L.cpp | 1 +
.../triple_dgemm_update_128_ABOVE64_PART2_R.cpp | 1 +
.../triple_dgemm_update_128_ABOVE64_PART3_L.cpp | 3 +
.../triple_dgemm_update_128_ABOVE64_PART3_R.cpp | 3 +
.../blas/trtri/triple_dgemm_update_192_12_R.cpp | 1 +
.../trtri/triple_dgemm_update_192_24_PART1_R.cpp | 1 +
.../trtri/triple_dgemm_update_192_24_PART2_R.cpp | 1 +
.../trtri/triple_dgemm_update_192_48_PART1_R.cpp | 1 +
.../trtri/triple_dgemm_update_192_48_PART2_R.cpp | 1 +
.../trtri/triple_dgemm_update_192_96_PART1_R.cpp | 1 +
.../trtri/triple_dgemm_update_192_96_PART2_R.cpp | 1 +
.../tools/OCLBinaryGenerator/CMakeLists.txt | 33 ++
.../OCLBinaryGenerator/OCLBinaryGenerator.cpp | 347 +++++++++++++++++++++
33 files changed, 597 insertions(+), 2 deletions(-)
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 9a6b2f9..377aa9d 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -90,6 +90,7 @@ option( PRECOMPILE_GEMM_TRANS_CN "AutoGemm: pre-compile CN transpose cases" OFF)
option( PRECOMPILE_GEMM_TRANS_CT "AutoGemm: pre-compile CT transpose cases" OFF)
option( PRECOMPILE_GEMM_TRANS_CC "AutoGemm: pre-compile CC transpose cases" OFF)
+
# opencl compiler version
#set( PRECOMPILE_GEMM_OPENCL_VERSION "2.0" CACHE STRING "OpenCL compiler version supported by device driver." )
#set_property( CACHE PRECOMPILE_GEMM_OPENCL_VERSION PROPERTY STRINGS 2.0 1.2 1.1 )
@@ -291,7 +292,25 @@ source_group(AutoGemm\\src FILES ${AUTOGEMM_SRC} ${AUTOGEMM_PRECOMPILED_KERNELS}
# AutoGemm End
################################################################################
+################################################################################
+# BEGIN Pre Compile General (static) Kernels
+################################################################################
+# options for pre-compiling trsm kernels
+option( PRECOMPILE_TRSM_STRSM "pre-compile available dtrsm kernels" OFF )
+option( PRECOMPILE_TRSM_DTRSM "pre-compile available strsm kernels" OFF )
+if(PRECOMPILE_TRSM_DTRSM)
+add_definitions(-DCLBLAS_OFFLINE_COMPILE_DTRSM)
+message(STATUS "precompile DTRSM kernels.")
+endif()
+if(PRECOMPILE_TRSM_STRSM)
+add_definitions(-DCLBLAS_OFFLINE_COMPILE_STRSM)
+message(STATUS "precompile STRSM kernels. (not yet implemented)")
+endif()
+
+################################################################################
+# END Pre Compile General (static) Kernels
+################################################################################
set(SRC_BLAS
blas/init.c
@@ -670,6 +689,40 @@ ExternalProject_Add( tplgen
INSTALL_COMMAND ""
)
+################OCLBinaryGenerator
+if (PRECOMPILE_TRSM_DTRSM OR PRECOMPILE_TRSM_STRSM)
+
+
+ExternalProject_Add( OCLBinaryGenerator
+ URL "${CMAKE_SOURCE_DIR}/library/tools/OCLBinaryGenerator"
+ CMAKE_ARGS -DOPENCL_LIBRARIES=${OPENCL_LIBRARIES} -DOPENCL_INCLUDE_DIRS=${OPENCL_INCLUDE_DIRS}
+ INSTALL_COMMAND ""
+)
+ExternalProject_Get_Property( OCLBinaryGenerator binary_dir )
+message(STATUS "OCLBinaryGenerator binary_dir =${binary_dir}")
+set( OCLBinaryGeneratorBinaryDir "${binary_dir}/staging" )
+
+# OCLBinaryGenerator requires at least three inputs
+# 1, path to the kernel file
+# 2, file name
+# 3, output directory
+# 4, [optional] compiler flags
+# 5, [optional] trageted hardware. If this is not supplied OCLBinaryGenerator will generate binary for the first device on system
+set( OCL_COMPILER_FLAGS " ")
+if( OPENCL_VERSION STREQUAL "2.0")
+ set( OCL_COMPILER_FLAGS "-cl-std=CL2.0")
+endif()
+
+add_custom_target( OCLBinaryGenerator_GEN )
+add_custom_command(TARGET OCLBinaryGenerator_GEN
+ PRE_BUILD
+ COMMAND ${CMAKE_COMMAND} -DOCLBinaryGeneratorBinaryDir=${OCLBinaryGeneratorBinaryDir} -DSOURCE_DIR=${CMAKE_SOURCE_DIR} -DBINARY_DIR=${CMAKE_BINARY_DIR} -DOCL_COMPILER_FLAGS=${OCL_COMPILER_FLAGS}
+ -P "${CMAKE_SOURCE_DIR}/library/OCLBinaryGenerator.cmake"
+ )
+add_dependencies( OCLBinaryGenerator_GEN OCLBinaryGenerator )
+
+endif()
+
# if offline compilation is not chosen, bingen should not be built
if(OPENCL_OFFLINE_BUILD_TAHITI_KERNEL OR OPENCL_OFFLINE_BUILD_HAWAII_KERNEL OR OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL)
ExternalProject_Add( bingen
@@ -801,6 +854,10 @@ add_library(clBLAS
)
add_dependencies(clBLAS GENERATE_CLT)
+if (PRECOMPILE_TRSM_DTRSM OR PRECOMPILE_TRSM_STRSM)
+add_dependencies(clBLAS OCLBinaryGenerator_GEN)
+endif()
+
# AutoGemm needs compiler flag to utilize pre-compiled kernels
if ( ${PRECOMPILE_GEMM_ACTIVE} )
set_target_properties(clBLAS PROPERTIES COMPILE_FLAGS -DAUTOGEMM_USE_PRE_COMPILED_KERNELS)
diff --git a/src/library/OCLBinaryGenerator.cmake b/src/library/OCLBinaryGenerator.cmake
new file mode 100644
index 0000000..602f078
--- /dev/null
+++ b/src/library/OCLBinaryGenerator.cmake
@@ -0,0 +1,86 @@
+
+message(STATUS "inside OCLBinaryGenerator.cmake")
+message(STATUS "OCLBinary.cmake SOURCE_DIR=${SOURCE_DIR}")
+message(STATUS "OCLBinary.cmake BINARY_DIR=${BINARY_DIR}")
+
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri diag_dtrtri_lower_128_16 ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri diag_dtrtri_upper_128_16 ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri diag_dtrtri_upper_192_12 ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_16_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_16_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_16_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART3_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART3_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_12_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_24_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_24_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_48_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_48_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_96_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
+execute_process(
+ COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_96_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+ )
diff --git a/src/library/blas/AutoGemm/AutoGemmTools/AutoGemmPreCompileKernels.cpp b/src/library/blas/AutoGemm/AutoGemmTools/AutoGemmPreCompileKernels.cpp
index 1d75dbb..c94e9a2 100644
--- a/src/library/blas/AutoGemm/AutoGemmTools/AutoGemmPreCompileKernels.cpp
+++ b/src/library/blas/AutoGemm/AutoGemmTools/AutoGemmPreCompileKernels.cpp
@@ -682,6 +682,7 @@ int main( int argc, char *argv[] ) {
clockStart = (unsigned long long)s.tv_sec * 1000000 + (unsigned long long)s.tv_usec;
#endif
const int specialKernelCount = user_kernel_count;
+
totalKernelsToCompile = gemmPreCompileNum;
totalKernelsToCompile *= 4;
totalKernelsToCompile += specialKernelCount;
@@ -823,7 +824,7 @@ int main( int argc, char *argv[] ) {
beta = 1.0;
char *appendString = appendStringArray[i];
-
+
compileKernelAndWriteToFile<float>(
context,
clblasColumnMajor,
@@ -838,7 +839,7 @@ int main( int argc, char *argv[] ) {
tileKernelSource,
binaryBuildOptions,
appendString);
-
+
}
// for each kernel to be pre-compiled
diff --git a/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp b/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
index 0a229b2..e4a685f 100644
--- a/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
+++ b/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
@@ -6,6 +6,7 @@
#ifndef TRTRI_SOURCE_INCLUDES_CPP
#define TRTRI_SOURCE_INCLUDES_CPP
+#ifndef CLBLAS_OFFLINE_COMPILE_DTRSM
/*mod 192 dtrsm*/
#include "diag_dtrtri_upper_192_12.cpp"
#include "triple_dgemm_update_192_12_R.cpp"
@@ -40,4 +41,41 @@
#include "triple_dgemm_update_128_ABOVE64_PART2_L.cpp"
#include "triple_dgemm_update_128_ABOVE64_PART3_L.cpp"
+#else
+/*mod 192 dtrsm*/
+#include "diag_dtrtri_upper_192_12_bin.cpp"
+#include "triple_dgemm_update_192_12_R_bin.cpp"
+#include "triple_dgemm_update_192_24_PART1_R_bin.cpp"
+#include "triple_dgemm_update_192_24_PART2_R_bin.cpp"
+#include "triple_dgemm_update_192_48_PART1_R_bin.cpp"
+#include "triple_dgemm_update_192_48_PART2_R_bin.cpp"
+#include "triple_dgemm_update_192_96_PART1_R_bin.cpp"
+#include "triple_dgemm_update_192_96_PART2_R_bin.cpp"
+
+/*mod 128 dtrsm*/
+/*upper*/
+#include "diag_dtrtri_upper_128_16_bin.cpp"
+#include "triple_dgemm_update_128_16_R_bin.cpp"
+#include "triple_dgemm_update_128_32_PART1_R_bin.cpp"
+#include "triple_dgemm_update_128_32_PART2_R_bin.cpp"
+#include "triple_dgemm_update_128_64_PART1_R_bin.cpp"
+#include "triple_dgemm_update_128_64_PART2_R_bin.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART1_R_bin.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART2_R_bin.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART3_R_bin.cpp"
+
+/*lower*/
+#include "diag_dtrtri_lower_128_16_bin.cpp"
+#include "triple_dgemm_update_128_16_PART1_L_bin.cpp"
+#include "triple_dgemm_update_128_16_PART2_L_bin.cpp"
+#include "triple_dgemm_update_128_32_PART1_L_bin.cpp"
+#include "triple_dgemm_update_128_32_PART2_L_bin.cpp"
+#include "triple_dgemm_update_128_64_PART1_L_bin.cpp"
+#include "triple_dgemm_update_128_64_PART2_L_bin.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART1_L_bin.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART2_L_bin.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART3_L_bin.cpp"
+
+#endif //CLBLAS_OFFLINE_COMPILE_DTRSM
+
#endif
diff --git a/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp b/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp
index 17b8be6..6037eed 100644
--- a/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp
+++ b/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp
@@ -11,9 +11,11 @@
#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
#endif
+
unsigned char *diag_dtrtri_lower_128_16_bin = 0;
size_t diag_dtrtri_lower_128_16_binSize = 0;
+
const char * const diag_dtrtri_lower_128_16_src = STRINGIFY(
#define BLOCK_SIZE 16 \n
#define NB 128 \n
@@ -165,5 +167,6 @@ for (i = BLOCK_SIZE - 2; i >= 0; i--) {\n
for (i = 0; i < BLOCK_SIZE; i++)\n
*(d_dinvA + i*NB + tx) = Bs[i*BLOCK_SIZE + tx]; \n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp b/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp
index 83f42f2..c31a218 100644
--- a/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp
+++ b/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp
@@ -146,5 +146,6 @@ uint na)\n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp b/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp
index b4409c5..dde0776 100644
--- a/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp
+++ b/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp
@@ -144,5 +144,6 @@ for (i = 0; i < BLOCK_SIZE; i++)\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp
index cfdf9ab..c0e3b4c 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp
@@ -156,5 +156,6 @@ Ain = Ain + offAin; \n
//__syncthreads();
barrier(CLK_LOCAL_MEM_FENCE); \n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp
index 5ccbda2..71c13dc 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp
@@ -138,5 +138,6 @@ Ain = Ain + offAin; \n
//__syncthreads();
barrier(CLK_LOCAL_MEM_FENCE); \n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp
index 7026563..237d3fe 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp
@@ -234,5 +234,6 @@ barrier(CLK_LOCAL_MEM_FENCE); \n
}\n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp
index f0ce2f1..46b7e97 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp
@@ -145,5 +145,6 @@ int PagesPerNB = NB / (blk * 2); \n
//__syncthreads();
barrier(CLK_LOCAL_MEM_FENCE); \n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp
index 06a9b73..3358af6 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp
@@ -146,5 +146,6 @@ int PagesPerNB = NB / (blk * 2); \n
//__syncthreads();
barrier(CLK_LOCAL_MEM_FENCE); \n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp
index 4b9b813..e480d6b 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp
@@ -130,5 +130,6 @@ int PagesPerNB = NB / (blk * 2); \n
}\n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp
index 2fcd684..6c04dee 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp
@@ -131,5 +131,6 @@ int PagesPerNB = NB / (blk * 2); \n
}\n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp
index 5e80dc5..eef824c 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp
@@ -140,5 +140,6 @@ int PagesPerNB = NB / (blk * 2); \n
}\n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp
index 92361d4..0f64809 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp
@@ -140,5 +140,6 @@ int PagesPerNB = NB / (blk * 2); \n
}\n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp
index 47e7fca..83e0c7e 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp
@@ -128,5 +128,6 @@ int PagesPerNB = NB / (blk * 2); \n
}\n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp
index 09e37c4..5ce3e42 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp
@@ -129,5 +129,6 @@ int PagesPerNB = NB / (blk * 2); \n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp
index 571cb72..af7f518 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp
@@ -141,5 +141,6 @@ int PagesPerNB = NB / (blk * 2); \n
}\n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp
index a7bf4e6..51a3e87 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp
@@ -139,5 +139,6 @@ int PagesPerNB = NB / (blk * 2); \n
}\n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp
index f2b99ba..674fdd5 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp
@@ -129,5 +129,6 @@ int PagesPerNB = NB / (blk * 2); \n
}\n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp
index b485b50..a45494b 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp
@@ -130,5 +130,6 @@ int PagesPerNB = NB / (blk * 2); \n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp
index 994fabb..d2077bf 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp
@@ -40,6 +40,8 @@ double * __restrict__ c)\n
c[15] += alpha * b[15]; \n
}\n
#define NB 128\n
+#define ZERO ( 0.0) \n
+#define ONE ( 1.0) \n
#define __mul(i,j) ((i)*(j))\n
#define qmod(a, b) ((a)%(b))\n
__kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
@@ -84,5 +86,6 @@ int PagesPerNB = NB / (blk * 2); \n
//__syncthreads();
barrier(CLK_LOCAL_MEM_FENCE); \n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp
index 7d09937..004a8d2 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp
@@ -42,6 +42,8 @@ double * __restrict__ c)\n
c[15] += alpha * b[15]; \n
}\n
#define NB 128\n
+#define ZERO ( 0.0) \n
+#define ONE ( 1.0) \n
#define __mul(i,j) ((i)*(j))\n
#define qmod(a, b) ((a)%(b))\n
__kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
@@ -87,5 +89,6 @@ int PagesPerNB = NB / (blk * 2); \n
}\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp
index ca5529f..79bc4c0 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp
@@ -189,5 +189,6 @@ const char * const triple_dgemm_update_192_12_R_src = STRINGIFY(
} while (i < 12);\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp
index b23a276..6b62eb4 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp
@@ -112,5 +112,6 @@ const char * const triple_dgemm_update_192_24_PART1_R_src = STRINGIFY(
i = i + 1; \n
} while (i < 12); \n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp
index e3a396c..d8c2f99 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp
@@ -107,5 +107,6 @@ const char * const triple_dgemm_update_192_24_PART2_R_src = STRINGIFY(
} while (i < 12);\n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp
index 473d989..dafa65b 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp
@@ -139,5 +139,6 @@ const char * const triple_dgemm_update_192_48_PART1_R_src = STRINGIFY(
i = i + 1; \n
} while (i < 12); \n
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp
index 5315196..4571112 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp
@@ -140,5 +140,6 @@ const char * const triple_dgemm_update_192_48_PART2_R_src = STRINGIFY(
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp
index 1c8111d..8e58ab6 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp
@@ -151,5 +151,6 @@ const char * const triple_dgemm_update_192_96_PART1_R_src = STRINGIFY(
}\n
+// end of kernel
);
#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp
index 6c2c61d..2ff217d 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp
@@ -152,5 +152,6 @@ const char * const triple_dgemm_update_192_96_PART2_R_src = STRINGIFY(
}\n
+// end of kernel
);
#endif
diff --git a/src/library/tools/OCLBinaryGenerator/CMakeLists.txt b/src/library/tools/OCLBinaryGenerator/CMakeLists.txt
new file mode 100644
index 0000000..227bffe
--- /dev/null
+++ b/src/library/tools/OCLBinaryGenerator/CMakeLists.txt
@@ -0,0 +1,33 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+cmake_minimum_required(VERSION 2.6)
+project(OCLBinaryGenerator C CXX)
+ADD_DEFINITIONS(/D_CRT_SECURE_NO_WARNINGS)
+ADD_EXECUTABLE(OCLBinaryGenerator OCLBinaryGenerator.cpp)
+target_link_libraries(OCLBinaryGenerator ${OPENCL_LIBRARIES})
+include_directories(${OPENCL_INCLUDE_DIRS})
+
+set_target_properties( OCLBinaryGenerator PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/staging" )
+
+if ( MSVC )
+ set_target_properties( OCLBinaryGenerator PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${CMAKE_CURRENT_BINARY_DIR}/staging" )
+ set_target_properties( OCLBinaryGenerator PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${CMAKE_CURRENT_BINARY_DIR}/staging" )
+endif( )
+
+
+
+
diff --git a/src/library/tools/OCLBinaryGenerator/OCLBinaryGenerator.cpp b/src/library/tools/OCLBinaryGenerator/OCLBinaryGenerator.cpp
new file mode 100644
index 0000000..3feea11
--- /dev/null
+++ b/src/library/tools/OCLBinaryGenerator/OCLBinaryGenerator.cpp
@@ -0,0 +1,347 @@
+#include <assert.h>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+#include <cstring>
+#include <iostream>
+
+#ifdef __GNUC__
+// Linux
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/time.h>
+#else
+// Windows
+#include <Windows.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+//#define stat _stat
+#endif
+
+#include "CL/cl.h"
+
+void find_and_replace(std::string& str, const std::string& findStr, const std::string& replaceStr){
+ size_t pos = 0;
+ while ((pos = str.find(findStr, pos)) != std::string::npos){
+ str.replace(pos, findStr.length(), replaceStr);
+ pos += replaceStr.length();
+ }
+}
+
+/******************************************************************************
+* Check OpenCL Errors
+*****************************************************************************/
+#define CL_CHECK(STATUS) \
+ if(STATUS != CL_SUCCESS) { \
+ printf("OpenCL error %i on line %u\n", STATUS, __LINE__); \
+ assert(false); \
+ }
+
+/******************************************************************************
+* write binary to stream
+*****************************************************************************/
+void writeBinaryToStream(std::ostream & out, char *binary, size_t binarySize) {
+ for (int i = 0; i < binarySize; i++) {
+
+ out << std::setw(4) << (int)binary[i];
+
+ if (i < binarySize - 1) {
+ out << ",";
+ }
+ if ((i + 1) % 16 == 0) {
+ out << std::endl;
+ }
+ }
+ out << std::endl;
+}
+
+/******************************************************************************
+* Get AMD Platform
+*****************************************************************************/
+cl_int getAMDPlatform(cl_platform_id *platform) {
+ *platform = NULL;
+ cl_int status = CL_SUCCESS;
+
+ // get num platforms
+ cl_uint numPlatforms;
+ status = clGetPlatformIDs(0, NULL, &numPlatforms);
+ if (status != CL_SUCCESS) {
+ std::cout << "Error: clGetPlatformIDs failed. Error code: " << status << std::endl;
+ return status;
+ }
+
+ if (numPlatforms > 0) {
+ // Get selected platform
+ cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+ status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+ if (status != CL_SUCCESS) {
+ std::cout << "Error: clGetPlatformIDs failed. Error code : " << status << std::endl;
+ return status;
+ }
+
+ // Print all platforms
+ for (unsigned i = 0; i < numPlatforms; ++i) {
+ char pbuf[100];
+ status = clGetPlatformInfo(platforms[i],
+ CL_PLATFORM_VENDOR,
+ sizeof(pbuf),
+ pbuf,
+ NULL);
+
+ if (status != CL_SUCCESS) {
+ std::cout << "Error: clGetPlatformInfo failed. Error code : " << status << std::endl;
+ return status;
+ }
+
+ //std::cout << "Platform " << i << " : " << pbuf << std::endl;
+ }
+
+ // Get AMD platform
+ for (unsigned i = 0; i < numPlatforms; ++i) {
+ char pbuf[100];
+ status = clGetPlatformInfo(platforms[i],
+ CL_PLATFORM_VENDOR,
+ sizeof(pbuf),
+ pbuf,
+ NULL);
+
+ if (status != CL_SUCCESS) {
+ std::cout << "Error: clGetPlatformInfo failed. Error code: " << status << std::endl;
+ return status;
+ }
+
+ *platform = platforms[i];
+ if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+ break;
+ }
+ }
+
+ // verify AMD platform
+ char pbuf[100];
+ status = clGetPlatformInfo(*platform,
+ CL_PLATFORM_VENDOR,
+ sizeof(pbuf),
+ pbuf,
+ NULL);
+
+ if (status != CL_SUCCESS) {
+ std::cout << "Error: clGetPlatformInfo failed. Error code: " << status << std::endl;
+ return status;
+ }
+ if (strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+ std::cout << "AMD platform not found" << std::endl;
+ return CL_INVALID_PLATFORM;
+ }
+
+ }
+ else {
+ std::cout << "No OpenCL platforms found." << std::endl;
+ return CL_INVALID_PLATFORM;
+ }
+
+ return status;
+}
+
+
+/******************************************************************************
+* get kernel binary from source
+*****************************************************************************/
+cl_int getKernelBinaryFromSource(
+ cl_context context,
+ const char *source,
+ const char *buildOptions,
+ char **binary,
+ size_t *binarySize)
+{
+ cl_int status = CL_SUCCESS;
+
+ // create program
+ cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, &status);
+ CL_CHECK(status);
+
+ cl_uint numDevicesInContext;
+ status = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDevicesInContext, NULL);
+ CL_CHECK(status);
+
+ // get devices
+ cl_device_id* devices = new cl_device_id[numDevicesInContext];
+ clGetContextInfo(context, CL_CONTEXT_DEVICES, numDevicesInContext*sizeof(cl_device_id), devices, NULL);
+ CL_CHECK(status);
+
+ // choose device 0
+ cl_device_id device = devices[0];
+
+ // build program for device
+ status = clBuildProgram(program, 1, &device, buildOptions, NULL, NULL);
+
+
+ // print build failure
+ if (status != CL_SUCCESS) {
+ printf("clBuildProgram Failed\n");
+ printf("status = %d\n", status);
+
+ size_t len = 0;
+ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
+ char* buildLog = new char[len];
+
+ printf("Error: Failed to build program executable!\n");
+ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, len*sizeof(char), buildLog, 0);
+ printf("\nBuild Log:\n\n");
+ printf("%s\n", buildLog);
+ printf("\n\nKernel String:\n\n");
+ printf("%s\n", source);
+
+ binary[0] = 0;
+ *binarySize = 0;
+ return status;
+ }
+
+
+ // get binary from program
+ status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), binarySize, NULL);
+
+ binary[0] = new char[*binarySize];
+
+ status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, 8 /*?*/, binary, NULL);
+ CL_CHECK(status);
+
+ return CL_SUCCESS;
+}
+
+int main(int argc, char *argv[])
+{
+ /*
+ OCLBinaryGenerator requires at least two inputs
+ 1, path the kernel file
+ 2, file name
+ 3, output directory
+ 4, optional compiler flags
+ 5, [optional] trageted hardware. If this is not supplied OCLBinaryGenerator will generate binary for the first device on system
+ */
+ if (argc < 4)
+ {
+ printf("not enough arguments. OCLBinaryGenerator aborted.\n");
+ exit(-1);
+ }
+
+ //get the input path
+ std::string inputPath = argv[1];
+ inputPath += "\\";
+ std::cout << "OCLBinaryGenerator input path is " << inputPath <<std::endl;
+
+ //get the input filename
+ std::string inputFilename = argv[2];
+ inputFilename += ".cpp";
+ std::cout << "OCLBinaryGenerator input filename is " << inputFilename << std::endl;
+
+ //get the path to destination
+ std::string outputPath = argv[3];
+ outputPath += "\\";
+ std::cout << "OCLBinaryGenerator output path is " << outputPath << std::endl;
+
+ //output filename
+ std::string outputFilename = argv[2];
+ std::string outputKernelName = argv[2];
+ outputFilename += "_bin.cpp";
+ std::cout << "OCLBinaryGenerator output filename is " << outputFilename << std::endl;
+
+ //get compiler flag
+ std::string OCL_flag = " ";
+ if (argc > 4)
+ OCL_flag = argv[4];
+ std::cout << "OCLBinaryGenerator compiler flag is " << OCL_flag << std::endl;
+
+ //start writing file
+ std::ofstream outputFile;
+ outputFile.open((outputPath+outputFilename).c_str(), std::ios::out);
+
+ const char *outputFileHeader =
+ "/*****************************************************************************\n"
+ " * kernel binary file\n"
+ " ****************************************************************************/\n\n";
+ outputFile << outputFileHeader;
+
+ // get AMD platform
+ cl_platform_id platform;
+ cl_int status = getAMDPlatform(&platform);
+ CL_CHECK(status);
+
+ cl_uint numDevices;
+ status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
+ CL_CHECK(status);
+
+ // get all gpu devices
+ cl_device_id* devices = new cl_device_id[numDevices];
+ clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
+ CL_CHECK(status);
+
+ // choose device 0 or we can choose a target device in the future
+ cl_device_id device = devices[0];
+
+ // create context
+ cl_context_properties cps[3] =
+ {
+ CL_CONTEXT_PLATFORM,
+ (cl_context_properties)platform,
+ 0
+ };
+ cl_context context = clCreateContext(
+ cps,
+ 1, // device
+ &device,
+ NULL,
+ NULL,
+ &status);
+ CL_CHECK(status);
+
+ cl_uint numDevicesInContext;
+ status = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDevicesInContext, NULL);
+ CL_CHECK(status);
+
+ char **kernelBinary = new char*[1];
+ size_t kernelBinarySize;
+
+
+ std::ifstream inputfile(inputPath+inputFilename);
+ if (!inputfile.is_open())
+ {
+ printf("Input file does not exist. OCLBinaryGenerator aborted.\n");
+ exit(-1);
+ }
+ std::string str((std::istreambuf_iterator<char>(inputfile)), std::istreambuf_iterator<char>());
+ inputfile.close();
+ //std::cout<<str<<std::endl;
+ std::string str_temp("= STRINGIFY(");
+ size_t startOfKernel = str.find("= STRINGIFY(");
+ startOfKernel += str_temp.length();
+ size_t endOfKernel = str.find("// end of kernel");
+ std::string kernelStr = str.substr(startOfKernel, endOfKernel - startOfKernel);
+ //std::cout << "kernelStr = " << std::endl;
+ //std::cout << kernelStr << std::endl;
+ find_and_replace(kernelStr, "\\n", " ");
+ //std::cout << "after kernelStr = " <<std::endl;
+ //std::cout << kernelStr << std::endl;
+
+
+ status = getKernelBinaryFromSource(context, kernelStr.c_str(), OCL_flag.c_str(), kernelBinary, &kernelBinarySize);
+ CL_CHECK(status);
+ //writing the file
+ outputFile << "char " << outputKernelName << "_binArray[" << kernelBinarySize << "] = {" <<std::endl;
+ writeBinaryToStream(outputFile, *kernelBinary, kernelBinarySize);
+ outputFile << "};" << std::endl;
+
+ outputFile << "unsigned char *" << outputKernelName << "_bin = " << "reinterpret_cast<unsigned char *>(" << outputKernelName << "_binArray);" << std::endl;
+ outputFile << "size_t " << outputKernelName << "_binSize = " << kernelBinarySize << ";" << std::endl;
+ outputFile << "const char * const " << outputKernelName << "_src = NULL;" << std::endl;
+
+ //end writing file
+ outputFile.close();
+ printf("OCLBinaryGenerator finished.\n");
+}
\ No newline at end of file
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits
mailing list