[clblas] 46/67: finished dtrsm offline compile dev

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Tue Oct 27 08:02:14 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clblas.

commit 4f204b2b70fb508ff94d017818c62bb413e57027
Author: Timmy <timmy.liu at amd.com>
Date:   Fri Oct 2 14:58:50 2015 -0500

    finished dtrsm offline compile dev
---
 src/library/CMakeLists.txt                         |  57 ++++
 src/library/OCLBinaryGenerator.cmake               |  86 +++++
 .../AutoGemmTools/AutoGemmPreCompileKernels.cpp    |   5 +-
 .../blas/trtri/TrtriKernelSourceIncludes.cpp       |  38 +++
 .../blas/trtri/diag_dtrtri_lower_128_16.cpp        |   3 +
 .../blas/trtri/diag_dtrtri_upper_128_16.cpp        |   1 +
 .../blas/trtri/diag_dtrtri_upper_192_12.cpp        |   1 +
 .../trtri/triple_dgemm_update_128_16_PART1_L.cpp   |   1 +
 .../trtri/triple_dgemm_update_128_16_PART2_L.cpp   |   1 +
 .../blas/trtri/triple_dgemm_update_128_16_R.cpp    |   1 +
 .../trtri/triple_dgemm_update_128_32_PART1_L.cpp   |   1 +
 .../trtri/triple_dgemm_update_128_32_PART1_R.cpp   |   1 +
 .../trtri/triple_dgemm_update_128_32_PART2_L.cpp   |   1 +
 .../trtri/triple_dgemm_update_128_32_PART2_R.cpp   |   1 +
 .../trtri/triple_dgemm_update_128_64_PART1_L.cpp   |   1 +
 .../trtri/triple_dgemm_update_128_64_PART1_R.cpp   |   1 +
 .../trtri/triple_dgemm_update_128_64_PART2_L.cpp   |   1 +
 .../trtri/triple_dgemm_update_128_64_PART2_R.cpp   |   1 +
 .../triple_dgemm_update_128_ABOVE64_PART1_L.cpp    |   1 +
 .../triple_dgemm_update_128_ABOVE64_PART1_R.cpp    |   1 +
 .../triple_dgemm_update_128_ABOVE64_PART2_L.cpp    |   1 +
 .../triple_dgemm_update_128_ABOVE64_PART2_R.cpp    |   1 +
 .../triple_dgemm_update_128_ABOVE64_PART3_L.cpp    |   3 +
 .../triple_dgemm_update_128_ABOVE64_PART3_R.cpp    |   3 +
 .../blas/trtri/triple_dgemm_update_192_12_R.cpp    |   1 +
 .../trtri/triple_dgemm_update_192_24_PART1_R.cpp   |   1 +
 .../trtri/triple_dgemm_update_192_24_PART2_R.cpp   |   1 +
 .../trtri/triple_dgemm_update_192_48_PART1_R.cpp   |   1 +
 .../trtri/triple_dgemm_update_192_48_PART2_R.cpp   |   1 +
 .../trtri/triple_dgemm_update_192_96_PART1_R.cpp   |   1 +
 .../trtri/triple_dgemm_update_192_96_PART2_R.cpp   |   1 +
 .../tools/OCLBinaryGenerator/CMakeLists.txt        |  33 ++
 .../OCLBinaryGenerator/OCLBinaryGenerator.cpp      | 347 +++++++++++++++++++++
 33 files changed, 597 insertions(+), 2 deletions(-)

diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 9a6b2f9..377aa9d 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -90,6 +90,7 @@ option( PRECOMPILE_GEMM_TRANS_CN "AutoGemm: pre-compile CN transpose cases" OFF)
 option( PRECOMPILE_GEMM_TRANS_CT "AutoGemm: pre-compile CT transpose cases" OFF)
 option( PRECOMPILE_GEMM_TRANS_CC "AutoGemm: pre-compile CC transpose cases" OFF)
 
+
 # opencl compiler version
 #set( PRECOMPILE_GEMM_OPENCL_VERSION "2.0" CACHE STRING "OpenCL compiler version supported by device driver." )
 #set_property( CACHE PRECOMPILE_GEMM_OPENCL_VERSION PROPERTY STRINGS 2.0 1.2 1.1 )
@@ -291,7 +292,25 @@ source_group(AutoGemm\\src FILES ${AUTOGEMM_SRC} ${AUTOGEMM_PRECOMPILED_KERNELS}
 # AutoGemm End
 ################################################################################
 
+################################################################################
+# BEGIN Pre Compile General (static) Kernels
+################################################################################
+# options for pre-compiling trsm kernels
+option( PRECOMPILE_TRSM_STRSM "pre-compile available dtrsm kernels" OFF )
+option( PRECOMPILE_TRSM_DTRSM "pre-compile available strsm kernels" OFF )
+if(PRECOMPILE_TRSM_DTRSM)
+add_definitions(-DCLBLAS_OFFLINE_COMPILE_DTRSM)
+message(STATUS "precompile DTRSM kernels.")
+endif()
+if(PRECOMPILE_TRSM_STRSM)
+add_definitions(-DCLBLAS_OFFLINE_COMPILE_STRSM)
+message(STATUS "precompile STRSM kernels. (not yet implemented)")
+endif()
+
 
+################################################################################
+# END Pre Compile General (static) Kernels
+################################################################################
 
 set(SRC_BLAS
     blas/init.c
@@ -670,6 +689,40 @@ ExternalProject_Add( tplgen
     INSTALL_COMMAND ""
 )
 
+################OCLBinaryGenerator
+if (PRECOMPILE_TRSM_DTRSM OR PRECOMPILE_TRSM_STRSM)
+
+
+ExternalProject_Add( OCLBinaryGenerator
+	URL "${CMAKE_SOURCE_DIR}/library/tools/OCLBinaryGenerator"
+	CMAKE_ARGS -DOPENCL_LIBRARIES=${OPENCL_LIBRARIES} -DOPENCL_INCLUDE_DIRS=${OPENCL_INCLUDE_DIRS}
+	INSTALL_COMMAND ""
+)
+ExternalProject_Get_Property( OCLBinaryGenerator binary_dir )
+message(STATUS "OCLBinaryGenerator binary_dir =${binary_dir}")
+set( OCLBinaryGeneratorBinaryDir "${binary_dir}/staging" )
+
+# OCLBinaryGenerator requires at least three inputs
+# 1, path to the kernel file
+# 2, file name
+# 3, output directory
+# 4, [optional] compiler flags
+# 5, [optional] trageted hardware. If this is not supplied OCLBinaryGenerator will generate binary for the first device on system
+set( OCL_COMPILER_FLAGS " ")
+if( OPENCL_VERSION STREQUAL "2.0")
+    set( OCL_COMPILER_FLAGS "-cl-std=CL2.0")
+endif()
+
+add_custom_target( OCLBinaryGenerator_GEN )
+add_custom_command(TARGET OCLBinaryGenerator_GEN
+                   PRE_BUILD
+				   COMMAND ${CMAKE_COMMAND} -DOCLBinaryGeneratorBinaryDir=${OCLBinaryGeneratorBinaryDir} -DSOURCE_DIR=${CMAKE_SOURCE_DIR} -DBINARY_DIR=${CMAKE_BINARY_DIR} -DOCL_COMPILER_FLAGS=${OCL_COMPILER_FLAGS}
+				   -P "${CMAKE_SOURCE_DIR}/library/OCLBinaryGenerator.cmake"
+				   )	  
+add_dependencies( OCLBinaryGenerator_GEN OCLBinaryGenerator )
+
+endif()
+
 # if offline compilation is not chosen, bingen should not be built
 if(OPENCL_OFFLINE_BUILD_TAHITI_KERNEL OR OPENCL_OFFLINE_BUILD_HAWAII_KERNEL OR OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL)
 	ExternalProject_Add( bingen
@@ -801,6 +854,10 @@ add_library(clBLAS
   )
 add_dependencies(clBLAS GENERATE_CLT)
 
+if (PRECOMPILE_TRSM_DTRSM OR PRECOMPILE_TRSM_STRSM)
+add_dependencies(clBLAS OCLBinaryGenerator_GEN)
+endif()
+
 # AutoGemm needs compiler flag to utilize pre-compiled kernels
 if ( ${PRECOMPILE_GEMM_ACTIVE} )
   set_target_properties(clBLAS PROPERTIES COMPILE_FLAGS -DAUTOGEMM_USE_PRE_COMPILED_KERNELS)
diff --git a/src/library/OCLBinaryGenerator.cmake b/src/library/OCLBinaryGenerator.cmake
new file mode 100644
index 0000000..602f078
--- /dev/null
+++ b/src/library/OCLBinaryGenerator.cmake
@@ -0,0 +1,86 @@
+
+message(STATUS "inside OCLBinaryGenerator.cmake")
+message(STATUS "OCLBinary.cmake SOURCE_DIR=${SOURCE_DIR}")
+message(STATUS "OCLBinary.cmake BINARY_DIR=${BINARY_DIR}")
+
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri diag_dtrtri_lower_128_16 ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri diag_dtrtri_upper_128_16 ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri diag_dtrtri_upper_192_12 ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_16_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_16_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_16_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART3_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART3_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_12_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_24_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_24_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_48_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_48_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_96_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
+execute_process(
+	COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_96_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
+	)
diff --git a/src/library/blas/AutoGemm/AutoGemmTools/AutoGemmPreCompileKernels.cpp b/src/library/blas/AutoGemm/AutoGemmTools/AutoGemmPreCompileKernels.cpp
index 1d75dbb..c94e9a2 100644
--- a/src/library/blas/AutoGemm/AutoGemmTools/AutoGemmPreCompileKernels.cpp
+++ b/src/library/blas/AutoGemm/AutoGemmTools/AutoGemmPreCompileKernels.cpp
@@ -682,6 +682,7 @@ int main( int argc, char *argv[] ) {
 	clockStart = (unsigned long long)s.tv_sec * 1000000 + (unsigned long long)s.tv_usec;
 #endif
 	const int specialKernelCount = user_kernel_count;
+
 	totalKernelsToCompile = gemmPreCompileNum;
 	totalKernelsToCompile *= 4;
 	totalKernelsToCompile += specialKernelCount;
@@ -823,7 +824,7 @@ int main( int argc, char *argv[] ) {
 	  beta = 1.0;
 	  char *appendString = appendStringArray[i];
 
-
+  
 	  compileKernelAndWriteToFile<float>(
 		  context,
 		  clblasColumnMajor,
@@ -838,7 +839,7 @@ int main( int argc, char *argv[] ) {
 		  tileKernelSource,
 		  binaryBuildOptions,
 		  appendString);
-
+        
   }
   
   // for each kernel to be pre-compiled
diff --git a/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp b/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
index 0a229b2..e4a685f 100644
--- a/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
+++ b/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
@@ -6,6 +6,7 @@
 #ifndef TRTRI_SOURCE_INCLUDES_CPP
 #define TRTRI_SOURCE_INCLUDES_CPP
 
+#ifndef CLBLAS_OFFLINE_COMPILE_DTRSM
 /*mod 192 dtrsm*/
 #include "diag_dtrtri_upper_192_12.cpp"
 #include "triple_dgemm_update_192_12_R.cpp"
@@ -40,4 +41,41 @@
 #include "triple_dgemm_update_128_ABOVE64_PART2_L.cpp"
 #include "triple_dgemm_update_128_ABOVE64_PART3_L.cpp"
 
+#else
+/*mod 192 dtrsm*/
+#include "diag_dtrtri_upper_192_12_bin.cpp"
+#include "triple_dgemm_update_192_12_R_bin.cpp"
+#include "triple_dgemm_update_192_24_PART1_R_bin.cpp"
+#include "triple_dgemm_update_192_24_PART2_R_bin.cpp"
+#include "triple_dgemm_update_192_48_PART1_R_bin.cpp"
+#include "triple_dgemm_update_192_48_PART2_R_bin.cpp"
+#include "triple_dgemm_update_192_96_PART1_R_bin.cpp"
+#include "triple_dgemm_update_192_96_PART2_R_bin.cpp"
+
+/*mod 128 dtrsm*/
+/*upper*/
+#include "diag_dtrtri_upper_128_16_bin.cpp"
+#include "triple_dgemm_update_128_16_R_bin.cpp"
+#include "triple_dgemm_update_128_32_PART1_R_bin.cpp"
+#include "triple_dgemm_update_128_32_PART2_R_bin.cpp"
+#include "triple_dgemm_update_128_64_PART1_R_bin.cpp"
+#include "triple_dgemm_update_128_64_PART2_R_bin.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART1_R_bin.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART2_R_bin.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART3_R_bin.cpp"
+
+/*lower*/
+#include "diag_dtrtri_lower_128_16_bin.cpp"
+#include "triple_dgemm_update_128_16_PART1_L_bin.cpp"
+#include "triple_dgemm_update_128_16_PART2_L_bin.cpp"
+#include "triple_dgemm_update_128_32_PART1_L_bin.cpp"
+#include "triple_dgemm_update_128_32_PART2_L_bin.cpp"
+#include "triple_dgemm_update_128_64_PART1_L_bin.cpp"
+#include "triple_dgemm_update_128_64_PART2_L_bin.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART1_L_bin.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART2_L_bin.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART3_L_bin.cpp"
+
+#endif //CLBLAS_OFFLINE_COMPILE_DTRSM
+
 #endif
diff --git a/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp b/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp
index 17b8be6..6037eed 100644
--- a/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp
+++ b/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp
@@ -11,9 +11,11 @@
 #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
 #endif
 
+
 unsigned char *diag_dtrtri_lower_128_16_bin = 0;
 size_t diag_dtrtri_lower_128_16_binSize = 0;
 
+
 const char * const diag_dtrtri_lower_128_16_src = STRINGIFY(
 #define BLOCK_SIZE 16 \n
 #define NB 128 \n
@@ -165,5 +167,6 @@ for (i = BLOCK_SIZE - 2; i >= 0; i--) {\n
 for (i = 0; i < BLOCK_SIZE; i++)\n
 	*(d_dinvA + i*NB + tx) = Bs[i*BLOCK_SIZE + tx]; \n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp b/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp
index 83f42f2..c31a218 100644
--- a/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp
+++ b/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp
@@ -146,5 +146,6 @@ uint na)\n
   }\n
   
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp b/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp
index b4409c5..dde0776 100644
--- a/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp
+++ b/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp
@@ -144,5 +144,6 @@ for (i = 0; i < BLOCK_SIZE; i++)\n
 
 
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp
index cfdf9ab..c0e3b4c 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp
@@ -156,5 +156,6 @@ Ain = Ain + offAin; \n
 //__syncthreads();
 barrier(CLK_LOCAL_MEM_FENCE); \n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp
index 5ccbda2..71c13dc 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp
@@ -138,5 +138,6 @@ Ain = Ain + offAin; \n
 //__syncthreads();
 barrier(CLK_LOCAL_MEM_FENCE); \n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp
index 7026563..237d3fe 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp
@@ -234,5 +234,6 @@ barrier(CLK_LOCAL_MEM_FENCE); \n
 	}\n
 }\n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp
index f0ce2f1..46b7e97 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp
@@ -145,5 +145,6 @@ int PagesPerNB = NB / (blk * 2); \n
 //__syncthreads();
 barrier(CLK_LOCAL_MEM_FENCE); \n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp
index 06a9b73..3358af6 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp
@@ -146,5 +146,6 @@ int PagesPerNB = NB / (blk * 2); \n
 //__syncthreads();
 barrier(CLK_LOCAL_MEM_FENCE); \n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp
index 4b9b813..e480d6b 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp
@@ -130,5 +130,6 @@ int PagesPerNB = NB / (blk * 2); \n
 	}\n
 }\n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp
index 2fcd684..6c04dee 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp
@@ -131,5 +131,6 @@ int PagesPerNB = NB / (blk * 2); \n
 	}\n
 }\n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp
index 5e80dc5..eef824c 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp
@@ -140,5 +140,6 @@ int PagesPerNB = NB / (blk * 2); \n
 	}\n
 }\n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp
index 92361d4..0f64809 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp
@@ -140,5 +140,6 @@ int PagesPerNB = NB / (blk * 2); \n
 	}\n
 }\n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp
index 47e7fca..83e0c7e 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp
@@ -128,5 +128,6 @@ int PagesPerNB = NB / (blk * 2); \n
 	}\n
 }\n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp
index 09e37c4..5ce3e42 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp
@@ -129,5 +129,6 @@ int PagesPerNB = NB / (blk * 2); \n
 }\n
 
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp
index 571cb72..af7f518 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp
@@ -141,5 +141,6 @@ int PagesPerNB = NB / (blk * 2); \n
 	}\n
 }\n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp
index a7bf4e6..51a3e87 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp
@@ -139,5 +139,6 @@ int PagesPerNB = NB / (blk * 2); \n
 	}\n
 	}\n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp
index f2b99ba..674fdd5 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp
@@ -129,5 +129,6 @@ int PagesPerNB = NB / (blk * 2); \n
 	}\n
 	}\n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp
index b485b50..a45494b 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp
@@ -130,5 +130,6 @@ int PagesPerNB = NB / (blk * 2); \n
 }\n
 
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp
index 994fabb..d2077bf 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp
@@ -40,6 +40,8 @@ double * __restrict__ c)\n
 	c[15] += alpha * b[15]; \n
 }\n
 #define NB 128\n
+#define ZERO              ( 0.0) \n
+#define ONE               ( 1.0) \n
 #define __mul(i,j) ((i)*(j))\n
 #define qmod(a, b) ((a)%(b))\n
 __kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
@@ -84,5 +86,6 @@ int PagesPerNB = NB / (blk * 2); \n
 //__syncthreads();
 barrier(CLK_LOCAL_MEM_FENCE); \n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp
index 7d09937..004a8d2 100644
--- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp
@@ -42,6 +42,8 @@ double * __restrict__ c)\n
 	c[15] += alpha * b[15]; \n
 }\n
 #define NB 128\n
+#define ZERO              ( 0.0) \n
+#define ONE               ( 1.0) \n
 #define __mul(i,j) ((i)*(j))\n
 #define qmod(a, b) ((a)%(b))\n
 __kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
@@ -87,5 +89,6 @@ int PagesPerNB = NB / (blk * 2); \n
 }\n
 
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp
index ca5529f..79bc4c0 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp
@@ -189,5 +189,6 @@ const char * const triple_dgemm_update_192_12_R_src = STRINGIFY(
 	} while (i < 12);\n
 
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp
index b23a276..6b62eb4 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp
@@ -112,5 +112,6 @@ const char * const triple_dgemm_update_192_24_PART1_R_src = STRINGIFY(
 		i = i + 1; \n
 	} while (i < 12); \n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp
index e3a396c..d8c2f99 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp
@@ -107,5 +107,6 @@ const char * const triple_dgemm_update_192_24_PART2_R_src = STRINGIFY(
 	} while (i < 12);\n
 
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp
index 473d989..dafa65b 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp
@@ -139,5 +139,6 @@ const char * const triple_dgemm_update_192_48_PART1_R_src = STRINGIFY(
 		i = i + 1; \n
 	} while (i < 12); \n
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp
index 5315196..4571112 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp
@@ -140,5 +140,6 @@ const char * const triple_dgemm_update_192_48_PART2_R_src = STRINGIFY(
 
 
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp
index 1c8111d..8e58ab6 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp
@@ -151,5 +151,6 @@ const char * const triple_dgemm_update_192_96_PART1_R_src = STRINGIFY(
 
 
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp
index 6c2c61d..2ff217d 100644
--- a/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp
+++ b/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp
@@ -152,5 +152,6 @@ const char * const triple_dgemm_update_192_96_PART2_R_src = STRINGIFY(
 
 
 }\n
+// end of kernel
 );
 #endif
diff --git a/src/library/tools/OCLBinaryGenerator/CMakeLists.txt b/src/library/tools/OCLBinaryGenerator/CMakeLists.txt
new file mode 100644
index 0000000..227bffe
--- /dev/null
+++ b/src/library/tools/OCLBinaryGenerator/CMakeLists.txt
@@ -0,0 +1,33 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+cmake_minimum_required(VERSION 2.6)
+project(OCLBinaryGenerator C CXX)
+ADD_DEFINITIONS(/D_CRT_SECURE_NO_WARNINGS)
+ADD_EXECUTABLE(OCLBinaryGenerator OCLBinaryGenerator.cpp)
+target_link_libraries(OCLBinaryGenerator ${OPENCL_LIBRARIES})
+include_directories(${OPENCL_INCLUDE_DIRS})
+
+set_target_properties( OCLBinaryGenerator PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/staging" )
+  
+if ( MSVC )
+  set_target_properties( OCLBinaryGenerator PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${CMAKE_CURRENT_BINARY_DIR}/staging" )
+  set_target_properties( OCLBinaryGenerator PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${CMAKE_CURRENT_BINARY_DIR}/staging" )
+endif( )
+
+
+
+
diff --git a/src/library/tools/OCLBinaryGenerator/OCLBinaryGenerator.cpp b/src/library/tools/OCLBinaryGenerator/OCLBinaryGenerator.cpp
new file mode 100644
index 0000000..3feea11
--- /dev/null
+++ b/src/library/tools/OCLBinaryGenerator/OCLBinaryGenerator.cpp
@@ -0,0 +1,347 @@
+#include <assert.h>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+#include <cstring>
+#include <iostream>
+
+#ifdef __GNUC__
+// Linux
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/time.h>
+#else
+// Windows
+#include <Windows.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+//#define stat _stat
+#endif
+
+#include "CL/cl.h"
+
+void find_and_replace(std::string& str, const std::string& findStr, const std::string& replaceStr){
+    size_t pos = 0;
+    while ((pos = str.find(findStr, pos)) != std::string::npos){
+        str.replace(pos, findStr.length(), replaceStr);
+        pos += replaceStr.length();
+    }
+}
+
+/******************************************************************************
+* Check OpenCL Errors
+*****************************************************************************/
+#define CL_CHECK(STATUS) \
+  if(STATUS != CL_SUCCESS) { \
+    printf("OpenCL error %i on line %u\n", STATUS, __LINE__); \
+    assert(false); \
+    }
+
+/******************************************************************************
+* write binary to stream
+*****************************************************************************/
+void writeBinaryToStream(std::ostream & out, char *binary, size_t binarySize) {
+    for (int i = 0; i < binarySize; i++) {
+
+        out << std::setw(4) << (int)binary[i];
+
+        if (i < binarySize - 1) {
+            out << ",";
+        }
+        if ((i + 1) % 16 == 0) {
+            out << std::endl;
+        }
+    }
+    out << std::endl;
+}
+
+/******************************************************************************
+* Get AMD Platform
+*****************************************************************************/
+cl_int getAMDPlatform(cl_platform_id *platform) {
+    *platform = NULL;
+    cl_int status = CL_SUCCESS;
+
+    // get num platforms
+    cl_uint numPlatforms;
+    status = clGetPlatformIDs(0, NULL, &numPlatforms);
+    if (status != CL_SUCCESS) {
+        std::cout << "Error: clGetPlatformIDs failed. Error code: " << status << std::endl;
+        return status;
+    }
+
+    if (numPlatforms > 0) {
+        // Get selected platform
+        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+        if (status != CL_SUCCESS) {
+            std::cout << "Error: clGetPlatformIDs failed. Error code : " << status << std::endl;
+            return status;
+        }
+
+        // Print all platforms
+        for (unsigned i = 0; i < numPlatforms; ++i) {
+            char pbuf[100];
+            status = clGetPlatformInfo(platforms[i],
+                CL_PLATFORM_VENDOR,
+                sizeof(pbuf),
+                pbuf,
+                NULL);
+
+            if (status != CL_SUCCESS) {
+                std::cout << "Error: clGetPlatformInfo failed. Error code : " << status << std::endl;
+                return status;
+            }
+
+            //std::cout << "Platform " << i << " : " << pbuf << std::endl;
+        }
+
+        // Get AMD platform
+        for (unsigned i = 0; i < numPlatforms; ++i) {
+            char pbuf[100];
+            status = clGetPlatformInfo(platforms[i],
+                CL_PLATFORM_VENDOR,
+                sizeof(pbuf),
+                pbuf,
+                NULL);
+
+            if (status != CL_SUCCESS) {
+                std::cout << "Error: clGetPlatformInfo failed. Error code: " << status << std::endl;
+                return status;
+            }
+
+            *platform = platforms[i];
+            if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+                break;
+            }
+        }
+
+        // verify AMD platform
+        char pbuf[100];
+        status = clGetPlatformInfo(*platform,
+            CL_PLATFORM_VENDOR,
+            sizeof(pbuf),
+            pbuf,
+            NULL);
+
+        if (status != CL_SUCCESS) {
+            std::cout << "Error: clGetPlatformInfo failed. Error code: " << status << std::endl;
+            return status;
+        }
+        if (strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+            std::cout << "AMD platform not found" << std::endl;
+            return CL_INVALID_PLATFORM;
+        }
+
+    }
+    else {
+        std::cout << "No OpenCL platforms found." << std::endl;
+        return CL_INVALID_PLATFORM;
+    }
+
+    return status;
+}
+
+
+/******************************************************************************
+* get kernel binary from source
+*****************************************************************************/
+cl_int getKernelBinaryFromSource(
+    cl_context context,
+    const char *source,
+    const char *buildOptions,
+    char **binary,
+    size_t *binarySize)
+{
+    cl_int status = CL_SUCCESS;
+
+    // create program
+    cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, &status);
+    CL_CHECK(status);
+
+    cl_uint numDevicesInContext;
+    status = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDevicesInContext, NULL);
+    CL_CHECK(status);
+
+    // get devices
+    cl_device_id* devices = new cl_device_id[numDevicesInContext];
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, numDevicesInContext*sizeof(cl_device_id), devices, NULL);
+    CL_CHECK(status);
+
+    // choose device 0
+    cl_device_id device = devices[0];
+
+    // build program for device
+    status = clBuildProgram(program, 1, &device, buildOptions, NULL, NULL);
+
+
+    // print build failure
+    if (status != CL_SUCCESS) {
+        printf("clBuildProgram Failed\n");
+        printf("status = %d\n", status);
+
+        size_t len = 0;
+        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
+        char* buildLog = new char[len];
+
+        printf("Error: Failed to build program executable!\n");
+        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, len*sizeof(char), buildLog, 0);
+        printf("\nBuild Log:\n\n");
+        printf("%s\n", buildLog);
+        printf("\n\nKernel String:\n\n");
+        printf("%s\n", source);
+
+        binary[0] = 0;
+        *binarySize = 0;
+        return status;
+    }
+
+
+    // get binary from program
+    status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), binarySize, NULL);
+
+    binary[0] = new char[*binarySize];
+
+    status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, 8 /*?*/, binary, NULL);
+    CL_CHECK(status);
+
+    return CL_SUCCESS;
+}
+
+int main(int argc, char *argv[])
+{
+    /*
+    OCLBinaryGenerator requires at least two inputs
+    1, path the kernel file
+    2, file name
+    3, output directory
+    4, optional compiler flags
+    5, [optional] trageted hardware. If this is not supplied OCLBinaryGenerator will generate binary for the first device on system
+    */
+    if (argc < 4)
+    {
+        printf("not enough arguments. OCLBinaryGenerator aborted.\n");
+        exit(-1);
+    }
+
+    //get the input path
+    std::string inputPath = argv[1];
+    inputPath += "\\";
+    std::cout << "OCLBinaryGenerator input path is " << inputPath <<std::endl;
+
+    //get the input filename
+    std::string inputFilename = argv[2];
+    inputFilename += ".cpp";
+    std::cout << "OCLBinaryGenerator input filename is " << inputFilename << std::endl;
+    
+    //get the path to destination
+    std::string outputPath = argv[3];
+    outputPath += "\\";
+    std::cout << "OCLBinaryGenerator output path is " << outputPath << std::endl;
+
+    //output filename
+    std::string outputFilename = argv[2];
+    std::string outputKernelName = argv[2];
+    outputFilename += "_bin.cpp";
+    std::cout << "OCLBinaryGenerator output filename is " << outputFilename << std::endl;
+
+    //get compiler flag
+    std::string OCL_flag = " ";
+    if (argc > 4)
+        OCL_flag = argv[4];
+    std::cout << "OCLBinaryGenerator compiler flag is " << OCL_flag << std::endl;
+
+    //start writing file
+    std::ofstream outputFile;
+    outputFile.open((outputPath+outputFilename).c_str(), std::ios::out);
+
+    const char *outputFileHeader =
+        "/*****************************************************************************\n"
+        " * kernel binary file\n"
+        " ****************************************************************************/\n\n";
+    outputFile << outputFileHeader;
+
+    // get AMD platform
+    cl_platform_id platform;
+    cl_int status = getAMDPlatform(&platform);
+    CL_CHECK(status);
+
+    cl_uint numDevices;
+    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
+    CL_CHECK(status);
+
+    // get all gpu devices
+    cl_device_id* devices = new cl_device_id[numDevices];
+    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
+    CL_CHECK(status);
+
+    // choose device 0 or we can choose a target device in the future
+    cl_device_id device = devices[0];
+
+    // create context
+    cl_context_properties cps[3] = 
+    {
+        CL_CONTEXT_PLATFORM,
+        (cl_context_properties)platform,
+        0
+    };
+    cl_context context = clCreateContext(
+        cps,
+        1, // device
+        &device,
+        NULL,
+        NULL,
+        &status);
+    CL_CHECK(status);
+
+    cl_uint numDevicesInContext;
+    status = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDevicesInContext, NULL);
+    CL_CHECK(status);
+
+    char **kernelBinary = new char*[1];
+    size_t kernelBinarySize;
+
+    
+    std::ifstream inputfile(inputPath+inputFilename);
+    if (!inputfile.is_open())
+    {
+        printf("Input file does not exist. OCLBinaryGenerator aborted.\n");
+        exit(-1);
+    }
+    std::string str((std::istreambuf_iterator<char>(inputfile)), std::istreambuf_iterator<char>());
+    inputfile.close();
+    //std::cout<<str<<std::endl;
+    std::string str_temp("= STRINGIFY(");
+    size_t startOfKernel = str.find("= STRINGIFY(");
+    startOfKernel += str_temp.length();
+    size_t endOfKernel = str.find("// end of kernel");
+    std::string kernelStr = str.substr(startOfKernel, endOfKernel - startOfKernel);
+    //std::cout << "kernelStr = " << std::endl;
+    //std::cout << kernelStr << std::endl;
+    find_and_replace(kernelStr, "\\n", " ");
+    //std::cout << "after kernelStr = " <<std::endl;
+    //std::cout << kernelStr << std::endl;
+
+
+    status = getKernelBinaryFromSource(context, kernelStr.c_str(), OCL_flag.c_str(), kernelBinary, &kernelBinarySize);
+    CL_CHECK(status);
+    //writing the file
+    outputFile << "char " << outputKernelName << "_binArray[" << kernelBinarySize << "] = {" <<std::endl;
+    writeBinaryToStream(outputFile, *kernelBinary, kernelBinarySize);
+    outputFile << "};" << std::endl;
+
+    outputFile << "unsigned char *" << outputKernelName << "_bin = " << "reinterpret_cast<unsigned char *>(" << outputKernelName << "_binArray);" << std::endl;
+    outputFile << "size_t " << outputKernelName << "_binSize = " << kernelBinarySize << ";" << std::endl;
+    outputFile << "const char * const " << outputKernelName << "_src = NULL;" << std::endl;
+
+    //end writing file
+    outputFile.close();
+    printf("OCLBinaryGenerator finished.\n");
+}
\ No newline at end of file

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git



More information about the debian-science-commits mailing list