[clblas] 53/75: allow users to easily verify the gemm/trmm GPU results with the netlib cblas through client (#274)
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Jan 24 23:30:39 UTC 2017
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/master
in repository clblas.
commit 00a29c6409c9d269e3d680026faf747d0abf85a3
Author: tingxingdong <tingxingdong at gmail.com>
Date: Mon Jun 20 12:57:24 2016 -0500
allow users to easily verify the gemm/trmm GPU results with the netlib cblas through client (#274)
* (1) update README: Netlib is preferred. (2) you can now verify the correctness of gemm & trmm through the client
* give more details of how to get CBLAS on windows
* find the netlib library dir & library in Cmake files
* add a file that was forgotten earlier
* disable validation on Windows for now: there is no easy way to build/link Netlib CBLAS on Windows
---
README.md | 6 +-
src/CMakeLists.txt | 8 +
src/FindNetlib.cmake | 19 +
src/client/CMakeLists.txt | 9 +-
src/client/clfunc_common.hpp | 98 ++++-
src/client/clfunc_xgemm.hpp | 579 +++++++++++++++-----------
src/client/clfunc_xtrmm.hpp | 266 ++++++++----
src/client/client.cpp | 967 ++++++++++++++++++++++---------------------
8 files changed, 1130 insertions(+), 822 deletions(-)
diff --git a/README.md b/README.md
index cd734da..8de7d7e 100644
--- a/README.md
+++ b/README.md
@@ -197,8 +197,12 @@ The simple example below shows how to use clBLAS to compute an OpenCL accelerate
### Test infrastructure
* Googletest v1.6
-* ACML on windows/linux; Accelerate on Mac OSX
* Latest Boost
+* CPU BLAS
+ - Netlib CBLAS (recommended)
+ Ubuntu: install by "apt-get install libblas-dev"
+ Windows: download & install lapack-3.6.0 which comes with CBLAS
+ - or ACML on windows/linux; Accelerate on Mac OSX
### Performance infrastructure
* Python
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 33a91ee..73ba594 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -265,6 +265,14 @@ if( BUILD_TEST )
endif( )
endif( )
+# Building the client without Netlib being found only produces a warning here.
+if( BUILD_CLIENT AND NOT NETLIB_FOUND )
+ message( WARNING "Not find Netlib; BUILD_CLIENT needs the Netlib CBLAS library" )
+endif()
+
+
# This will define OPENCL_FOUND
find_package( OpenCL ${OPENCL_VERSION} )
diff --git a/src/FindNetlib.cmake b/src/FindNetlib.cmake
index a32474e..6a21e61 100644
--- a/src/FindNetlib.cmake
+++ b/src/FindNetlib.cmake
@@ -100,6 +100,25 @@ if( NOT contains_BLAS EQUAL -1 )
FIND_PACKAGE_HANDLE_STANDARD_ARGS( NETLIB DEFAULT_MSG Netlib_BLAS_LIBRARY )
endif( )
+
+#look for netlib cblas header
+if( UNIX )
+ find_path(Netlib_INCLUDE_DIRS cblas.h
+ HINTS
+ /usr/include
+ )
+else()
+ find_path(Netlib_INCLUDE_DIRS cblas.h
+ HINTS
+ ${Netlib_ROOT}/CBLAS/include/
+ )
+endif()
+
+# Warn (without failing the configure step) when the cblas.h lookup above came back empty.
+if( NOT Netlib_INCLUDE_DIRS )
+ message(WARNING "Cannot find cblas.h")
+endif()
+
if( NETLIB_FOUND )
list( APPEND Netlib_LIBRARIES ${Netlib_BLAS_LIBRARY} )
else( )
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
index eb66f8a..752b19b 100644
--- a/src/client/CMakeLists.txt
+++ b/src/client/CMakeLists.txt
@@ -1,12 +1,12 @@
# ########################################################################
# Copyright 2013 Advanced Micro Devices, Inc.
-#
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -48,10 +48,11 @@ include_directories(
${clBLAS_SOURCE_DIR}
${clBLAS_SOURCE_DIR}/include
${clBLAS_SOURCE_DIR}/tests/include
+ ${Netlib_INCLUDE_DIRS}
.)
add_executable(client ${CLIENT_SRC} ${CLIENT_HEADER})
-target_link_libraries(client ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS)
+target_link_libraries(client ${Netlib_LIBRARIES} ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS)
set_target_properties( client PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging"
OUTPUT_NAME clBLAS-client )
diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp
index fc2057b..0f22ef0 100644
--- a/src/client/clfunc_common.hpp
+++ b/src/client/clfunc_common.hpp
@@ -27,6 +27,11 @@
#include "test-limits.h"
#include "dis_warning.h"
+#if defined ( _WIN32 ) || defined ( _WIN64 )
+#else
+#include "cblas.h"
+#endif
+
#include "clBLAS.h"
#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/cl_ext.h>
@@ -77,6 +82,57 @@ randomScale()
return t;
}
+#if defined ( _WIN32 ) || defined ( _WIN64 )
+#else
+
+// Translate a clBLAS storage-order enumerator into its CBLAS counterpart.
+// Declared inline because this definition lives in a header.
+inline CBLAS_ORDER
+clblasToCblas_order(clblasOrder value)
+{
+ switch (value) {
+ case clblasRowMajor: return CblasRowMajor;
+ case clblasColumnMajor: return CblasColMajor;
+ }
+ // Not reached for the enumerators handled above. Without this return an
+ // unexpected value would flow off the end of a non-void function, which
+ // is undefined behaviour.
+ return CblasColMajor;
+}
+
+// Translate a clBLAS transposition enumerator into its CBLAS counterpart.
+// Declared inline because this definition lives in a header.
+inline CBLAS_TRANSPOSE
+clblasToCblas_operation(clblasTranspose value)
+{
+ switch (value) {
+ case clblasNoTrans: return CblasNoTrans;
+ case clblasTrans: return CblasTrans;
+ case clblasConjTrans: return CblasConjTrans;
+ }
+ // Not reached for the enumerators handled above. Without this return an
+ // unexpected value would flow off the end of a non-void function, which
+ // is undefined behaviour.
+ return CblasNoTrans;
+}
+
+// Translate a clBLAS upper/lower triangle selector into its CBLAS counterpart.
+// Declared inline because this definition lives in a header.
+inline CBLAS_UPLO
+clblasToCblas_fill(clblasUplo value)
+{
+ switch (value) {
+ case clblasUpper: return CblasUpper;
+ case clblasLower: return CblasLower;
+ }
+ // Not reached for the enumerators handled above. Without this return an
+ // unexpected value would flow off the end of a non-void function, which
+ // is undefined behaviour.
+ return CblasUpper;
+}
+
+// Translate a clBLAS left/right side selector into its CBLAS counterpart.
+// Declared inline because this definition lives in a header.
+inline CBLAS_SIDE
+clblasToCblas_side(clblasSide value)
+{
+ switch (value) {
+ case clblasLeft: return CblasLeft;
+ case clblasRight: return CblasRight;
+ }
+ // Not reached for the enumerators handled above. Without this return an
+ // unexpected value would flow off the end of a non-void function, which
+ // is undefined behaviour.
+ return CblasLeft;
+}
+
+// Translate a clBLAS unit/non-unit diagonal selector into its CBLAS counterpart.
+// Declared inline because this definition lives in a header.
+inline CBLAS_DIAG
+clblasToCblas_diag(clblasDiag value)
+{
+ switch (value) {
+ case clblasNonUnit: return CblasNonUnit;
+ case clblasUnit: return CblasUnit;
+ }
+ // Not reached for the enumerators handled above. Without this return an
+ // unexpected value would flow off the end of a non-void function, which
+ // is undefined behaviour.
+ return CblasNonUnit;
+}
+
+#endif
+
std::string
prettyPrintClStatus( const cl_int& status )
{
@@ -269,7 +325,7 @@ public:
virtual ~clblasFunc()
{
clblasTeardown();
-
+
for (unsigned int i = 0; i < numQueues; i++) {
OPENCL_V_THROW( clReleaseCommandQueue(queues_[i]), "releasing command queue" );
}
@@ -278,21 +334,21 @@ public:
void wait_and_check()
{
- cl_int err;
+ cl_int err;
cl_int wait_status = clWaitForEvents(1, &event_);
if( wait_status != CL_SUCCESS )
{
- if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST )
- {
- clGetEventInfo( event_, CL_EVENT_COMMAND_EXECUTION_STATUS,
+ if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST )
+ {
+ clGetEventInfo( event_, CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof(cl_int), &err, NULL );
- std::cout << "blas function execution status error: " << err << std::endl;
+ std::cout << "blas function execution status error: " << err << std::endl;
exit(1);
- }
+ }
else
{
- std::cout << "blas function wait status error: " << wait_status << std::endl;
+ std::cout << "blas function wait status error: " << wait_status << std::endl;
exit(1);
}
}
@@ -300,14 +356,16 @@ public:
double time_in_ns()
{
- StatisticalTimer& timer = StatisticalTimer::getInstance( );
+ StatisticalTimer& timer = StatisticalTimer::getInstance( );
return timer.getAverageTime( timer_id ) * 1e9;
}
+ virtual void validate_with_cblas(int v) {}
+
virtual void call_func() = 0;
virtual double gflops() = 0;
virtual std::string gflops_formula() = 0;
- virtual void setup_apiCallCount(cl_uint apiCallCount){}
+ virtual void setup_apiCallCount(cl_uint apiCallCount){}
virtual void setup_buffer(int order_option, int side_option,
int uplo_option, int diag_option, int
transA_option, int transB_option,
@@ -317,20 +375,20 @@ public:
virtual void initialize_cpu_buffer() = 0;
virtual void initialize_gpu_buffer() = 0;
virtual void reset_gpu_write_buffer() = 0;
- virtual void read_gpu_buffer() = 0;
- virtual void roundtrip_func() = 0;
- virtual void roundtrip_func_rect() {}
- virtual void allochostptr_roundtrip_func() {}
- virtual void usehostptr_roundtrip_func() {}
- virtual void copyhostptr_roundtrip_func() {}
- virtual void usepersismem_roundtrip_func() {}
- virtual void roundtrip_setup_buffer(int order_option, int side_option,
+ virtual void read_gpu_buffer() = 0;
+ virtual void roundtrip_func() = 0;
+ virtual void roundtrip_func_rect() {}
+ virtual void allochostptr_roundtrip_func() {}
+ virtual void usehostptr_roundtrip_func() {}
+ virtual void copyhostptr_roundtrip_func() {}
+ virtual void usepersismem_roundtrip_func() {}
+ virtual void roundtrip_setup_buffer(int order_option, int side_option,
int uplo_option, int diag_option, int
transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda,
size_t ldb, size_t ldc, size_t offA, size_t offBX,
size_t offCY, double alpha, double beta) = 0;
- virtual void releaseGPUBuffer_deleteCPUBuffer()=0;
+ virtual void releaseGPUBuffer_deleteCPUBuffer()=0;
StatisticalTimer& timer;
StatisticalTimer::sTimerID timer_id;
@@ -347,7 +405,7 @@ protected:
clblasOrder order_;
cl_event event_;
size_t maxMemAllocSize;
+ int validate_;
}; // class clblasFunc
#endif // ifndef CLBLAS_BENCHMARK_COMMON_HXX__
-
diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp
index 8efaf63..57c283d 100644
--- a/src/client/clfunc_xgemm.hpp
+++ b/src/client/clfunc_xgemm.hpp
@@ -43,14 +43,16 @@ struct xGemmBuffer
T* a_;
T* b_;
T* c_;
+ T* c_copy;
cl_mem buf_a_;
cl_mem buf_b_;
cl_mem buf_c_;
T alpha_;
T beta_;
- cl_uint apiCallCount;
+ cl_uint apiCallCount;
}; // struct buffer
+
template <typename T>
class xGemm : public clblasFunc
{
@@ -68,20 +70,37 @@ public:
void call_func()
{
- timer.Start(timer_id);
- xGemm_Function(true, buffer_.apiCallCount);
- timer.Stop(timer_id);
+ timer.Start(timer_id);
+ xGemm_Function(true, buffer_.apiCallCount);
+ timer.Stop(timer_id);
+ }
+
+
+ // When `validate` is non-zero: refill the host and device buffers, run a
+ // single GEMM call on the GPU, read C back and call validation().
+ // The whole body is compiled out on Windows.
+ void validate_with_cblas(int validate)
+ {
+ #if !( defined ( _WIN32 ) || defined ( _WIN64 ) )
+ if(!validate)
+ {
+ return;
+ }
+ initialize_cpu_buffer();
+ initialize_gpu_buffer();
+ xGemm_Function(true, 1);
+ read_gpu_buffer();
+ validation();
+ #endif
+ }
+
double gflops()
{
- return (2.0*buffer_.m_*buffer_.n_*buffer_.k_) / (time_in_ns() / buffer_.apiCallCount);
+ return (2.0*buffer_.m_*buffer_.n_*buffer_.k_) / (time_in_ns() / buffer_.apiCallCount);
}
- void setup_apiCallCount(cl_uint apiCallCount)
- {
- buffer_.apiCallCount = apiCallCount;
- }
+ void setup_apiCallCount(cl_uint apiCallCount)
+ {
+ buffer_.apiCallCount = apiCallCount;
+ }
std::string gflops_formula()
{
return "2.0*M*N*K/time";
@@ -322,6 +341,7 @@ public:
buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_ ];
+ buffer_.c_copy = new T[buffer_.ldc_*buffer_.c_num_vectors_ ];
cl_int err;
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
@@ -366,7 +386,7 @@ public:
{
for (size_t j = 0; j < buffer_.ldc_; ++j)
{
- buffer_.c_[i*buffer_.ldc_+j] = random<T>(UPPER_BOUND<T>()) /
+ buffer_.c_copy[i*buffer_.ldc_+j] = buffer_.c_[i*buffer_.ldc_+j] = random<T>(UPPER_BOUND<T>()) /
randomScale<T>();
}
}
@@ -375,7 +395,7 @@ public:
void initialize_gpu_buffer()
{
- cl_int err;
+ cl_int err;
err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
buffer_.offA_ * sizeof(T),
@@ -408,19 +428,19 @@ public:
buffer_.c_, 0, NULL, NULL);
}
- void read_gpu_buffer()
- {
- cl_int err;
- err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+ void read_gpu_buffer()
+ {
+ cl_int err;
+ err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
sizeof(T),
- buffer_.c_, 0, NULL, NULL);
- }
+ buffer_.c_, 0, NULL, NULL);
+ }
- void roundtrip_func()
- {
- timer.Start(timer_id);
- cl_int err;
+ void roundtrip_func()
+ {
+ timer.Start(timer_id);
+ cl_int err;
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
(buffer_.lda_*buffer_.a_num_vectors_ +
buffer_.offA_) * sizeof(T),
@@ -452,42 +472,42 @@ public:
buffer_.ldc_ * buffer_.c_num_vectors_ *
sizeof(T),
buffer_.c_, 0, NULL, NULL);
- xGemm_Function(false);
- err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+ xGemm_Function(false);
+ err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
sizeof(T),
- buffer_.c_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
- }
- void roundtrip_func_rect()
- {
- timer.Start(timer_id);
- cl_int err;
- //rect
- size_t a_buffer_origin[3] = {0,0,0};
- size_t a_host_origin[3] = {0,0,0};
- size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1};
- size_t a_buffer_row_pitch=0*sizeof(T);//lda
- size_t a_buffer_slice_pitch=0;
- size_t a_host_row_pitch=buffer_.lda_*sizeof(T);
- size_t a_host_slice_pitch=0;
-
- size_t b_buffer_origin[3] = {0,0,0};
- size_t b_host_origin[3] = {0,0,0};
- size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1};
- size_t b_buffer_row_pitch=0*sizeof(T);//ldb
- size_t b_buffer_slice_pitch=0;
- size_t b_host_row_pitch=buffer_.ldb_*sizeof(T);
- size_t b_host_slice_pitch=0;
-
- size_t c_buffer_origin[3] = {0,0,0};
- size_t c_host_origin[3] = {0,0,0};
- size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1};
- size_t c_buffer_row_pitch=0*sizeof(T);//ldc
- size_t c_buffer_slice_pitch=0;
- size_t c_host_row_pitch=buffer_.ldc_*sizeof(T);
- size_t c_host_slice_pitch=0;
+ buffer_.c_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void roundtrip_func_rect()
+ {
+ timer.Start(timer_id);
+ cl_int err;
+ //rect
+ size_t a_buffer_origin[3] = {0,0,0};
+ size_t a_host_origin[3] = {0,0,0};
+ size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1};
+ size_t a_buffer_row_pitch=0*sizeof(T);//lda
+ size_t a_buffer_slice_pitch=0;
+ size_t a_host_row_pitch=buffer_.lda_*sizeof(T);
+ size_t a_host_slice_pitch=0;
+
+ size_t b_buffer_origin[3] = {0,0,0};
+ size_t b_host_origin[3] = {0,0,0};
+ size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1};
+ size_t b_buffer_row_pitch=0*sizeof(T);//ldb
+ size_t b_buffer_slice_pitch=0;
+ size_t b_host_row_pitch=buffer_.ldb_*sizeof(T);
+ size_t b_host_slice_pitch=0;
+
+ size_t c_buffer_origin[3] = {0,0,0};
+ size_t c_host_origin[3] = {0,0,0};
+ size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1};
+ size_t c_buffer_row_pitch=0*sizeof(T);//ldc
+ size_t c_buffer_slice_pitch=0;
+ size_t c_host_row_pitch=buffer_.ldc_*sizeof(T);
+ size_t c_host_slice_pitch=0;
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
(buffer_.k_*buffer_.m_ +
@@ -504,12 +524,12 @@ public:
buffer_.offC_) * sizeof(T),
NULL, &err);
/*
- err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
+ err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
buffer_.offA_ * sizeof(T),
buffer_.lda_ * buffer_.a_num_vectors_ *
sizeof(T),
buffer_.a_, 0, NULL, NULL);
-
+
err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
buffer_.offB_ * sizeof(T),
buffer_.ldb_ * buffer_.b_num_vectors_ *
@@ -522,47 +542,47 @@ public:
sizeof(T),
buffer_.c_, 0, NULL, NULL);*/
err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_a_, CL_TRUE, a_buffer_origin, a_host_origin, a_region, a_buffer_row_pitch,
- a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL);
+ a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL);
err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_b_, CL_TRUE, b_buffer_origin, b_host_origin, b_region, b_buffer_row_pitch,
- b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL);
+ b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL);
err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
- c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL);
-
- if(buffer_.trans_a_==clblasNoTrans)
- {
- buffer_.lda_=buffer_.m_;
- }
- else
- {
- buffer_.lda_=buffer_.k_;
- }
- if(buffer_.trans_b_==clblasNoTrans)
- {
- buffer_.ldb_=buffer_.k_;
- }
- else
- {
- buffer_.ldb_=buffer_.n_;
- }
- buffer_.ldc_=buffer_.m_;
- xGemm_Function(false);
- /*
- err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+ c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL);
+
+ if(buffer_.trans_a_==clblasNoTrans)
+ {
+ buffer_.lda_=buffer_.m_;
+ }
+ else
+ {
+ buffer_.lda_=buffer_.k_;
+ }
+ if(buffer_.trans_b_==clblasNoTrans)
+ {
+ buffer_.ldb_=buffer_.k_;
+ }
+ else
+ {
+ buffer_.ldb_=buffer_.n_;
+ }
+ buffer_.ldc_=buffer_.m_;
+ xGemm_Function(false);
+ /*
+ err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
sizeof(T),
- buffer_.c_, 0, NULL, &event_);
- */
- err = ::clEnqueueReadBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
- c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
- }
- void allochostptr_roundtrip_func()
- {
- timer.Start(timer_id);
-
- cl_int err;
- // Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy
+ buffer_.c_, 0, NULL, &event_);
+ */
+ err = ::clEnqueueReadBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
+ c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void allochostptr_roundtrip_func()
+ {
+ timer.Start(timer_id);
+
+ cl_int err;
+ // Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
(buffer_.lda_*buffer_.a_num_vectors_ +
buffer_.offA_) * sizeof(T),
@@ -578,45 +598,45 @@ public:
buffer_.offC_) * sizeof(T),
NULL, &err);
- // map the buffers to pointers at host device
- T *map_a,*map_b,*map_c;
- map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
- (buffer_.lda_*buffer_.a_num_vectors_ +
+ // map the buffers to pointers at host device
+ T *map_a,*map_b,*map_c;
+ map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
buffer_.offA_) * sizeof(T),
- 0, NULL, NULL, &err);
- map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
- (buffer_.ldb_*buffer_.b_num_vectors_ +
+ 0, NULL, NULL, &err);
+ map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldb_*buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(T),
- 0, NULL, NULL, &err);
- map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0,
- (buffer_.lda_*buffer_.c_num_vectors_ +
+ 0, NULL, NULL, &err);
+ map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.lda_*buffer_.c_num_vectors_ +
buffer_.offC_) * sizeof(T),
- 0, NULL, NULL, &err);
- // memcpy the input A, B, C to the host pointers
- memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
- memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
- memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
- // unmap the buffers
- clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL);
- clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL);
- clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL);
- // calling clBLAS
- xGemm_Function(false);
- // map the C buffer again to read output
- map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0,
- (buffer_.lda_*buffer_.c_num_vectors_ +
+ 0, NULL, NULL, &err);
+ // memcpy the input A, B, C to the host pointers
+ memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+ memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+ memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+ // unmap the buffers
+ clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL);
+ // calling clBLAS
+ xGemm_Function(false);
+ // map the C buffer again to read output
+ map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0,
+ (buffer_.lda_*buffer_.c_num_vectors_ +
buffer_.offC_) * sizeof(T),
- 0, NULL, NULL, &err);
- memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
- clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
-
- timer.Stop(timer_id);
- }
- void usehostptr_roundtrip_func()
- {
- timer.Start(timer_id);
- cl_int err;
+ 0, NULL, NULL, &err);
+ memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+ clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+
+ timer.Stop(timer_id);
+ }
+ void usehostptr_roundtrip_func()
+ {
+ timer.Start(timer_id);
+ cl_int err;
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
(buffer_.lda_*buffer_.a_num_vectors_ +
buffer_.offA_) * sizeof(T),
@@ -631,13 +651,13 @@ public:
(buffer_.ldc_ * buffer_.c_num_vectors_ +
buffer_.offC_) * sizeof(T),
buffer_.c_, &err);
- xGemm_Function(true);
- timer.Stop(timer_id);
- }
- void copyhostptr_roundtrip_func()
- {
- timer.Start(timer_id);
- cl_int err;
+ xGemm_Function(true);
+ timer.Stop(timer_id);
+ }
+ void copyhostptr_roundtrip_func()
+ {
+ timer.Start(timer_id);
+ cl_int err;
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
(buffer_.lda_*buffer_.a_num_vectors_ +
buffer_.offA_) * sizeof(T),
@@ -652,20 +672,20 @@ public:
(buffer_.ldc_ * buffer_.c_num_vectors_ +
buffer_.offC_) * sizeof(T),
buffer_.c_, &err);
- xGemm_Function(false);
- err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+ xGemm_Function(false);
+ err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
sizeof(T),
- buffer_.c_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
- }
- void usepersismem_roundtrip_func()
- {
+ buffer_.c_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void usepersismem_roundtrip_func()
+ {
#if defined(CL_MEM_USE_PERSISTENT_MEM_AMD)
- timer.Start(timer_id);
+ timer.Start(timer_id);
- cl_int err;
+ cl_int err;
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
(buffer_.lda_*buffer_.a_num_vectors_ +
@@ -682,46 +702,46 @@ public:
buffer_.offC_) * sizeof(T),
NULL, &err);
- // map the buffers to pointers at host devices
- T *map_a,*map_b,*map_c;
- map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
- (buffer_.lda_*buffer_.a_num_vectors_ +
+ // map the buffers to pointers at host devices
+ T *map_a,*map_b,*map_c;
+ map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
buffer_.offA_) * sizeof(T),
- 0, NULL, NULL, &err);
- map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
- (buffer_.ldb_*buffer_.b_num_vectors_ +
+ 0, NULL, NULL, &err);
+ map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldb_*buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(T),
- 0, NULL, NULL, &err);
- map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0,
- (buffer_.lda_*buffer_.c_num_vectors_ +
+ 0, NULL, NULL, &err);
+ map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.lda_*buffer_.c_num_vectors_ +
buffer_.offC_) * sizeof(T),
- 0, NULL, NULL, &err);
- // memcpy the input A, B, C to the host pointers
- memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
- memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
- memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
- // unmap the buffers
- clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL);
- clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL);
- clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL);
- // calling clBLAS
- xGemm_Function(false);
- // map the C buffer again to read output
- map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0,
- (buffer_.lda_*buffer_.c_num_vectors_ +
+ 0, NULL, NULL, &err);
+ // memcpy the input A, B, C to the host pointers
+ memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+ memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+ memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+ // unmap the buffers
+ clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL);
+ // calling clBLAS
+ xGemm_Function(false);
+ // map the C buffer again to read output
+ map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0,
+ (buffer_.lda_*buffer_.c_num_vectors_ +
buffer_.offC_) * sizeof(T),
- 0, NULL, NULL, &err);
- memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
- clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
+ 0, NULL, NULL, &err);
+ memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+ clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ timer.Stop(timer_id);
#else
- std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<<std::endl;
+ std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<<std::endl;
#endif
- }
- void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+ }
+ void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
size_t ldc, size_t offA, size_t offBX, size_t offCY,
@@ -958,20 +978,21 @@ public:
buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_ ];
}
- void releaseGPUBuffer_deleteCPUBuffer()
- {
- //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
- //need to do this before we eventually hit the destructor
- delete buffer_.a_;
- delete buffer_.b_;
- delete buffer_.c_;
+ // Frees the host-side matrices and releases the three OpenCL buffers.
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ // The host arrays are allocated with new T[...], so they must be
+ // released with delete[]; scalar delete on them is undefined behaviour.
+ // NOTE(review): confirm every setup path allocates (or nulls) c_copy
+ // before this function runs.
+ delete[] buffer_.a_;
+ delete[] buffer_.b_;
+ delete[] buffer_.c_;
+ delete[] buffer_.c_copy;
 OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
 "releasing buffer A");
 OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
 "releasing buffer B");
 OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
 "releasing buffer C");
- }
+ }
protected:
void initialize_scalars(double alpha, double beta)
@@ -982,33 +1003,37 @@ protected:
private:
xGemmBuffer<T> buffer_;
- void xGemm_Function(bool flush, cl_uint apiCallCount = 1);
- unsigned int numQueuesToUse;
- cl_event events_[numQueues];
+ void xGemm_Function(bool flush, cl_uint apiCallCount = 1);
+ unsigned int numQueuesToUse;
+ cl_event events_[numQueues];
+#if defined ( _WIN32 ) || defined ( _WIN64 )
+#else
+ void validation();
+#endif
}; // class xgemm
template<>
-void
+void
xGemm<cl_float>::
xGemm_Function(bool flush, cl_uint apiCallCount )
{
for (unsigned int i = 0; i < numQueues; i++) {
events_[i] = NULL;
}
- for (unsigned int i = 0; i < apiCallCount; i++)
- {
- clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
- buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
- buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_);
- }
- //flush==true if only the kernel time (library call) is timed
- //flush==false if memory time is also timed
- if (flush==true)
- {
+ for (unsigned int i = 0; i < apiCallCount; i++)
+ {
+ clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+ buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+ buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+ buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+ buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_);
+ }
+ //flush==true if only the kernel time (library call) is timed
+ //flush==false if memory time is also timed
+ if (flush==true)
+ {
// check if any valid events returned
cl_uint numValidEvents = 0;
for (unsigned int i = 0; i < numQueuesToUse; i++) {
@@ -1025,16 +1050,16 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
//printf("events[%u/%u] is NULL\n", i, numQueuesToUse );
}
}
-
+
for (unsigned int i = 0; i < numQueuesToUse; i++) {
clFlush(queues_[i]);
}
- clWaitForEvents(numValidEvents, events_);
- }
+ clWaitForEvents(numValidEvents, events_);
+ }
}
template<>
-void
+void
xGemm<cl_double>::
xGemm_Function(bool flush, cl_uint apiCallCount )
{
@@ -1042,18 +1067,18 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
events_[i] = NULL;
}
for (unsigned int i = 0; i < apiCallCount; i++)
- {
- clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+ {
+ clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_);
}
- //flush==true if only the kernel time (library call) is timed
- //flush==false if memory time is also timed
- if (flush==true)
- {
+ //flush==true if only the kernel time (library call) is timed
+ //flush==false if memory time is also timed
+ if (flush==true)
+ {
// check if any valid events returned
cl_uint numValidEvents = 0;
for (unsigned int i = 0; i < numQueuesToUse; i++) {
@@ -1070,16 +1095,16 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
//printf("events[%u/%u] is NULL\n", i, numQueuesToUse );
}
}
-
+
for (unsigned int i = 0; i < numQueuesToUse; i++) {
clFlush(queues_[i]);
}
- clWaitForEvents(numValidEvents, events_);
- }
+ clWaitForEvents(numValidEvents, events_);
+ }
}
template<>
-void
+void
xGemm<cl_float2>::
xGemm_Function(bool flush, cl_uint apiCallCount )
{
@@ -1087,18 +1112,18 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
events_[i] = NULL;
}
for (unsigned int i = 0; i < apiCallCount; i++)
- {
- clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+ {
+ clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_);
}
- //flush==true if only the kernel time (library call) is timed
- //flush==false if memory time is also timed
- if (flush==true)
- {
+ //flush==true if only the kernel time (library call) is timed
+ //flush==false if memory time is also timed
+ if (flush==true)
+ {
// check if any valid events returned
cl_uint numValidEvents = 0;
for (unsigned int i = 0; i < numQueuesToUse; i++) {
@@ -1115,16 +1140,16 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
//printf("events[%u/%u] is NULL\n", i, numQueuesToUse );
}
}
-
+
for (unsigned int i = 0; i < numQueuesToUse; i++) {
clFlush(queues_[i]);
}
- clWaitForEvents(numValidEvents, events_);
- }
+ clWaitForEvents(numValidEvents, events_);
+ }
}
template<>
-void
+void
xGemm<cl_double2>::
xGemm_Function(bool flush, cl_uint apiCallCount )
{
@@ -1132,18 +1157,18 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
events_[i] = NULL;
}
for (unsigned int i = 0; i < apiCallCount; i++)
- {
- clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+ {
+ clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_);
}
- //flush==true if only the kernel time (library call) is timed
- //flush==false if memory time is also timed
- if (flush==true)
- {
+ //flush==true if only the kernel time (library call) is timed
+ //flush==false if memory time is also timed
+ if (flush==true)
+ {
// check if any valid events returned
cl_uint numValidEvents = 0;
for (unsigned int i = 0; i < numQueuesToUse; i++) {
@@ -1164,8 +1189,8 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
clFlush(queues_[i]);
}
- clWaitForEvents(numValidEvents, events_);
- }
+ clWaitForEvents(numValidEvents, events_);
+ }
}
template<>
@@ -1200,4 +1225,84 @@ gflops_formula()
return "8.0*M*N*K/time";
}
+#if defined ( _WIN32 ) || defined (_WIN64 )
+
+#else
+
+// CPU reference check for SGEMM: recomputes the product into c_copy with cblas_sgemm and prints
+// ||c_copy - c_|| / ||c_|| (c_ is presumably the result read back from the GPU - confirm in caller).
+template<>
+void xGemm<cl_float>::validation()
+{
+    // C's storage is ldc_ * (n_ columns if column-major, m_ rows if row-major); lda_ belongs to A.
+    const int c_elems = static_cast<int>(buffer_.ldc_ * ((order_ == clblasColumnMajor) ? buffer_.n_ : buffer_.m_));
+    // NOTE(review): cblas works at a_+offA_, b_+offB_, c_copy+offC_ but the comparison starts at element 0 - confirm host layout for non-zero offsets.
+    cblas_sgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_),
+                buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+                buffer_.a_ + buffer_.offA_, buffer_.lda_, buffer_.b_ + buffer_.offB_, buffer_.ldb_,
+                buffer_.beta_, buffer_.c_copy + buffer_.offC_, buffer_.ldc_);
+    // c_copy <- c_copy - c_
+    cblas_saxpy(c_elems, -1.0, buffer_.c_, 1, buffer_.c_copy, 1);
+    float norm_error = cblas_snrm2(c_elems, buffer_.c_copy, 1) / cblas_snrm2(c_elems, buffer_.c_, 1);
+    printf("Error of clblas_sgemm against cblas_sgemm = %f \n", norm_error);
+}
+
+// CPU reference check for DGEMM: recomputes the product into c_copy with cblas_dgemm and prints
+// ||c_copy - c_|| / ||c_|| (c_ is presumably the result read back from the GPU - confirm in caller).
+template<>
+void xGemm<cl_double>::validation()
+{
+    // C's storage is ldc_ * (n_ columns if column-major, m_ rows if row-major); lda_ belongs to A.
+    const int c_elems = static_cast<int>(buffer_.ldc_ * ((order_ == clblasColumnMajor) ? buffer_.n_ : buffer_.m_));
+    // NOTE(review): cblas works at a_+offA_, b_+offB_, c_copy+offC_ but the comparison starts at element 0 - confirm host layout for non-zero offsets.
+    cblas_dgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_),
+                buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+                buffer_.a_ + buffer_.offA_, buffer_.lda_, buffer_.b_ + buffer_.offB_, buffer_.ldb_,
+                buffer_.beta_, buffer_.c_copy + buffer_.offC_, buffer_.ldc_);
+    // c_copy <- c_copy - c_
+    cblas_daxpy(c_elems, -1.0, buffer_.c_, 1, buffer_.c_copy, 1);
+    double norm_error = cblas_dnrm2(c_elems, buffer_.c_copy, 1) / cblas_dnrm2(c_elems, buffer_.c_, 1);
+    printf("Error of clblas_dgemm against cblas_dgemm = %f \n", norm_error);
+}
+
+// CPU reference check for CGEMM: recomputes the product into c_copy with cblas_cgemm and prints
+// ||c_copy - c_|| / ||c_|| (c_ is presumably the result read back from the GPU - confirm in caller).
+template<>
+void xGemm<cl_float2>::validation()
+{
+    // C's storage is ldc_ * (n_ columns if column-major, m_ rows if row-major); lda_ belongs to A.
+    const int c_elems = static_cast<int>(buffer_.ldc_ * ((order_ == clblasColumnMajor) ? buffer_.n_ : buffer_.m_));
+    // NOTE(review): cblas works at a_+offA_, b_+offB_, c_copy+offC_ but the comparison starts at element 0 - confirm host layout for non-zero offsets.
+    cblas_cgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_),
+                buffer_.m_, buffer_.n_, buffer_.k_, &(buffer_.alpha_),
+                buffer_.a_ + buffer_.offA_, buffer_.lda_, buffer_.b_ + buffer_.offB_, buffer_.ldb_,
+                &(buffer_.beta_), buffer_.c_copy + buffer_.offC_, buffer_.ldc_);
+    // c_copy <- c_copy - c_ (complex axpy takes its scalar by pointer)
+    cl_float2 neg_one = makeScalar<cl_float2>(-1.0);
+    cblas_caxpy(c_elems, &neg_one, buffer_.c_, 1, buffer_.c_copy, 1);
+    float norm_error = cblas_scnrm2(c_elems, buffer_.c_copy, 1) / cblas_scnrm2(c_elems, buffer_.c_, 1);
+    printf("Error of clblas_cgemm against cblas_cgemm = %f \n", norm_error);
+}
+
+// CPU reference check for ZGEMM: recomputes the product into c_copy with cblas_zgemm and prints
+// ||c_copy - c_|| / ||c_|| (c_ is presumably the result read back from the GPU - confirm in caller).
+template<>
+void xGemm<cl_double2>::validation()
+{
+    // C's storage is ldc_ * (n_ columns if column-major, m_ rows if row-major); lda_ belongs to A.
+    const int c_elems = static_cast<int>(buffer_.ldc_ * ((order_ == clblasColumnMajor) ? buffer_.n_ : buffer_.m_));
+    // NOTE(review): cblas works at a_+offA_, b_+offB_, c_copy+offC_ but the comparison starts at element 0 - confirm host layout for non-zero offsets.
+    cblas_zgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_),
+                buffer_.m_, buffer_.n_, buffer_.k_, &(buffer_.alpha_),
+                buffer_.a_ + buffer_.offA_, buffer_.lda_, buffer_.b_ + buffer_.offB_, buffer_.ldb_,
+                &(buffer_.beta_), buffer_.c_copy + buffer_.offC_, buffer_.ldc_);
+    // c_copy <- c_copy - c_ (complex axpy takes its scalar by pointer)
+    cl_double2 neg_one = makeScalar<cl_double2>(-1.0);
+    cblas_zaxpy(c_elems, &neg_one, buffer_.c_, 1, buffer_.c_copy, 1);
+    double norm_error = cblas_dznrm2(c_elems, buffer_.c_copy, 1) / cblas_dznrm2(c_elems, buffer_.c_, 1);
+    printf("Error of clblas_zgemm against cblas_zgemm = %f \n", norm_error);
+}
+
+#endif
+
#endif // ifndef CLBLAS_BENCHMARK_XGEMM_HXX__
diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp
index 92d883c..0cd1ff4 100644
--- a/src/client/clfunc_xtrmm.hpp
+++ b/src/client/clfunc_xtrmm.hpp
@@ -40,6 +40,7 @@ struct xTrmmBuffer
clblasDiag diag_;
T* a_;
T* b_;
+ T* b_copy;
cl_mem buf_a_;
cl_mem buf_b_;
T alpha_;
@@ -64,6 +65,23 @@ public:
std::cout << "xtrmm::call_func\n";
}
+
+    // When `validate` is non-zero, runs one init/upload/call/read-back pass and checks it
+    // with validation(); the body is compiled out on Windows builds.
+    void validate_with_cblas(int validate)
+    {
+#if !defined ( _WIN32 ) && !defined ( _WIN64 )
+        if (validate == 0)
+            return;
+        initialize_cpu_buffer();
+        initialize_gpu_buffer();
+        call_func();
+        read_gpu_buffer();
+        validation();
+#endif
+    }
+
+
double gflops()
{
if (buffer_.side_ == clblasLeft)
@@ -225,6 +243,7 @@ public:
buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+ buffer_.b_copy = new T[buffer_.ldb_*buffer_.b_num_vectors_];
cl_int err;
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
@@ -246,7 +265,7 @@ public:
{
for (size_t j = 0; j < buffer_.ldb_; ++j)
{
- buffer_.b_[i*buffer_.ldb_+j] = random<T>(UPPER_BOUND<T>()) /
+ buffer_.b_copy[i*buffer_.ldb_+j] = buffer_.b_[i*buffer_.ldb_+j] = random<T>(UPPER_BOUND<T>()) /
randomScale<T>();
}
}
@@ -294,29 +313,29 @@ public:
sizeof(T),
buffer_.b_, 0, NULL, NULL);
}
- void read_gpu_buffer()
- {
- cl_int err;
- err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(T),
- buffer_.b_, 0, NULL, NULL);
- }
- void roundtrip_func()
- {
- std::cout << "xTrmm::roundtrip_func\n";
- }
- void zerocopy_roundtrip_func()
- {
- std::cout << "xTrmm::zerocopy_roundtrip_func\n";
- }
- void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+ void read_gpu_buffer()
+ {
+ cl_int err;
+ err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+ sizeof(T),
+ buffer_.b_, 0, NULL, NULL);
+ }
+ void roundtrip_func()
+ {
+ std::cout << "xTrmm::roundtrip_func\n";
+ }
+ void zerocopy_roundtrip_func()
+ {
+ std::cout << "xTrmm::zerocopy_roundtrip_func\n";
+ }
+ void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
- {
- DUMMY_ARGS_USAGE_3(transB_option, K, beta);
+ {
+ DUMMY_ARGS_USAGE_3(transB_option, K, beta);
DUMMY_ARGS_USAGE_2(ldc, offCY);
initialize_scalars(alpha, beta);
@@ -447,18 +466,20 @@ public:
buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
- }
- void releaseGPUBuffer_deleteCPUBuffer()
- {
- //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
- //need to do this before we eventually hit the destructor
+ }
+
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
delete buffer_.a_;
delete buffer_.b_;
+ delete buffer_.b_copy;
OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_),
"releasing buffer A");
OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_),
"releasing buffer B");
- }
+ }
protected:
void initialize_scalars(double alpha, double beta)
{
@@ -468,7 +489,10 @@ protected:
private:
xTrmmBuffer<T> buffer_;
-
+#if defined ( _WIN32 ) || defined ( _WIN64 )
+#else
+ void validation();
+#endif
}; // class xTrmm
template<>
@@ -494,9 +518,9 @@ void
xTrmm<cl_float>::
roundtrip_func()
{
- timer.Start(timer_id);
- cl_int err;
- //set up buffer
+ timer.Start(timer_id);
+ cl_int err;
+ //set up buffer
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
(buffer_.lda_ * buffer_.a_num_vectors_ +
buffer_.offA_) * sizeof(cl_float),
@@ -506,8 +530,8 @@ roundtrip_func()
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_float),
NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
+ //initialize gpu buffer
+ err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
buffer_.offA_ * sizeof(cl_float),
buffer_.lda_ * buffer_.a_num_vectors_ *
sizeof(cl_float),
@@ -518,20 +542,20 @@ roundtrip_func()
buffer_.ldb_ *buffer_.b_num_vectors_ *
sizeof(cl_float),
buffer_.b_, 0, NULL, NULL);
- //call_func
- clblasStrmm(order_, buffer_.side_, buffer_.uplo_,
+ //call_func
+ clblasStrmm(order_, buffer_.side_, buffer_.uplo_,
buffer_.trans_a_, buffer_.diag_,
buffer_.m_, buffer_.n_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
numQueues, queues_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ *
+ //read gpu buffer
+ err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ *
sizeof(cl_float),
- buffer_.b_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ buffer_.b_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
}
@@ -558,9 +582,9 @@ void
xTrmm<cl_double>::
roundtrip_func()
{
- timer.Start(timer_id);
- cl_int err;
- //set up buffer
+ timer.Start(timer_id);
+ cl_int err;
+ //set up buffer
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
(buffer_.lda_ * buffer_.a_num_vectors_ +
buffer_.offA_) * sizeof(cl_double),
@@ -570,8 +594,8 @@ roundtrip_func()
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_double),
NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
+ //initialize gpu buffer
+ err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
buffer_.offA_ * sizeof(cl_double),
buffer_.lda_ * buffer_.a_num_vectors_ *
sizeof(cl_double),
@@ -582,20 +606,20 @@ roundtrip_func()
buffer_.ldb_ *buffer_.b_num_vectors_ *
sizeof(cl_double),
buffer_.b_, 0, NULL, NULL);
- //call_func
- clblasDtrmm(order_, buffer_.side_, buffer_.uplo_,
+ //call_func
+ clblasDtrmm(order_, buffer_.side_, buffer_.uplo_,
buffer_.trans_a_, buffer_.diag_,
buffer_.m_, buffer_.n_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
numQueues, queues_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ *
+ //read gpu buffer
+ err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ *
sizeof(cl_double),
- buffer_.b_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ buffer_.b_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
}
@@ -622,9 +646,9 @@ void
xTrmm<cl_float2>::
roundtrip_func()
{
- timer.Start(timer_id);
- cl_int err;
- //set up buffer
+ timer.Start(timer_id);
+ cl_int err;
+ //set up buffer
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
(buffer_.lda_ * buffer_.a_num_vectors_ +
buffer_.offA_) * sizeof(cl_float2),
@@ -634,8 +658,8 @@ roundtrip_func()
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_float2),
NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
+ //initialize gpu buffer
+ err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
buffer_.offA_ * sizeof(cl_float2),
buffer_.lda_ * buffer_.a_num_vectors_ *
sizeof(cl_float2),
@@ -646,20 +670,20 @@ roundtrip_func()
buffer_.ldb_ *buffer_.b_num_vectors_ *
sizeof(cl_float2),
buffer_.b_, 0, NULL, NULL);
- //call_func
- clblasCtrmm(order_, buffer_.side_, buffer_.uplo_,
+ //call_func
+ clblasCtrmm(order_, buffer_.side_, buffer_.uplo_,
buffer_.trans_a_, buffer_.diag_,
buffer_.m_, buffer_.n_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
numQueues, queues_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ *
+ //read gpu buffer
+ err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ *
sizeof(cl_float2),
- buffer_.b_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ buffer_.b_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
}
@@ -686,9 +710,9 @@ void
xTrmm<cl_double2>::
roundtrip_func()
{
- timer.Start(timer_id);
- cl_int err;
- //set up buffer
+ timer.Start(timer_id);
+ cl_int err;
+ //set up buffer
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
(buffer_.lda_ * buffer_.a_num_vectors_ +
buffer_.offA_) * sizeof(cl_double2),
@@ -698,8 +722,8 @@ roundtrip_func()
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_double2),
NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
+ //initialize gpu buffer
+ err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
buffer_.offA_ * sizeof(cl_double2),
buffer_.lda_ * buffer_.a_num_vectors_ *
sizeof(cl_double2),
@@ -710,20 +734,20 @@ roundtrip_func()
buffer_.ldb_ *buffer_.b_num_vectors_ *
sizeof(cl_double2),
buffer_.b_, 0, NULL, NULL);
- //call_func
- clblasZtrmm(order_, buffer_.side_, buffer_.uplo_,
+ //call_func
+ clblasZtrmm(order_, buffer_.side_, buffer_.uplo_,
buffer_.trans_a_, buffer_.diag_,
buffer_.m_, buffer_.n_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
numQueues, queues_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ *
+ //read gpu buffer
+ err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ *
sizeof(cl_double2),
- buffer_.b_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ buffer_.b_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
}
@@ -790,5 +814,89 @@ gflops_formula()
}
}
+#if defined ( _WIN32 ) || defined ( _WIN64 )
+#else
+
+// CPU reference check for STRMM: applies cblas_strmm to b_copy (the saved input B) and prints
+// ||b_copy - b_|| / ||b_||, where b_ holds the result read back from the GPU.
+template<>
+void xTrmm<cl_float>::validation()
+{
+    // Element count must match the host allocation of b_/b_copy (ldb_ * b_num_vectors_);
+    // A's leading dimension (lda_) says nothing about B's storage.
+    const int b_elems = static_cast<int>(buffer_.ldb_ * buffer_.b_num_vectors_);
+    // offA_/offB_ position the matrices inside the device buffers only; host arrays start at element 0.
+    cblas_strmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_),
+                clblasToCblas_fill(buffer_.uplo_), clblasToCblas_operation(buffer_.trans_a_),
+                clblasToCblas_diag(buffer_.diag_), buffer_.m_, buffer_.n_, buffer_.alpha_,
+                buffer_.a_, buffer_.lda_, buffer_.b_copy, buffer_.ldb_);
+    // b_copy <- b_copy - b_
+    cblas_saxpy(b_elems, -1.0, buffer_.b_, 1, buffer_.b_copy, 1);
+    float norm_error = cblas_snrm2(b_elems, buffer_.b_copy, 1) / cblas_snrm2(b_elems, buffer_.b_, 1);
+    printf("Error of clblas_strmm against cblas_strmm = %f \n", norm_error);
+}
+
+
+// CPU reference check for DTRMM: applies cblas_dtrmm to b_copy (the saved input B) and prints
+// ||b_copy - b_|| / ||b_||, where b_ holds the result read back from the GPU.
+template<>
+void xTrmm<cl_double>::validation()
+{
+    // Element count must match the host allocation of b_/b_copy (ldb_ * b_num_vectors_);
+    // A's leading dimension (lda_) says nothing about B's storage.
+    const int b_elems = static_cast<int>(buffer_.ldb_ * buffer_.b_num_vectors_);
+    // offA_/offB_ position the matrices inside the device buffers only; host arrays start at element 0.
+    cblas_dtrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_),
+                clblasToCblas_fill(buffer_.uplo_), clblasToCblas_operation(buffer_.trans_a_),
+                clblasToCblas_diag(buffer_.diag_), buffer_.m_, buffer_.n_, buffer_.alpha_,
+                buffer_.a_, buffer_.lda_, buffer_.b_copy, buffer_.ldb_);
+    // b_copy <- b_copy - b_
+    cblas_daxpy(b_elems, -1.0, buffer_.b_, 1, buffer_.b_copy, 1);
+    double norm_error = cblas_dnrm2(b_elems, buffer_.b_copy, 1) / cblas_dnrm2(b_elems, buffer_.b_, 1);
+    printf("Error of clblas_dtrmm against cblas_dtrmm = %f \n", norm_error);
+}
+
+// CPU reference check for CTRMM: applies cblas_ctrmm to b_copy (the saved input B) and prints
+// ||b_copy - b_|| / ||b_||, where b_ holds the result read back from the GPU.
+template<>
+void xTrmm<cl_float2>::validation()
+{
+    // Element count must match the host allocation of b_/b_copy (ldb_ * b_num_vectors_);
+    // A's leading dimension (lda_) says nothing about B's storage.
+    const int b_elems = static_cast<int>(buffer_.ldb_ * buffer_.b_num_vectors_);
+    // offA_/offB_ position the matrices inside the device buffers only; host arrays start at element 0.
+    cblas_ctrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_),
+                clblasToCblas_fill(buffer_.uplo_), clblasToCblas_operation(buffer_.trans_a_),
+                clblasToCblas_diag(buffer_.diag_), buffer_.m_, buffer_.n_, &(buffer_.alpha_),
+                buffer_.a_, buffer_.lda_, buffer_.b_copy, buffer_.ldb_);
+    // b_copy <- b_copy - b_ (complex axpy takes its scalar by pointer)
+    cl_float2 neg_one = makeScalar<cl_float2>(-1.0);
+    cblas_caxpy(b_elems, &neg_one, buffer_.b_, 1, buffer_.b_copy, 1);
+    float norm_error = cblas_scnrm2(b_elems, buffer_.b_copy, 1) / cblas_scnrm2(b_elems, buffer_.b_, 1);
+    printf("Error of clblas_ctrmm against cblas_ctrmm = %f \n", norm_error);
+}
+
+
+// CPU reference check for ZTRMM: applies cblas_ztrmm to b_copy (the saved input B) and prints
+// ||b_copy - b_|| / ||b_||, where b_ holds the result read back from the GPU.
+template<>
+void xTrmm<cl_double2>::validation()
+{
+    // Element count must match the host allocation of b_/b_copy (ldb_ * b_num_vectors_);
+    // A's leading dimension (lda_) says nothing about B's storage.
+    const int b_elems = static_cast<int>(buffer_.ldb_ * buffer_.b_num_vectors_);
+    // offA_/offB_ position the matrices inside the device buffers only; host arrays start at element 0.
+    cblas_ztrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_),
+                clblasToCblas_fill(buffer_.uplo_), clblasToCblas_operation(buffer_.trans_a_),
+                clblasToCblas_diag(buffer_.diag_), buffer_.m_, buffer_.n_, &(buffer_.alpha_),
+                buffer_.a_, buffer_.lda_, buffer_.b_copy, buffer_.ldb_);
+    // b_copy <- b_copy - b_ (complex axpy takes its scalar by pointer)
+    cl_double2 neg_one = makeScalar<cl_double2>(-1.0);
+    cblas_zaxpy(b_elems, &neg_one, buffer_.b_, 1, buffer_.b_copy, 1);
+    double norm_error = cblas_dznrm2(b_elems, buffer_.b_copy, 1) / cblas_dznrm2(b_elems, buffer_.b_, 1);
+    printf("Error of clblas_ztrmm against cblas_ztrmm = %f \n", norm_error);
+}
+
+#endif
#endif // ifndef CLBLAS_BENCHMARK_XTRMM_HXX__
diff --git a/src/client/client.cpp b/src/client/client.cpp
index d067c3d..ba9c5fc 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -46,547 +46,552 @@ namespace po = boost::program_options;
int main(int argc, char *argv[])
{
- size_t M;
- size_t N;
- size_t K;
- cl_double alpha;
- cl_double beta;
- cl_uint profileCount;
- cl_uint apiCallCount;
- cl_uint commandQueueFlags = 0;
- cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
- int order_option;
- //clblasOrder order;
- //clblasTranspose transA;
- //clblasTranspose transB;
- int transA_option;
- int transB_option;
- size_t lda;
- size_t ldb;
- size_t ldc;
- size_t offA;
- size_t offBX;
- size_t offCY;
- std::string function;
- std::string precision;
- std::string roundtrip;
- std::string memalloc;
- int side_option;
- int uplo_option;
- int diag_option;
- unsigned int numQueuesToUse;
+ size_t M;
+ size_t N;
+ size_t K;
+ cl_double alpha;
+ cl_double beta;
+ cl_uint profileCount;
+ cl_uint apiCallCount;
+ cl_uint commandQueueFlags = 0;
+ cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
+ int order_option;
+ //clblasOrder order;
+ //clblasTranspose transA;
+ //clblasTranspose transB;
+ int transA_option;
+ int transB_option;
+ size_t lda;
+ size_t ldb;
+ size_t ldc;
+ size_t offA;
+ size_t offBX;
+ size_t offCY;
+ std::string function;
+ std::string precision;
+ std::string roundtrip;
+ std::string memalloc;
+ int side_option;
+ int uplo_option;
+ int diag_option;
+ unsigned int numQueuesToUse;
+ int validate;
- po::options_description desc( "clBLAS client command line options" );
- desc.add_options()
- ( "help,h", "produces this help message" )
- ( "gpu,g", "Force instantiation of an OpenCL GPU device" )
- ( "cpu,c", "Force instantiation of an OpenCL CPU device" )
- ( "all,a", "Force instantiation of all OpenCL devices" )
- ( "useimages", "Use an image-based kernel" )
- ( "sizem,m", po::value<size_t>( &M )->default_value(128), "number of rows in A and C" )
- ( "sizen,n", po::value<size_t>( &N )->default_value(128), "number of columns in B and C" )
- ( "sizek,k", po::value<size_t>( &K )->default_value(128), "number of columns in A and rows in B" )
- ( "lda", po::value<size_t>( &lda )->default_value(0), "first dimension of A in memory. if set to 0, lda will default to M (when transposeA is \"no transpose\") or K (otherwise)" )
- ( "ldb", po::value<size_t>( &ldb )->default_value(0), "first dimension of B in memory. if set to 0, ldb will default to K (when transposeB is \"no transpose\") or N (otherwise)" )
- ( "ldc", po::value<size_t>( &ldc )->default_value(0), "first dimension of C in memory. if set to 0, ldc will default to M" )
- ( "offA", po::value<size_t>( &offA )->default_value(0), "offset of the matrix A in memory object" )
- ( "offBX", po::value<size_t>( &offBX )->default_value(0), "offset of the matrix B or vector X in memory object" )
- ( "offCY", po::value<size_t>( &offCY )->default_value(0), "offset of the matrix C or vector Y in memory object" )
- ( "alpha", po::value<cl_double>( &alpha )->default_value(1.0f), "specifies the scalar alpha" )
- ( "beta", po::value<cl_double>( &beta )->default_value(1.0f), "specifies the scalar beta" )
- ( "order,o", po::value<int>( &order_option )->default_value(0), "0 = row major, 1 = column major" )
- ( "transposeA", po::value<int>( &transA_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" )
- ( "transposeB", po::value<int>( &transB_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" )
- ( "function,f", po::value<std::string>( &function )->default_value("gemm"), "BLAS function to test. Options: gemm, trsm, trmm, gemv, symv, syrk, syr2k" )
- ( "precision,r", po::value<std::string>( &precision )->default_value("s"), "Options: s,d,c,z" )
- ( "side", po::value<int>( &side_option )->default_value(0), "0 = left, 1 = right. only used with [list of function families]" ) // xtrsm xtrmm
- ( "uplo", po::value<int>( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" ) // xsymv xsyrk xsyr2k xtrsm xtrmm
- ( "diag", po::value<int>( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm
- ( "profile,p", po::value<cl_uint>( &profileCount )->default_value(20), "Time and report the kernel speed (default: 20)" )
- ( "apiCallCount", po::value<cl_uint>(&apiCallCount)->default_value(10), "Time and report the kernel speed on counds of API calls (default: 10)")
- ( "numQueues", po::value<unsigned int>(&numQueuesToUse)->default_value(1), "Number of cl_command_queues to use( default: 1)")
- ( "roundtrip", po::value<std::string>( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)")
- ( "memalloc", po::value<std::string>( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd,rect_mem")
- ;
+ po::options_description desc( "clBLAS client command line options" );
+ desc.add_options()
+ ( "help,h", "produces this help message" )
+ ( "gpu,g", "Force instantiation of an OpenCL GPU device" )
+ ( "cpu,c", "Force instantiation of an OpenCL CPU device" )
+ ( "all,a", "Force instantiation of all OpenCL devices" )
+ ( "useimages", "Use an image-based kernel" )
+ ( "sizem,m", po::value<size_t>( &M )->default_value(128), "number of rows in A and C" )
+ ( "sizen,n", po::value<size_t>( &N )->default_value(128), "number of columns in B and C" )
+ ( "sizek,k", po::value<size_t>( &K )->default_value(128), "number of columns in A and rows in B" )
+ ( "lda", po::value<size_t>( &lda )->default_value(0), "first dimension of A in memory. if set to 0, lda will default to M (when transposeA is \"no transpose\") or K (otherwise)" )
+ ( "ldb", po::value<size_t>( &ldb )->default_value(0), "first dimension of B in memory. if set to 0, ldb will default to K (when transposeB is \"no transpose\") or N (otherwise)" )
+ ( "ldc", po::value<size_t>( &ldc )->default_value(0), "first dimension of C in memory. if set to 0, ldc will default to M" )
+ ( "offA", po::value<size_t>( &offA )->default_value(0), "offset of the matrix A in memory object" )
+ ( "offBX", po::value<size_t>( &offBX )->default_value(0), "offset of the matrix B or vector X in memory object" )
+ ( "offCY", po::value<size_t>( &offCY )->default_value(0), "offset of the matrix C or vector Y in memory object" )
+ ( "alpha", po::value<cl_double>( &alpha )->default_value(1.0f), "specifies the scalar alpha" )
+ ( "beta", po::value<cl_double>( &beta )->default_value(1.0f), "specifies the scalar beta" )
+ ( "order,o", po::value<int>( &order_option )->default_value(1), "0 = row major, 1 = column major" )
+ ( "transposeA", po::value<int>( &transA_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" )
+ ( "transposeB", po::value<int>( &transB_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" )
+ ( "function,f", po::value<std::string>( &function )->default_value("gemm"), "BLAS function to test. Options: gemm, trsm, trmm, gemv, symv, syrk, syr2k" )
+ ( "precision,r", po::value<std::string>( &precision )->default_value("s"), "Options: s,d,c,z" )
+ ( "side", po::value<int>( &side_option )->default_value(0), "0 = left, 1 = right. only used with [list of function families]" ) // xtrsm xtrmm
+ ( "uplo", po::value<int>( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" ) // xsymv xsyrk xsyr2k xtrsm xtrmm
+ ( "diag", po::value<int>( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm
+ ( "profile,p", po::value<cl_uint>( &profileCount )->default_value(20), "Time and report the kernel speed (default: 20)" )
+ ( "apiCallCount", po::value<cl_uint>(&apiCallCount)->default_value(10), "Time and report the kernel speed on counds of API calls (default: 10)")
+ ( "numQueues", po::value<unsigned int>(&numQueuesToUse)->default_value(1), "Number of cl_command_queues to use( default: 1)")
+ ( "roundtrip", po::value<std::string>( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)")
+ ( "memalloc", po::value<std::string>( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd,rect_mem")
+ ( "validate,v", po::value<int>(&validate)->default_value(0), "Validate GPU results with CPU BLAS? 0 = No, 1 = Yes (default: No): currently only available for gemm and trmm")
+ ;
- po::variables_map vm;
- po::store( po::parse_command_line( argc, argv, desc ), vm );
- po::notify( vm );
+ po::variables_map vm;
+ po::store( po::parse_command_line( argc, argv, desc ), vm );
+ po::notify( vm );
- if( vm.count( "help" ) )
- {
- std::cout << desc << std::endl;
- return 0;
- }
+ if( vm.count( "help" ) )
+ {
+ std::cout << desc << std::endl;
+ return 0;
+ }
- if( function != "gemm"
- && function != "trsm"
- && function != "trmm"
- && function != "gemv"
- && function != "symv"
- && function != "syrk"
- && function != "syr2k"
- && function != "trsv"
- && function != "trmv"
- && function != "ger"
- && function != "syr"
- && function != "syr2"
- && function != "geru"
- && function != "gerc"
- && function != "her"
- && function != "her2"
- && function != "hemv"
- && function != "hemm"
- && function != "symm"
- && function != "herk"
- && function != "her2k"
- )
- {
- std::cerr << "Invalid value for --function" << std::endl;
- return -1;
- }
+ if( function != "gemm"
+ && function != "trsm"
+ && function != "trmm"
+ && function != "gemv"
+ && function != "symv"
+ && function != "syrk"
+ && function != "syr2k"
+ && function != "trsv"
+ && function != "trmv"
+ && function != "ger"
+ && function != "syr"
+ && function != "syr2"
+ && function != "geru"
+ && function != "gerc"
+ && function != "her"
+ && function != "her2"
+ && function != "hemv"
+ && function != "hemm"
+ && function != "symm"
+ && function != "herk"
+ && function != "her2k"
+ )
+ {
+ std::cerr << "Invalid value for --function" << std::endl;
+ return -1;
+ }
- if( precision != "s" && precision != "d" && precision != "c" && precision != "z" )
- {
- std::cerr << "Invalid value for --precision" << std::endl;
- return -1;
- }
+ if( precision != "s" && precision != "d" && precision != "c" && precision != "z" )
+ {
+ std::cerr << "Invalid value for --precision" << std::endl;
+ return -1;
+ }
- size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0)
- | ((vm.count( "cpu" ) > 0) ? 2 : 0)
- | ((vm.count( "all" ) > 0) ? 4 : 0);
- if((mutex & (mutex-1)) != 0) {
- std::cerr << "You have selected mutually-exclusive OpenCL device options:" << std::endl;
- if (vm.count ( "gpu" ) > 0) std::cerr << " gpu,g Force instantiation of an OpenCL GPU device" << std::endl;
- if (vm.count ( "cpu" ) > 0) std::cerr << " cpu,c Force instantiation of an OpenCL CPU device" << std::endl;
- if (vm.count ( "all" ) > 0) std::cerr << " all,a Force instantiation of all OpenCL devices" << std::endl;
- return 1;
- }
+ size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0)
+ | ((vm.count( "cpu" ) > 0) ? 2 : 0)
+ | ((vm.count( "all" ) > 0) ? 4 : 0);
+ if((mutex & (mutex-1)) != 0) {
+ std::cerr << "You have selected mutually-exclusive OpenCL device options:" << std::endl;
+ if (vm.count ( "gpu" ) > 0) std::cerr << " gpu,g Force instantiation of an OpenCL GPU device" << std::endl;
+ if (vm.count ( "cpu" ) > 0) std::cerr << " cpu,c Force instantiation of an OpenCL CPU device" << std::endl;
+ if (vm.count ( "all" ) > 0) std::cerr << " all,a Force instantiation of all OpenCL devices" << std::endl;
+ return 1;
+ }
- if( vm.count( "gpu" ) )
- {
- deviceType = CL_DEVICE_TYPE_GPU;
- }
+ if( vm.count( "gpu" ) )
+ {
+ deviceType = CL_DEVICE_TYPE_GPU;
+ }
- if( vm.count( "cpu" ) )
- {
- deviceType = CL_DEVICE_TYPE_CPU;
- }
+ if( vm.count( "cpu" ) )
+ {
+ deviceType = CL_DEVICE_TYPE_CPU;
+ }
- if( vm.count( "all" ) )
- {
- deviceType = CL_DEVICE_TYPE_ALL;
- }
+ if( vm.count( "all" ) )
+ {
+ deviceType = CL_DEVICE_TYPE_ALL;
+ }
- if( profileCount >= 1 )
- {
- commandQueueFlags |= CL_QUEUE_PROFILING_ENABLE;
- }
+ if( profileCount >= 1 )
+ {
+ commandQueueFlags |= CL_QUEUE_PROFILING_ENABLE;
+ }
- bool useimages;
- if( vm.count("useimages") )
- useimages = true;
- else
- useimages = false;
+ bool useimages;
+ if( vm.count("useimages") )
+ useimages = true;
+ else
+ useimages = false;
- StatisticalTimer& timer = StatisticalTimer::getInstance( );
- timer.Reserve( 3, profileCount );
- timer.setNormalize( true );
+ StatisticalTimer& timer = StatisticalTimer::getInstance( );
+ timer.Reserve( 3, profileCount );
+ timer.setNormalize( true );
- clblasFunc *my_function = NULL;
- if (function == "gemm")
- {
- if (precision == "s")
- my_function = new xGemm<cl_float>(timer, deviceType, numQueuesToUse);
- else if (precision == "d")
- my_function = new xGemm<cl_double>(timer, deviceType, numQueuesToUse);
- else if (precision == "c")
- my_function = new xGemm<cl_float2>(timer, deviceType, numQueuesToUse);
- else if (precision == "z")
- my_function = new xGemm<cl_double2>(timer, deviceType, numQueuesToUse);
- else
- {
- std::cerr << "Unknown gemm function" << std::endl;
- return -1;
- }
- }
- else if (function == "trsm")
- {
- if (precision == "s")
- my_function = new xTrsm<cl_float>(timer, deviceType);
- else if (precision == "d")
- my_function = new xTrsm<cl_double>(timer, deviceType);
- else if (precision == "c")
- my_function = new xTrsm<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xTrsm<cl_double2>(timer, deviceType);
- else
- {
- std::cerr << "Unknown trsm function" << std::endl;
- return -1;
- }
- }
- else if (function == "trmm")
- {
- if (precision == "s")
- my_function = new xTrmm<cl_float>(timer, deviceType);
- else if (precision == "d")
- my_function = new xTrmm<cl_double>(timer, deviceType);
- else if (precision == "c")
- my_function = new xTrmm<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xTrmm<cl_double2>(timer, deviceType);
- else
+ clblasFunc *my_function = NULL;
+ if (function == "gemm")
{
- std::cerr << "Unknown trmm function" << std::endl;
- return -1;
- }
- }
- else if (function == "gemv")
- {
- if (precision == "s")
- my_function = new xGemv<cl_float>(timer, deviceType);
- else if (precision == "d")
- my_function = new xGemv<cl_double>(timer, deviceType);
- else if (precision == "c")
- my_function = new xGemv<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xGemv<cl_double2>(timer, deviceType);
- else
+ if (precision == "s")
+ my_function = new xGemm<cl_float>(timer, deviceType, numQueuesToUse);
+ else if (precision == "d")
+ my_function = new xGemm<cl_double>(timer, deviceType, numQueuesToUse);
+ else if (precision == "c")
+ my_function = new xGemm<cl_float2>(timer, deviceType, numQueuesToUse);
+ else if (precision == "z")
+ my_function = new xGemm<cl_double2>(timer, deviceType, numQueuesToUse);
+ else
+ {
+ std::cerr << "Unknown gemm function" << std::endl;
+ return -1;
+ }
+ }
+ else if (function == "trsm")
{
- std::cerr << "Unknown gemv function" << std::endl;
- return -1;
+ if (precision == "s")
+ my_function = new xTrsm<cl_float>(timer, deviceType);
+ else if (precision == "d")
+ my_function = new xTrsm<cl_double>(timer, deviceType);
+ else if (precision == "c")
+ my_function = new xTrsm<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xTrsm<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown trsm function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "symv")
- {
- if (precision == "s")
- my_function = new xSymv<cl_float>(timer, deviceType);
- else if (precision == "d")
- my_function = new xSymv<cl_double>(timer, deviceType);
- else
+ else if (function == "trmm")
{
- std::cerr << "Unknown symv function" << std::endl;
- return -1;
+ if (precision == "s")
+ my_function = new xTrmm<cl_float>(timer, deviceType);
+ else if (precision == "d")
+ my_function = new xTrmm<cl_double>(timer, deviceType);
+ else if (precision == "c")
+ my_function = new xTrmm<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xTrmm<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown trmm function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "syrk")
- {
- if (precision == "s")
- my_function = new xSyrk<cl_float>(timer, deviceType);
- else if (precision == "d")
- my_function = new xSyrk<cl_double>(timer, deviceType);
+ else if (function == "gemv")
+ {
+ if (precision == "s")
+ my_function = new xGemv<cl_float>(timer, deviceType);
+ else if (precision == "d")
+ my_function = new xGemv<cl_double>(timer, deviceType);
else if (precision == "c")
- my_function = new xSyrk<cl_float2>(timer, deviceType);
+ my_function = new xGemv<cl_float2>(timer, deviceType);
else if (precision == "z")
- my_function = new xSyrk<cl_double2>(timer, deviceType);
- else
+ my_function = new xGemv<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown gemv function" << std::endl;
+ return -1;
+ }
+ }
+ else if (function == "symv")
{
- std::cerr << "Unknown syrk function" << std::endl;
- return -1;
- }
- }
- else if (function == "syr2k")
- {
- if (precision == "s")
- my_function = new xSyr2k<cl_float>(timer, deviceType);
- else if (precision == "d")
- my_function = new xSyr2k<cl_double>(timer, deviceType);
- else if (precision == "c")
- my_function = new xSyr2k<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xSyr2k<cl_double2>(timer, deviceType);
- else
+ if (precision == "s")
+ my_function = new xSymv<cl_float>(timer, deviceType);
+ else if (precision == "d")
+ my_function = new xSymv<cl_double>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown symv function" << std::endl;
+ return -1;
+ }
+ }
+ else if (function == "syrk")
{
- std::cerr << "Unknown syr2k function" << std::endl;
- return -1;
- }
- }
- else if (function == "trsv")
- {
- if (precision == "s")
- my_function = new xTrsv<cl_float>(timer, deviceType);
- else if (precision == "d")
- my_function = new xTrsv<cl_double>(timer, deviceType);
- else if (precision == "c")
- my_function = new xTrsv<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xTrsv<cl_double2>(timer, deviceType);
- else
+ if (precision == "s")
+ my_function = new xSyrk<cl_float>(timer, deviceType);
+ else if (precision == "d")
+ my_function = new xSyrk<cl_double>(timer, deviceType);
+ else if (precision == "c")
+ my_function = new xSyrk<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xSyrk<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown syrk function" << std::endl;
+ return -1;
+ }
+ }
+ else if (function == "syr2k")
{
- std::cerr << "Unknown trsv function" << std::endl;
- return -1;
- }
- }
- else if (function == "trmv")
- {
- if (precision == "s")
- my_function = new xTrmv<cl_float>(timer, deviceType);
- else if (precision == "d")
- my_function = new xTrmv<cl_double>(timer, deviceType);
- else if (precision == "c")
- my_function = new xTrmv<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xTrmv<cl_double2>(timer, deviceType);
- else
+ if (precision == "s")
+ my_function = new xSyr2k<cl_float>(timer, deviceType);
+ else if (precision == "d")
+ my_function = new xSyr2k<cl_double>(timer, deviceType);
+ else if (precision == "c")
+ my_function = new xSyr2k<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xSyr2k<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown syr2k function" << std::endl;
+ return -1;
+ }
+ }
+ else if (function == "trsv")
+ {
+ if (precision == "s")
+ my_function = new xTrsv<cl_float>(timer, deviceType);
+ else if (precision == "d")
+ my_function = new xTrsv<cl_double>(timer, deviceType);
+ else if (precision == "c")
+ my_function = new xTrsv<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xTrsv<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown trsv function" << std::endl;
+ return -1;
+ }
+ }
+ else if (function == "trmv")
{
- std::cerr << "Unknown trmv function" << std::endl;
- return -1;
+ if (precision == "s")
+ my_function = new xTrmv<cl_float>(timer, deviceType);
+ else if (precision == "d")
+ my_function = new xTrmv<cl_double>(timer, deviceType);
+ else if (precision == "c")
+ my_function = new xTrmv<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xTrmv<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown trmv function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "ger")
- {
- if (precision == "s")
- my_function = new xGer<cl_float>(timer, deviceType);
- else if (precision == "d")
- my_function = new xGer<cl_double>(timer, deviceType);
- else
+ else if (function == "ger")
{
- std::cerr << "Unknown ger function" << std::endl;
- return -1;
+ if (precision == "s")
+ my_function = new xGer<cl_float>(timer, deviceType);
+ else if (precision == "d")
+ my_function = new xGer<cl_double>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown ger function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "syr")
- {
- if (precision == "s")
- my_function = new xSyr<cl_float>(timer, deviceType);
- else if (precision == "d")
- my_function = new xSyr<cl_double>(timer, deviceType);
- else
+ else if (function == "syr")
{
- std::cerr << "Unknown syr function" << std::endl;
- return -1;
+ if (precision == "s")
+ my_function = new xSyr<cl_float>(timer, deviceType);
+ else if (precision == "d")
+ my_function = new xSyr<cl_double>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown syr function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "syr2")
- {
- if (precision == "s")
- my_function = new xSyr2<cl_float>(timer, deviceType);
- else if (precision == "d")
- my_function = new xSyr2<cl_double>(timer, deviceType);
- else
+ else if (function == "syr2")
{
- std::cerr << "Unknown syr2 function" << std::endl;
- return -1;
+ if (precision == "s")
+ my_function = new xSyr2<cl_float>(timer, deviceType);
+ else if (precision == "d")
+ my_function = new xSyr2<cl_double>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown syr2 function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "geru")
- {
- if (precision == "c")
- my_function = new xGeru<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xGeru<cl_double2>(timer, deviceType);
- else
+ else if (function == "geru")
{
- std::cerr << "Unknown geru function" << std::endl;
- return -1;
+ if (precision == "c")
+ my_function = new xGeru<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xGeru<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown geru function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "gerc")
- {
- if (precision == "c")
- my_function = new xGerc<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xGerc<cl_double2>(timer, deviceType);
- else
+ else if (function == "gerc")
{
- std::cerr << "Unknown gerc function" << std::endl;
- return -1;
+ if (precision == "c")
+ my_function = new xGerc<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xGerc<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown gerc function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "her")
- {
- if (precision == "c")
- my_function = new xHer<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xHer<cl_double2>(timer, deviceType);
- else
+ else if (function == "her")
{
- std::cerr << "Unknown her function" << std::endl;
- return -1;
+ if (precision == "c")
+ my_function = new xHer<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xHer<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown her function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "her2")
- {
- if (precision == "c")
- my_function = new xHer2<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xHer2<cl_double2>(timer, deviceType);
- else
+ else if (function == "her2")
{
- std::cerr << "Unknown her2 function" << std::endl;
- return -1;
+ if (precision == "c")
+ my_function = new xHer2<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xHer2<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown her2 function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "hemv")
- {
- if (precision == "c")
- my_function = new xHemv<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xHemv<cl_double2>(timer, deviceType);
- else
+ else if (function == "hemv")
{
- std::cerr << "Unknown hemv function" << std::endl;
- return -1;
+ if (precision == "c")
+ my_function = new xHemv<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xHemv<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown hemv function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "hemm")
- {
- if (precision == "c")
- my_function = new xHemm<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xHemm<cl_double2>(timer, deviceType);
- else
+ else if (function == "hemm")
{
- std::cerr << "Unknown hemm function" << std::endl;
- return -1;
+ if (precision == "c")
+ my_function = new xHemm<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xHemm<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown hemm function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "herk")
- {
- if (precision == "c")
- my_function = new xHerk<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xHerk<cl_double2>(timer, deviceType);
- else
+ else if (function == "herk")
{
- std::cerr << "Unknown her function" << std::endl;
- return -1;
+ if (precision == "c")
+ my_function = new xHerk<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xHerk<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown her function" << std::endl;
+ return -1;
+ }
}
- }
- else if (function == "her2k")
- {
- if (precision == "c")
- my_function = new xHer2k<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xHer2k<cl_double2>(timer, deviceType);
- else
+ else if (function == "her2k")
{
- std::cerr << "Unknown her2 function" << std::endl;
- return -1;
- }
- }
- else if (function == "symm")
- {
- if (precision == "s")
- my_function = new xSymm<cl_float>(timer, deviceType);
- else if (precision == "d")
- my_function = new xSymm<cl_double>(timer, deviceType);
- else if (precision == "c")
- my_function = new xSymm<cl_float2>(timer, deviceType);
- else if (precision == "z")
- my_function = new xSymm<cl_double2>(timer, deviceType);
- else
+ if (precision == "c")
+ my_function = new xHer2k<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xHer2k<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown her2 function" << std::endl;
+ return -1;
+ }
+ }
+ else if (function == "symm")
{
- std::cerr << "Unknown symm function" << std::endl;
- return -1;
+ if (precision == "s")
+ my_function = new xSymm<cl_float>(timer, deviceType);
+ else if (precision == "d")
+ my_function = new xSymm<cl_double>(timer, deviceType);
+ else if (precision == "c")
+ my_function = new xSymm<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xSymm<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown symm function" << std::endl;
+ return -1;
+ }
}
- }
- try
- {
- my_function->setup_buffer( order_option, side_option, uplo_option,
- diag_option, transA_option, transB_option,
+ try
+ {
+ my_function->setup_buffer( order_option, side_option, uplo_option,
+ diag_option, transA_option, transB_option,
M, N, K, lda, ldb, ldc, offA, offBX, offCY,
alpha, beta );
-
- my_function->initialize_cpu_buffer();
- my_function->initialize_gpu_buffer();
- my_function->setup_apiCallCount(apiCallCount);
- my_function->call_func(); // do a calculation first to get any compilation out of the way
- my_function->reset_gpu_write_buffer(); // reset GPU write buffer
- }
- catch( std::exception& exc )
- {
- std::cerr << exc.what( ) << std::endl;
- return 1;
- }
- if(roundtrip=="roundtrip"||roundtrip=="both")
- {
- timer.Reset();
- for( cl_uint i = 0; i < profileCount; ++i )
- {
- my_function->roundtrip_setup_buffer( order_option, side_option, uplo_option,
- diag_option, transA_option, transB_option,
+ my_function->initialize_cpu_buffer();
+ my_function->initialize_gpu_buffer();
+ my_function->setup_apiCallCount(apiCallCount);
+ my_function->call_func(); // do a calculation first to get any compilation out of the way
+ my_function->reset_gpu_write_buffer(); // reset GPU write buffer
+ }
+ catch( std::exception& exc )
+ {
+ std::cerr << exc.what( ) << std::endl;
+ return 1;
+ }
+ if(roundtrip=="roundtrip"||roundtrip=="both")
+ {
+ timer.Reset();
+ for( cl_uint i = 0; i < profileCount; ++i )
+ {
+ my_function->roundtrip_setup_buffer( order_option, side_option, uplo_option,
+ diag_option, transA_option, transB_option,
M, N, K, lda, ldb, ldc, offA, offBX, offCY,
alpha, beta );
- my_function->initialize_cpu_buffer();
- /*my_function->initialize_gpu_buffer();
- my_function->call_func();
- my_function->read_gpu_buffer();
- my_function->reset_gpu_write_buffer();*/
-
- if(memalloc=="default")
- {
- my_function->roundtrip_func();
- }
- else if (memalloc=="alloc_host_ptr")
- {
- my_function->allochostptr_roundtrip_func();
- }
- else if (memalloc=="use_host_ptr")
- {
- my_function->usehostptr_roundtrip_func();
- }
- else if (memalloc=="copy_host_ptr")
- {
- my_function->copyhostptr_roundtrip_func();
- }
- else if (memalloc=="use_persistent_mem_amd")
- {
- my_function->usepersismem_roundtrip_func();
- }
- else if (memalloc=="rect_mem")
- {
- my_function->roundtrip_func_rect();
- }
- //my_function->reset_gpu_write_buffer();
- my_function->releaseGPUBuffer_deleteCPUBuffer();
- }
+ my_function->initialize_cpu_buffer();
+ /*my_function->initialize_gpu_buffer();
+ my_function->call_func();
+ my_function->read_gpu_buffer();
+ my_function->reset_gpu_write_buffer();*/
+
+ if(memalloc=="default")
+ {
+ my_function->roundtrip_func();
+ }
+ else if (memalloc=="alloc_host_ptr")
+ {
+ my_function->allochostptr_roundtrip_func();
+ }
+ else if (memalloc=="use_host_ptr")
+ {
+ my_function->usehostptr_roundtrip_func();
+ }
+ else if (memalloc=="copy_host_ptr")
+ {
+ my_function->copyhostptr_roundtrip_func();
+ }
+ else if (memalloc=="use_persistent_mem_amd")
+ {
+ my_function->usepersismem_roundtrip_func();
+ }
+ else if (memalloc=="rect_mem")
+ {
+ my_function->roundtrip_func_rect();
+ }
+ //my_function->reset_gpu_write_buffer();
+ my_function->releaseGPUBuffer_deleteCPUBuffer();
+ }
- if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
- {
- //std::cout << timer << std::endl;
- timer.pruneOutliers( 3.0 );
- std::cout << "BLAS (round trip) execution time < ns >: " << my_function->time_in_ns() << std::endl;
- std::cout << "BLAS (round trip) execution Gflops < " <<
- my_function->gflops_formula() << " >: " << my_function->gflops() <<
- std::endl;
- }
- }
- if(roundtrip=="noroundtrip"||roundtrip=="both")
- {
- timer.Reset();
- my_function->setup_buffer( order_option, side_option, uplo_option,
- diag_option, transA_option, transB_option,
+ if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
+ {
+ //std::cout << timer << std::endl;
+ timer.pruneOutliers( 3.0 );
+ std::cout << "BLAS (round trip) execution time < ns >: " << my_function->time_in_ns() << std::endl;
+ std::cout << "BLAS (round trip) execution Gflops < " <<
+ my_function->gflops_formula() << " >: " << my_function->gflops() <<
+ std::endl;
+ }
+ }
+ if(roundtrip=="noroundtrip"||roundtrip=="both")
+ {
+ timer.Reset();
+ my_function->setup_buffer( order_option, side_option, uplo_option,
+ diag_option, transA_option, transB_option,
M, N, K, lda, ldb, ldc, offA, offBX, offCY,
alpha, beta );
- my_function->initialize_cpu_buffer();
- my_function->initialize_gpu_buffer();
- my_function->setup_apiCallCount( apiCallCount );
+ my_function->initialize_cpu_buffer();
+ my_function->initialize_gpu_buffer();
+ my_function->setup_apiCallCount( apiCallCount );
+
+
for (cl_uint i = 0; i < profileCount; ++i)
- {
- my_function->call_func();
- }
- my_function->read_gpu_buffer();
- //my_function->reset_gpu_write_buffer();
- my_function->releaseGPUBuffer_deleteCPUBuffer();
+ {
+ my_function->call_func();
+ }
+ my_function->read_gpu_buffer();
- if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
- {
- //std::cout << timer << std::endl;
- timer.pruneOutliers( 3.0 );
- std::cout << "BLAS kernel execution time < ns >: " << my_function->time_in_ns() / apiCallCount << std::endl;
- std::cout << "BLAS kernel execution Gflops < " <<
- my_function->gflops_formula() << " >: " << my_function->gflops() <<
- std::endl;
- }
- }
- delete my_function;
- return 0;
-}
+ my_function->validate_with_cblas(validate);
+
+ //my_function->reset_gpu_write_buffer();
+ my_function->releaseGPUBuffer_deleteCPUBuffer();
+ if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
+ {
+ //std::cout << timer << std::endl;
+ timer.pruneOutliers( 3.0 );
+ std::cout << "BLAS kernel execution time < ns >: " << my_function->time_in_ns() / apiCallCount << std::endl;
+ std::cout << "BLAS kernel execution Gflops < " <<
+ my_function->gflops_formula() << " >: " << my_function->gflops() <<
+ std::endl;
+ }
+ }
+ delete my_function;
+ return 0;
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits
mailing list