[clblas] 53/75: allow users to easily verify the gemm/trmm GPU results with the netlib cblas through client (#274)

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Tue Jan 24 23:30:39 UTC 2017


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/master
in repository clblas.

commit 00a29c6409c9d269e3d680026faf747d0abf85a3
Author: tingxingdong <tingxingdong at gmail.com>
Date:   Mon Jun 20 12:57:24 2016 -0500

    allow users to easily verify the gemm/trmm GPU results with the netlib cblas through client (#274)
    
    * (1)update readme: netlib is preferred.(2)now you can verify the correctness of gemm&trmm through client
    
    * give more details of how to get CBLAS on windows
    
    * find the netlib library dir & library in Cmake files
    
    * forget to add this file
    
    * disable the validation on windows currently: no easy solution of building/linking netlib CBLAS on windows
---
 README.md                    |   6 +-
 src/CMakeLists.txt           |   8 +
 src/FindNetlib.cmake         |  19 +
 src/client/CMakeLists.txt    |   9 +-
 src/client/clfunc_common.hpp |  98 ++++-
 src/client/clfunc_xgemm.hpp  | 579 +++++++++++++++-----------
 src/client/clfunc_xtrmm.hpp  | 266 ++++++++----
 src/client/client.cpp        | 967 ++++++++++++++++++++++---------------------
 8 files changed, 1130 insertions(+), 822 deletions(-)

diff --git a/README.md b/README.md
index cd734da..8de7d7e 100644
--- a/README.md
+++ b/README.md
@@ -197,8 +197,12 @@ The simple example below shows how to use clBLAS to compute an OpenCL accelerate
 
 ### Test infrastructure
 *  Googletest v1.6
-*  ACML on windows/linux; Accelerate on Mac OSX
 *  Latest Boost
+*  CPU BLAS
+  - Netlib CBLAS (recommended)
+    Ubuntu: install by "apt-get install libblas-dev"
+    Windows: download & install lapack-3.6.0 which comes with CBLAS
+  - or ACML on windows/linux; Accelerate on Mac OSX 
 
 ### Performance infrastructure
 * Python
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 33a91ee..73ba594 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -265,6 +265,14 @@ if( BUILD_TEST )
 	endif( )
 endif( )
 
+if( BUILD_CLIENT )
+    if( NETLIB_FOUND )
+    else( )
+        message( WARNING "Not find Netlib; BUILD_CLIENT needs the Netlib CBLAS library" )
+    endif()
+endif()
+
+
 # This will define OPENCL_FOUND
 find_package( OpenCL ${OPENCL_VERSION} )
 
diff --git a/src/FindNetlib.cmake b/src/FindNetlib.cmake
index a32474e..6a21e61 100644
--- a/src/FindNetlib.cmake
+++ b/src/FindNetlib.cmake
@@ -100,6 +100,25 @@ if( NOT contains_BLAS EQUAL -1 )
 	FIND_PACKAGE_HANDLE_STANDARD_ARGS( NETLIB DEFAULT_MSG Netlib_BLAS_LIBRARY )
 endif( )
 
+
+#look for netlib cblas header
+if( UNIX )
+    find_path(Netlib_INCLUDE_DIRS cblas.h
+	    HINTS
+		    /usr/include
+	)
+else()
+    find_path(Netlib_INCLUDE_DIRS cblas.h
+        HINTS
+            ${Netlib_ROOT}/CBLAS/include/
+    )
+endif()
+
+if( Netlib_INCLUDE_DIRS )
+else()
+    message(WARNING "Cannot find cblas.h")
+endif()
+
 if( NETLIB_FOUND )
 	list( APPEND Netlib_LIBRARIES ${Netlib_BLAS_LIBRARY} )
 else( )
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
index eb66f8a..752b19b 100644
--- a/src/client/CMakeLists.txt
+++ b/src/client/CMakeLists.txt
@@ -1,12 +1,12 @@
 # ########################################################################
 # Copyright 2013 Advanced Micro Devices, Inc.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -48,10 +48,11 @@ include_directories(
     ${clBLAS_SOURCE_DIR}
     ${clBLAS_SOURCE_DIR}/include
     ${clBLAS_SOURCE_DIR}/tests/include
+    ${Netlib_INCLUDE_DIRS}
     .)
 
 add_executable(client ${CLIENT_SRC} ${CLIENT_HEADER})
-target_link_libraries(client ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS)
+target_link_libraries(client  ${Netlib_LIBRARIES} ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS)
 set_target_properties( client PROPERTIES
   RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging"
   OUTPUT_NAME clBLAS-client )
diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp
index fc2057b..0f22ef0 100644
--- a/src/client/clfunc_common.hpp
+++ b/src/client/clfunc_common.hpp
@@ -27,6 +27,11 @@
 #include "test-limits.h"
 #include "dis_warning.h"
 
+#if defined ( _WIN32 ) || defined ( _WIN64 )
+#else
+#include "cblas.h"
+#endif
+
 #include "clBLAS.h"
 #if defined(__APPLE__) || defined(__MACOSX)
 #include <OpenCL/cl_ext.h>
@@ -77,6 +82,57 @@ randomScale()
     return t;
 }
 
+#if defined ( _WIN32 ) || defined ( _WIN64 )
+#else
+
+CBLAS_ORDER
+clblasToCblas_order(clblasOrder value)
+{
+    switch (value) {
+        case clblasRowMajor:           return CblasRowMajor;
+        case clblasColumnMajor:        return CblasColMajor;
+    }
+}
+
+CBLAS_TRANSPOSE
+clblasToCblas_operation(clblasTranspose value)
+{
+    switch (value) {
+        case clblasNoTrans:      return CblasNoTrans;
+        case clblasTrans:        return CblasTrans;
+        case clblasConjTrans:    return CblasConjTrans;
+    }
+}
+
+CBLAS_UPLO
+clblasToCblas_fill(clblasUplo value)
+{
+    switch (value) {
+        case clblasUpper:           return CblasUpper;
+        case clblasLower:           return CblasLower;
+    }
+}
+
+CBLAS_SIDE
+clblasToCblas_side(clblasSide value)
+{
+    switch (value) {
+        case clblasLeft:           return CblasLeft;
+        case clblasRight:          return CblasRight;
+    }
+}
+
+CBLAS_DIAG
+clblasToCblas_diag(clblasDiag value)
+{
+    switch (value) {
+        case clblasNonUnit:           return CblasNonUnit;
+        case clblasUnit:              return CblasUnit;
+    }
+}
+
+#endif
+
 std::string
 prettyPrintClStatus( const cl_int& status )
 {
@@ -269,7 +325,7 @@ public:
     virtual ~clblasFunc()
     {
         clblasTeardown();
-        
+
         for (unsigned int i = 0; i < numQueues; i++) {
           OPENCL_V_THROW( clReleaseCommandQueue(queues_[i]), "releasing command queue" );
         }
@@ -278,21 +334,21 @@ public:
 
     void wait_and_check()
     {
-		cl_int err;
+        cl_int err;
         cl_int wait_status = clWaitForEvents(1, &event_);
 
         if( wait_status != CL_SUCCESS )
         {
-    	    if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST )
-    	    {
-    	    	clGetEventInfo( event_, CL_EVENT_COMMAND_EXECUTION_STATUS,
+            if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST )
+            {
+                clGetEventInfo( event_, CL_EVENT_COMMAND_EXECUTION_STATUS,
                                 sizeof(cl_int), &err, NULL );
-    	    	std::cout << "blas function execution status error: " << err << std::endl;
+                std::cout << "blas function execution status error: " << err << std::endl;
                 exit(1);
-    	    }
+            }
             else
             {
-    	    	std::cout << "blas function wait status error: " << wait_status << std::endl;
+                std::cout << "blas function wait status error: " << wait_status << std::endl;
                 exit(1);
             }
         }
@@ -300,14 +356,16 @@ public:
 
     double time_in_ns()
     {
-	    StatisticalTimer& timer = StatisticalTimer::getInstance( );
+        StatisticalTimer& timer = StatisticalTimer::getInstance( );
         return timer.getAverageTime( timer_id ) * 1e9;
     }
 
+    virtual void validate_with_cblas(int v) {}
+
     virtual void call_func() = 0;
     virtual double gflops() = 0;
     virtual std::string gflops_formula() = 0;
-	virtual void setup_apiCallCount(cl_uint apiCallCount){}
+    virtual void setup_apiCallCount(cl_uint apiCallCount){}
     virtual void setup_buffer(int order_option, int side_option,
                               int uplo_option, int diag_option, int
                               transA_option, int transB_option,
@@ -317,20 +375,20 @@ public:
     virtual void initialize_cpu_buffer() = 0;
     virtual void initialize_gpu_buffer() = 0;
     virtual void reset_gpu_write_buffer() = 0;
-	virtual void read_gpu_buffer() = 0;
-	virtual void roundtrip_func() = 0;
-	virtual void roundtrip_func_rect() {}
-	virtual void allochostptr_roundtrip_func() {}
-	virtual void usehostptr_roundtrip_func() {}
-	virtual void copyhostptr_roundtrip_func() {}
-	virtual void usepersismem_roundtrip_func() {}
-	virtual void roundtrip_setup_buffer(int order_option, int side_option,
+    virtual void read_gpu_buffer() = 0;
+    virtual void roundtrip_func() = 0;
+    virtual void roundtrip_func_rect() {}
+    virtual void allochostptr_roundtrip_func() {}
+    virtual void usehostptr_roundtrip_func() {}
+    virtual void copyhostptr_roundtrip_func() {}
+    virtual void usepersismem_roundtrip_func() {}
+    virtual void roundtrip_setup_buffer(int order_option, int side_option,
                               int uplo_option, int diag_option, int
                               transA_option, int transB_option,
                               size_t M, size_t N, size_t K, size_t lda,
                               size_t ldb, size_t ldc, size_t offA, size_t offBX,
                               size_t offCY, double alpha, double beta) = 0;
-	virtual void releaseGPUBuffer_deleteCPUBuffer()=0;
+    virtual void releaseGPUBuffer_deleteCPUBuffer()=0;
     StatisticalTimer& timer;
     StatisticalTimer::sTimerID timer_id;
 
@@ -347,7 +405,7 @@ protected:
     clblasOrder order_;
     cl_event event_;
     size_t maxMemAllocSize;
+    int validate_;
 }; // class clblasFunc
 
 #endif // ifndef CLBLAS_BENCHMARK_COMMON_HXX__
-
diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp
index 8efaf63..57c283d 100644
--- a/src/client/clfunc_xgemm.hpp
+++ b/src/client/clfunc_xgemm.hpp
@@ -43,14 +43,16 @@ struct xGemmBuffer
     T* a_;
     T* b_;
     T* c_;
+    T* c_copy;
     cl_mem buf_a_;
     cl_mem buf_b_;
     cl_mem buf_c_;
     T alpha_;
     T beta_;
-	cl_uint apiCallCount;
+    cl_uint apiCallCount;
 }; // struct buffer
 
+
 template <typename T>
 class xGemm : public clblasFunc
 {
@@ -68,20 +70,37 @@ public:
 
     void call_func()
     {
-		timer.Start(timer_id);
-		xGemm_Function(true, buffer_.apiCallCount);
-		timer.Stop(timer_id);
+        timer.Start(timer_id);
+        xGemm_Function(true, buffer_.apiCallCount);
+        timer.Stop(timer_id);
+    }
+
+
+    void validate_with_cblas(int validate)
+    {
+        #if defined ( _WIN32 ) || defined ( _WIN64 )
+        #else
+        if(validate)
+        {
+    	    initialize_cpu_buffer();
+    	    initialize_gpu_buffer();
+            xGemm_Function(true, 1);
+            read_gpu_buffer();
+            validation();
+        }
+        #endif
     }
 
+
     double gflops()
     {
-		return (2.0*buffer_.m_*buffer_.n_*buffer_.k_) / (time_in_ns() / buffer_.apiCallCount);
+        return (2.0*buffer_.m_*buffer_.n_*buffer_.k_) / (time_in_ns() / buffer_.apiCallCount);
     }
 
-	void setup_apiCallCount(cl_uint apiCallCount)
-	{
-		buffer_.apiCallCount = apiCallCount;
-	}
+    void setup_apiCallCount(cl_uint apiCallCount)
+    {
+        buffer_.apiCallCount = apiCallCount;
+    }
     std::string gflops_formula()
     {
         return "2.0*M*N*K/time";
@@ -322,6 +341,7 @@ public:
         buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
         buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
         buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_ ];
+        buffer_.c_copy = new T[buffer_.ldc_*buffer_.c_num_vectors_ ];
 
         cl_int err;
         buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
@@ -366,7 +386,7 @@ public:
         {
             for (size_t j = 0; j < buffer_.ldc_; ++j)
             {
-                buffer_.c_[i*buffer_.ldc_+j] = random<T>(UPPER_BOUND<T>()) /
+                buffer_.c_copy[i*buffer_.ldc_+j] = buffer_.c_[i*buffer_.ldc_+j] = random<T>(UPPER_BOUND<T>()) /
                                                randomScale<T>();
             }
         }
@@ -375,7 +395,7 @@ public:
     void initialize_gpu_buffer()
     {
 
-		cl_int err;
+        cl_int err;
 
         err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
                                    buffer_.offA_ * sizeof(T),
@@ -408,19 +428,19 @@ public:
                                    buffer_.c_, 0, NULL, NULL);
     }
 
-	void read_gpu_buffer()
-	{
-		cl_int err;
-		err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+    void read_gpu_buffer()
+    {
+        cl_int err;
+        err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
+                                  buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
                                        sizeof(T),
-								  buffer_.c_, 0, NULL, NULL);
-	}
+                                  buffer_.c_, 0, NULL, NULL);
+    }
 
-	void roundtrip_func()
-	{
-	timer.Start(timer_id);
-		cl_int err;
+    void roundtrip_func()
+    {
+    timer.Start(timer_id);
+        cl_int err;
         buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                        (buffer_.lda_*buffer_.a_num_vectors_ +
                                            buffer_.offA_) * sizeof(T),
@@ -452,42 +472,42 @@ public:
                                    buffer_.ldc_ * buffer_.c_num_vectors_ *
                                    sizeof(T),
                                    buffer_.c_, 0, NULL, NULL);
-		xGemm_Function(false);
-		err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+        xGemm_Function(false);
+        err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
+                                  buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
                                        sizeof(T),
-								  buffer_.c_, 0, NULL, &event_);
-		clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
-	}
-	void roundtrip_func_rect()
-	{
-	timer.Start(timer_id);
-		cl_int err;
-		//rect
-		size_t a_buffer_origin[3] = {0,0,0}; 
-		size_t a_host_origin[3] = {0,0,0};
-		size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1};
-		size_t a_buffer_row_pitch=0*sizeof(T);//lda
-		size_t a_buffer_slice_pitch=0;
-		size_t a_host_row_pitch=buffer_.lda_*sizeof(T);
-		size_t a_host_slice_pitch=0;
-
-		size_t b_buffer_origin[3] = {0,0,0}; 
-		size_t b_host_origin[3] = {0,0,0};
-		size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1};
-		size_t b_buffer_row_pitch=0*sizeof(T);//ldb
-		size_t b_buffer_slice_pitch=0;
-		size_t b_host_row_pitch=buffer_.ldb_*sizeof(T);
-		size_t b_host_slice_pitch=0;
-
-		size_t c_buffer_origin[3] = {0,0,0}; 
-		size_t c_host_origin[3] = {0,0,0};
-		size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1};
-		size_t c_buffer_row_pitch=0*sizeof(T);//ldc
-		size_t c_buffer_slice_pitch=0;
-		size_t c_host_row_pitch=buffer_.ldc_*sizeof(T);
-		size_t c_host_slice_pitch=0;
+                                  buffer_.c_, 0, NULL, &event_);
+        clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+    }
+    void roundtrip_func_rect()
+    {
+    timer.Start(timer_id);
+        cl_int err;
+        //rect
+        size_t a_buffer_origin[3] = {0,0,0};
+        size_t a_host_origin[3] = {0,0,0};
+        size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1};
+        size_t a_buffer_row_pitch=0*sizeof(T);//lda
+        size_t a_buffer_slice_pitch=0;
+        size_t a_host_row_pitch=buffer_.lda_*sizeof(T);
+        size_t a_host_slice_pitch=0;
+
+        size_t b_buffer_origin[3] = {0,0,0};
+        size_t b_host_origin[3] = {0,0,0};
+        size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1};
+        size_t b_buffer_row_pitch=0*sizeof(T);//ldb
+        size_t b_buffer_slice_pitch=0;
+        size_t b_host_row_pitch=buffer_.ldb_*sizeof(T);
+        size_t b_host_slice_pitch=0;
+
+        size_t c_buffer_origin[3] = {0,0,0};
+        size_t c_host_origin[3] = {0,0,0};
+        size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1};
+        size_t c_buffer_row_pitch=0*sizeof(T);//ldc
+        size_t c_buffer_slice_pitch=0;
+        size_t c_host_row_pitch=buffer_.ldc_*sizeof(T);
+        size_t c_host_slice_pitch=0;
 
         buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                        (buffer_.k_*buffer_.m_ +
@@ -504,12 +524,12 @@ public:
                                             buffer_.offC_) * sizeof(T),
                                         NULL, &err);
         /*
-		err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
+        err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
                                    buffer_.offA_ * sizeof(T),
                                    buffer_.lda_ * buffer_.a_num_vectors_ *
                                        sizeof(T),
                                    buffer_.a_, 0, NULL, NULL);
-		
+
         err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
                                    buffer_.offB_ * sizeof(T),
                                    buffer_.ldb_ * buffer_.b_num_vectors_ *
@@ -522,47 +542,47 @@ public:
                                    sizeof(T),
                                    buffer_.c_, 0, NULL, NULL);*/
         err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_a_, CL_TRUE, a_buffer_origin, a_host_origin, a_region, a_buffer_row_pitch,
-										a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL);
+                                        a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL);
         err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_b_, CL_TRUE, b_buffer_origin, b_host_origin, b_region, b_buffer_row_pitch,
-										b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL);
+                                        b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL);
         err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
-										c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL);
-
-		if(buffer_.trans_a_==clblasNoTrans)
-		{
-			buffer_.lda_=buffer_.m_;
-		}
-		else
-		{
-			buffer_.lda_=buffer_.k_;
-		}
-		if(buffer_.trans_b_==clblasNoTrans)
-		{
-			buffer_.ldb_=buffer_.k_;
-		}
-		else
-		{
-			buffer_.ldb_=buffer_.n_;
-		}
-		buffer_.ldc_=buffer_.m_;
-		xGemm_Function(false);
-		/*
-		err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                        c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL);
+
+        if(buffer_.trans_a_==clblasNoTrans)
+        {
+            buffer_.lda_=buffer_.m_;
+        }
+        else
+        {
+            buffer_.lda_=buffer_.k_;
+        }
+        if(buffer_.trans_b_==clblasNoTrans)
+        {
+            buffer_.ldb_=buffer_.k_;
+        }
+        else
+        {
+            buffer_.ldb_=buffer_.n_;
+        }
+        buffer_.ldc_=buffer_.m_;
+        xGemm_Function(false);
+        /*
+        err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
+                                  buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
                                        sizeof(T),
-								  buffer_.c_, 0, NULL, &event_);
-		*/
-		err = ::clEnqueueReadBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
-										c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_);
-		clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
-	}	
-	void allochostptr_roundtrip_func()
-	{
-	timer.Start(timer_id);
-
-		cl_int err;
-		// Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy
+                                  buffer_.c_, 0, NULL, &event_);
+        */
+        err = ::clEnqueueReadBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
+                                        c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_);
+        clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+    }
+    void allochostptr_roundtrip_func()
+    {
+    timer.Start(timer_id);
+
+        cl_int err;
+        // Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy
         buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                        (buffer_.lda_*buffer_.a_num_vectors_ +
                                            buffer_.offA_) * sizeof(T),
@@ -578,45 +598,45 @@ public:
                                             buffer_.offC_) * sizeof(T),
                                         NULL, &err);
 
-		// map the buffers to pointers at host device
-		T *map_a,*map_b,*map_c;
-		map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, 
-										  (buffer_.lda_*buffer_.a_num_vectors_ +
+        // map the buffers to pointers at host device
+        T *map_a,*map_b,*map_c;
+        map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+                                          (buffer_.lda_*buffer_.a_num_vectors_ +
                                            buffer_.offA_) * sizeof(T),
-										   0, NULL, NULL, &err);
-		map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, 
-										  (buffer_.ldb_*buffer_.b_num_vectors_ +
+                                           0, NULL, NULL, &err);
+        map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+                                          (buffer_.ldb_*buffer_.b_num_vectors_ +
                                            buffer_.offB_) * sizeof(T),
-										   0, NULL, NULL, &err);
-	    map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, 
-										  (buffer_.lda_*buffer_.c_num_vectors_ +
+                                           0, NULL, NULL, &err);
+        map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0,
+                                          (buffer_.lda_*buffer_.c_num_vectors_ +
                                            buffer_.offC_) * sizeof(T),
-										   0, NULL, NULL, &err);
-		// memcpy the input A, B, C to the host pointers
-		memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
-		memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
-		memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
-		// unmap the buffers
-		clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL);
-		clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL);
-		clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL);
-		// calling clBLAS
-		xGemm_Function(false);
-		// map the C buffer again to read output
-	    map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, 
-										  (buffer_.lda_*buffer_.c_num_vectors_ +
+                                           0, NULL, NULL, &err);
+        // memcpy the input A, B, C to the host pointers
+        memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+        memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+        memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+        // unmap the buffers
+        clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL);
+        clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL);
+        clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL);
+        // calling clBLAS
+        xGemm_Function(false);
+        // map the C buffer again to read output
+        map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0,
+                                          (buffer_.lda_*buffer_.c_num_vectors_ +
                                            buffer_.offC_) * sizeof(T),
-										   0, NULL, NULL, &err);
-		memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
-		clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_);
-		clWaitForEvents(1, &event_);
-
-	timer.Stop(timer_id);
-	}
-	void usehostptr_roundtrip_func()
-	{
-	timer.Start(timer_id);
-		cl_int err;
+                                           0, NULL, NULL, &err);
+        memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+        clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_);
+        clWaitForEvents(1, &event_);
+
+        timer.Stop(timer_id);
+    }
+    void usehostptr_roundtrip_func()
+    {
+        timer.Start(timer_id);
+        cl_int err;
         buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                        (buffer_.lda_*buffer_.a_num_vectors_ +
                                            buffer_.offA_) * sizeof(T),
@@ -631,13 +651,13 @@ public:
                                         (buffer_.ldc_ * buffer_.c_num_vectors_ +
                                             buffer_.offC_) * sizeof(T),
                                         buffer_.c_, &err);
-		xGemm_Function(true);
-	timer.Stop(timer_id);
-	}
-	void copyhostptr_roundtrip_func()
-	{
-	timer.Start(timer_id);
-		cl_int err;
+        xGemm_Function(true);
+    timer.Stop(timer_id);
+    }
+    void copyhostptr_roundtrip_func()
+    {
+    timer.Start(timer_id);
+        cl_int err;
         buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                        (buffer_.lda_*buffer_.a_num_vectors_ +
                                            buffer_.offA_) * sizeof(T),
@@ -652,20 +672,20 @@ public:
                                         (buffer_.ldc_ * buffer_.c_num_vectors_ +
                                             buffer_.offC_) * sizeof(T),
                                         buffer_.c_, &err);
-		xGemm_Function(false);
-		err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+        xGemm_Function(false);
+        err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE,
+                                  buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
                                        sizeof(T),
-								  buffer_.c_, 0, NULL, &event_);
-		clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
-	}
-	void usepersismem_roundtrip_func()
-	{
+                                  buffer_.c_, 0, NULL, &event_);
+        clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+    }
+    void usepersismem_roundtrip_func()
+    {
 #if defined(CL_MEM_USE_PERSISTENT_MEM_AMD)
-	timer.Start(timer_id);
+    timer.Start(timer_id);
 
-		cl_int err;
+        cl_int err;
 
         buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
                                        (buffer_.lda_*buffer_.a_num_vectors_ +
@@ -682,46 +702,46 @@ public:
                                             buffer_.offC_) * sizeof(T),
                                         NULL, &err);
 
-		// map the buffers to pointers at host devices
-		T *map_a,*map_b,*map_c;
-		map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, 
-										  (buffer_.lda_*buffer_.a_num_vectors_ +
+        // map the buffers to pointers at host devices
+        T *map_a,*map_b,*map_c;
+        map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+                                          (buffer_.lda_*buffer_.a_num_vectors_ +
                                            buffer_.offA_) * sizeof(T),
-										   0, NULL, NULL, &err);
-		map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, 
-										  (buffer_.ldb_*buffer_.b_num_vectors_ +
+                                           0, NULL, NULL, &err);
+        map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+                                          (buffer_.ldb_*buffer_.b_num_vectors_ +
                                            buffer_.offB_) * sizeof(T),
-										   0, NULL, NULL, &err);
-	    map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, 
-										  (buffer_.lda_*buffer_.c_num_vectors_ +
+                                           0, NULL, NULL, &err);
+        map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0,
+                                          (buffer_.lda_*buffer_.c_num_vectors_ +
                                            buffer_.offC_) * sizeof(T),
-										   0, NULL, NULL, &err);
-		// memcpy the input A, B, C to the host pointers
-		memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
-		memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
-		memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
-		// unmap the buffers
-		clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL);
-		clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL);
-		clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL);
-		// calling clBLAS
-		xGemm_Function(false);
-		// map the C buffer again to read output
-	    map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, 
-										  (buffer_.lda_*buffer_.c_num_vectors_ +
+                                           0, NULL, NULL, &err);
+        // memcpy the input A, B, C to the host pointers
+        memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+        memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+        memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+        // unmap the buffers
+        clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL);
+        clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL);
+        clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL);
+        // calling clBLAS
+        xGemm_Function(false);
+        // map the C buffer again to read output
+        map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0,
+                                          (buffer_.lda_*buffer_.c_num_vectors_ +
                                            buffer_.offC_) * sizeof(T),
-										   0, NULL, NULL, &err);
-		memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
-		clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_);
-		clWaitForEvents(1, &event_);
+                                           0, NULL, NULL, &err);
+        memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+        clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_);
+        clWaitForEvents(1, &event_);
 
-	timer.Stop(timer_id);
+    timer.Stop(timer_id);
 #else
-		std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<<std::endl;
+        std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<<std::endl;
 #endif
 
-	}
-	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+    }
+    void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
@@ -958,20 +978,21 @@ public:
         buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_ ];
 
     }
-	void releaseGPUBuffer_deleteCPUBuffer()
-	{
-		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
-		//need to do this before we eventually hit the destructor
-		delete buffer_.a_;
+    void releaseGPUBuffer_deleteCPUBuffer()
+    {
+        //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+        //need to do this before we eventually hit the destructor
+        delete buffer_.a_;
         delete buffer_.b_;
         delete buffer_.c_;
+        delete buffer_.c_copy;
         OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
                         "releasing buffer A");
         OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
                         "releasing buffer B");
         OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
                         "releasing buffer C");
-	}
+    }
 
 protected:
     void initialize_scalars(double alpha, double beta)
@@ -982,33 +1003,37 @@ protected:
 
 private:
     xGemmBuffer<T> buffer_;
-	void xGemm_Function(bool flush, cl_uint apiCallCount = 1);
-  unsigned int numQueuesToUse;
-  cl_event events_[numQueues];
+    void xGemm_Function(bool flush, cl_uint apiCallCount = 1);
+    unsigned int numQueuesToUse;
+    cl_event events_[numQueues];
 
+#if defined ( _WIN32 ) || defined ( _WIN64 )
+#else
+    void validation();
+#endif
 }; // class xgemm
 
 template<>
-void 
+void
 xGemm<cl_float>::
 xGemm_Function(bool flush, cl_uint apiCallCount )
 {
   for (unsigned int i = 0; i < numQueues; i++) {
     events_[i] = NULL;
   }
-	for (unsigned int i = 0; i < apiCallCount; i++)
-	{
-		clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
-			buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
-			buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-			buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-			buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
-			buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_);
-	}
-	//flush==true if only the kernel time (library call) is timed
-	//flush==false if memory time is also timed
-	if (flush==true)
-	{
+    for (unsigned int i = 0; i < apiCallCount; i++)
+    {
+        clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+            buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+            buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+            buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+            buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+            buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_);
+    }
+    //flush==true if only the kernel time (library call) is timed
+    //flush==false if memory time is also timed
+    if (flush==true)
+    {
     // check if any valid events returned
     cl_uint numValidEvents = 0;
     for (unsigned int i = 0; i < numQueuesToUse; i++) {
@@ -1025,16 +1050,16 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
         //printf("events[%u/%u] is NULL\n", i, numQueuesToUse );
       }
     }
-    
+
     for (unsigned int i = 0; i < numQueuesToUse; i++) {
       clFlush(queues_[i]);
     }
-		clWaitForEvents(numValidEvents, events_);
-	}
+        clWaitForEvents(numValidEvents, events_);
+    }
 }
 
 template<>
-void 
+void
 xGemm<cl_double>::
 xGemm_Function(bool flush, cl_uint apiCallCount )
 {
@@ -1042,18 +1067,18 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
     events_[i] = NULL;
   }
   for (unsigned int i = 0; i < apiCallCount; i++)
-	{
-	  clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+    {
+      clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                      buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
                      buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_);
   }
-	//flush==true if only the kernel time (library call) is timed
-	//flush==false if memory time is also timed
-	if (flush==true)
-	{
+    //flush==true if only the kernel time (library call) is timed
+    //flush==false if memory time is also timed
+    if (flush==true)
+    {
     // check if any valid events returned
     cl_uint numValidEvents = 0;
     for (unsigned int i = 0; i < numQueuesToUse; i++) {
@@ -1070,16 +1095,16 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
         //printf("events[%u/%u] is NULL\n", i, numQueuesToUse );
       }
     }
-    
+
     for (unsigned int i = 0; i < numQueuesToUse; i++) {
       clFlush(queues_[i]);
     }
-		clWaitForEvents(numValidEvents, events_);
-	}
+        clWaitForEvents(numValidEvents, events_);
+    }
 }
 
 template<>
-void 
+void
 xGemm<cl_float2>::
 xGemm_Function(bool flush, cl_uint apiCallCount )
 {
@@ -1087,18 +1112,18 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
     events_[i] = NULL;
   }
   for (unsigned int i = 0; i < apiCallCount; i++)
-	{
-	  clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+    {
+      clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                      buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
                      buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_);
   }
-	//flush==true if only the kernel time (library call) is timed
-	//flush==false if memory time is also timed
-	if (flush==true)
-	{
+    //flush==true if only the kernel time (library call) is timed
+    //flush==false if memory time is also timed
+    if (flush==true)
+    {
     // check if any valid events returned
     cl_uint numValidEvents = 0;
     for (unsigned int i = 0; i < numQueuesToUse; i++) {
@@ -1115,16 +1140,16 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
         //printf("events[%u/%u] is NULL\n", i, numQueuesToUse );
       }
     }
-    
+
     for (unsigned int i = 0; i < numQueuesToUse; i++) {
       clFlush(queues_[i]);
     }
-		clWaitForEvents(numValidEvents, events_);
-	}
+        clWaitForEvents(numValidEvents, events_);
+    }
 }
 
 template<>
-void 
+void
 xGemm<cl_double2>::
 xGemm_Function(bool flush, cl_uint apiCallCount )
 {
@@ -1132,18 +1157,18 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
     events_[i] = NULL;
   }
   for (unsigned int i = 0; i < apiCallCount; i++)
-	{
-	  clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+    {
+      clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                      buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
                      buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_);
   }
-	//flush==true if only the kernel time (library call) is timed
-	//flush==false if memory time is also timed
-	if (flush==true)
-	{
+    //flush==true if only the kernel time (library call) is timed
+    //flush==false if memory time is also timed
+    if (flush==true)
+    {
     // check if any valid events returned
     cl_uint numValidEvents = 0;
     for (unsigned int i = 0; i < numQueuesToUse; i++) {
@@ -1164,8 +1189,8 @@ xGemm_Function(bool flush, cl_uint apiCallCount )
       clFlush(queues_[i]);
     }
 
-		clWaitForEvents(numValidEvents, events_);
-	}
+        clWaitForEvents(numValidEvents, events_);
+    }
 }
 
 template<>
@@ -1200,4 +1225,84 @@ gflops_formula()
     return "8.0*M*N*K/time";
 }
 
+#if defined ( _WIN32 ) || defined (_WIN64 )
+
+#else
+
+template<>
+void
+xGemm<cl_float>::
+validation()
+{
+    cblas_sgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_),
+        buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+        buffer_.a_ + buffer_.offA_, buffer_.lda_,
+        buffer_.b_ + buffer_.offB_, buffer_.ldb_,
+        buffer_.beta_,
+        buffer_.c_copy + buffer_.offC_, buffer_.ldc_);
+
+    cblas_saxpy(buffer_.lda_ * buffer_.n_, -1.0, buffer_.c_, 1, buffer_.c_copy, 1);
+    float norm_error = cblas_snrm2(buffer_.lda_ * buffer_.n_, buffer_.c_copy, 1)/
+		cblas_snrm2(buffer_.lda_ * buffer_.n_, buffer_.c_, 1);
+    printf("Error of clblas_sgemm against cblas_sgemm = %f \n", norm_error);
+}
+
+template<>
+void
+xGemm<cl_double>::
+validation()
+{
+    cblas_dgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_),
+        buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+        buffer_.a_ + buffer_.offA_, buffer_.lda_,
+        buffer_.b_ + buffer_.offB_, buffer_.ldb_,
+        buffer_.beta_,
+        buffer_.c_copy + buffer_.offC_, buffer_.ldc_);
+
+    cblas_daxpy(buffer_.lda_ * buffer_.n_, -1.0, buffer_.c_, 1, buffer_.c_copy, 1);
+    double norm_error = cblas_dnrm2(buffer_.lda_ * buffer_.n_, buffer_.c_copy, 1)/
+		cblas_dnrm2(buffer_.lda_ * buffer_.n_, buffer_.c_, 1);
+    printf("Error of clblas_dgemm against cblas_dgemm = %f \n", norm_error);
+}
+
+template<>
+void
+xGemm<cl_float2>::
+validation()
+{
+    cblas_cgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_),
+        buffer_.m_, buffer_.n_, buffer_.k_, &(buffer_.alpha_),
+        buffer_.a_ + buffer_.offA_, buffer_.lda_,
+        buffer_.b_ + buffer_.offB_, buffer_.ldb_,
+        &(buffer_.beta_),
+        buffer_.c_copy + buffer_.offC_, buffer_.ldc_);
+
+    cl_float2 neg_one = makeScalar<cl_float2>(-1.0);
+    cblas_caxpy(buffer_.lda_ * buffer_.n_, &neg_one, buffer_.c_, 1, buffer_.c_copy, 1);
+    float norm_error = cblas_scnrm2(buffer_.lda_ * buffer_.n_, buffer_.c_copy, 1)/
+		cblas_scnrm2(buffer_.lda_ * buffer_.n_, buffer_.c_, 1);
+    printf("Error of clblas_cgemm against cblas_cgemm = %f \n", norm_error);
+}
+
+template<>
+void
+xGemm<cl_double2>::
+validation()
+{
+    cblas_zgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_),
+        buffer_.m_, buffer_.n_, buffer_.k_, &(buffer_.alpha_),
+        buffer_.a_ + buffer_.offA_, buffer_.lda_,
+        buffer_.b_ + buffer_.offB_, buffer_.ldb_,
+        &(buffer_.beta_),
+        buffer_.c_copy + buffer_.offC_, buffer_.ldc_);
+
+    cl_double2 neg_one = makeScalar<cl_double2>(-1.0);
+    cblas_zaxpy(buffer_.lda_ * buffer_.n_, &neg_one, buffer_.c_, 1, buffer_.c_copy, 1);
+    double norm_error = cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.c_copy, 1)/
+		cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.c_, 1);
+    printf("Error of clblas_zgemm against cblas_zgemm = %f \n", norm_error);
+}
+
+#endif
+
 #endif // ifndef CLBLAS_BENCHMARK_XGEMM_HXX__
diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp
index 92d883c..0cd1ff4 100644
--- a/src/client/clfunc_xtrmm.hpp
+++ b/src/client/clfunc_xtrmm.hpp
@@ -40,6 +40,7 @@ struct xTrmmBuffer
     clblasDiag diag_;
     T* a_;
     T* b_;
+    T* b_copy;
     cl_mem buf_a_;
     cl_mem buf_b_;
     T alpha_;
@@ -64,6 +65,23 @@ public:
         std::cout << "xtrmm::call_func\n";
     }
 
+
+    void validate_with_cblas(int validate)
+    {
+        #if defined ( _WIN32 ) || defined ( _WIN64 )
+        #else
+        if(validate)
+        {
+            initialize_cpu_buffer();
+            initialize_gpu_buffer();
+            call_func();
+            read_gpu_buffer();
+            validation();
+        }
+        #endif
+    }
+
+
     double gflops()
     {
         if (buffer_.side_ == clblasLeft)
@@ -225,6 +243,7 @@ public:
 
         buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
         buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+        buffer_.b_copy = new T[buffer_.ldb_*buffer_.b_num_vectors_];
 
         cl_int err;
         buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
@@ -246,7 +265,7 @@ public:
         {
             for (size_t j = 0; j < buffer_.ldb_; ++j)
             {
-                buffer_.b_[i*buffer_.ldb_+j] = random<T>(UPPER_BOUND<T>()) /
+                buffer_.b_copy[i*buffer_.ldb_+j] = buffer_.b_[i*buffer_.ldb_+j] = random<T>(UPPER_BOUND<T>()) /
                                                randomScale<T>();
             }
         }
@@ -294,29 +313,29 @@ public:
                                        sizeof(T),
                                    buffer_.b_, 0, NULL, NULL);
     }
-	void read_gpu_buffer()
-	{
-		cl_int err;
-		err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(T),
-								  buffer_.b_, 0, NULL, NULL);
-	}
-	void roundtrip_func()
-	{
-		std::cout << "xTrmm::roundtrip_func\n";
-	}
-	void zerocopy_roundtrip_func()
-	{
-		std::cout << "xTrmm::zerocopy_roundtrip_func\n";
-	}
-	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+    void read_gpu_buffer()
+    {
+    cl_int err;
+    err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
+                      buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                  sizeof(T),
+                  buffer_.b_, 0, NULL, NULL);
+    }
+    void roundtrip_func()
+    {
+        std::cout << "xTrmm::roundtrip_func\n";
+    }
+    void zerocopy_roundtrip_func()
+    {
+        std::cout << "xTrmm::zerocopy_roundtrip_func\n";
+    }
+    void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
-	{
-		DUMMY_ARGS_USAGE_3(transB_option, K, beta);
+    {
+    DUMMY_ARGS_USAGE_3(transB_option, K, beta);
         DUMMY_ARGS_USAGE_2(ldc, offCY);
 
         initialize_scalars(alpha, beta);
@@ -447,18 +466,20 @@ public:
 
         buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
         buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
-	}
-	void releaseGPUBuffer_deleteCPUBuffer()
-	{
-		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
-		//need to do this before we eventually hit the destructor
+   }
+
+   void releaseGPUBuffer_deleteCPUBuffer()
+   {
+        //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+        //need to do this before we eventually hit the destructor
         delete buffer_.a_;
         delete buffer_.b_;
+        delete buffer_.b_copy;
         OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_),
                        "releasing buffer A");
         OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_),
                        "releasing buffer B");
-	}
+    }
 protected:
     void initialize_scalars(double alpha, double beta)
     {
@@ -468,7 +489,10 @@ protected:
 
 private:
     xTrmmBuffer<T> buffer_;
-
+#if defined ( _WIN32 ) || defined ( _WIN64 )
+#else
+    void validation();
+#endif
 }; // class xTrmm
 
 template<>
@@ -494,9 +518,9 @@ void
 xTrmm<cl_float>::
 roundtrip_func()
 {
-	    timer.Start(timer_id);
-	    cl_int err;
-			//set up buffer
+        timer.Start(timer_id);
+        cl_int err;
+            //set up buffer
         buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                         (buffer_.lda_ * buffer_.a_num_vectors_ +
                                             buffer_.offA_) * sizeof(cl_float),
@@ -506,8 +530,8 @@ roundtrip_func()
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_float),
                                         NULL, &err);
-		//initialize gpu buffer
-		err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
+        //initialize gpu buffer
+        err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
                                    buffer_.offA_ * sizeof(cl_float),
                                    buffer_.lda_ * buffer_.a_num_vectors_ *
                                        sizeof(cl_float),
@@ -518,20 +542,20 @@ roundtrip_func()
                                    buffer_.ldb_ *buffer_.b_num_vectors_ *
                                        sizeof(cl_float),
                                    buffer_.b_, 0, NULL, NULL);
-		//call_func
-		    clblasStrmm(order_, buffer_.side_, buffer_.uplo_,
+        //call_func
+            clblasStrmm(order_, buffer_.side_, buffer_.uplo_,
                      buffer_.trans_a_, buffer_.diag_,
                      buffer_.m_, buffer_.n_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      numQueues, queues_, 0, NULL, NULL);
-		//read gpu buffer
-			err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ *
+        //read gpu buffer
+            err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
+                                  buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ *
                                        sizeof(cl_float),
-								  buffer_.b_, 0, NULL, &event_);
-			clWaitForEvents(1, &event_);
-			timer.Stop(timer_id);
+                                  buffer_.b_, 0, NULL, &event_);
+            clWaitForEvents(1, &event_);
+            timer.Stop(timer_id);
 
 }
 
@@ -558,9 +582,9 @@ void
 xTrmm<cl_double>::
 roundtrip_func()
 {
-	    timer.Start(timer_id);
-	    cl_int err;
-			//set up buffer
+        timer.Start(timer_id);
+        cl_int err;
+            //set up buffer
         buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                         (buffer_.lda_ * buffer_.a_num_vectors_ +
                                             buffer_.offA_) * sizeof(cl_double),
@@ -570,8 +594,8 @@ roundtrip_func()
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_double),
                                         NULL, &err);
-		//initialize gpu buffer
-		err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
+        //initialize gpu buffer
+        err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
                                    buffer_.offA_ * sizeof(cl_double),
                                    buffer_.lda_ * buffer_.a_num_vectors_ *
                                        sizeof(cl_double),
@@ -582,20 +606,20 @@ roundtrip_func()
                                    buffer_.ldb_ *buffer_.b_num_vectors_ *
                                        sizeof(cl_double),
                                    buffer_.b_, 0, NULL, NULL);
-		//call_func
-		    clblasDtrmm(order_, buffer_.side_, buffer_.uplo_,
+        //call_func
+        clblasDtrmm(order_, buffer_.side_, buffer_.uplo_,
                      buffer_.trans_a_, buffer_.diag_,
                      buffer_.m_, buffer_.n_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      numQueues, queues_, 0, NULL, NULL);
-		//read gpu buffer
-			err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ *
+        //read gpu buffer
+        err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
+                                  buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ *
                                        sizeof(cl_double),
-								  buffer_.b_, 0, NULL, &event_);
-			clWaitForEvents(1, &event_);
-			timer.Stop(timer_id);
+                                  buffer_.b_, 0, NULL, &event_);
+        clWaitForEvents(1, &event_);
+        timer.Stop(timer_id);
 
 }
 
@@ -622,9 +646,9 @@ void
 xTrmm<cl_float2>::
 roundtrip_func()
 {
-	    timer.Start(timer_id);
-	    cl_int err;
-			//set up buffer
+        timer.Start(timer_id);
+        cl_int err;
+            //set up buffer
         buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                         (buffer_.lda_ * buffer_.a_num_vectors_ +
                                             buffer_.offA_) * sizeof(cl_float2),
@@ -634,8 +658,8 @@ roundtrip_func()
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_float2),
                                         NULL, &err);
-		//initialize gpu buffer
-		err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
+        //initialize gpu buffer
+        err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
                                    buffer_.offA_ * sizeof(cl_float2),
                                    buffer_.lda_ * buffer_.a_num_vectors_ *
                                        sizeof(cl_float2),
@@ -646,20 +670,20 @@ roundtrip_func()
                                    buffer_.ldb_ *buffer_.b_num_vectors_ *
                                        sizeof(cl_float2),
                                    buffer_.b_, 0, NULL, NULL);
-		//call_func
-		    clblasCtrmm(order_, buffer_.side_, buffer_.uplo_,
+        //call_func
+        clblasCtrmm(order_, buffer_.side_, buffer_.uplo_,
                      buffer_.trans_a_, buffer_.diag_,
                      buffer_.m_, buffer_.n_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      numQueues, queues_, 0, NULL, NULL);
-		//read gpu buffer
-			err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ *
+        //read gpu buffer
+        err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
+                                  buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ *
                                        sizeof(cl_float2),
-								  buffer_.b_, 0, NULL, &event_);
-			clWaitForEvents(1, &event_);
-			timer.Stop(timer_id);
+                                  buffer_.b_, 0, NULL, &event_);
+        clWaitForEvents(1, &event_);
+        timer.Stop(timer_id);
 
 }
 
@@ -686,9 +710,9 @@ void
 xTrmm<cl_double2>::
 roundtrip_func()
 {
-	    timer.Start(timer_id);
-	    cl_int err;
-			//set up buffer
+        timer.Start(timer_id);
+        cl_int err;
+            //set up buffer
         buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                         (buffer_.lda_ * buffer_.a_num_vectors_ +
                                             buffer_.offA_) * sizeof(cl_double2),
@@ -698,8 +722,8 @@ roundtrip_func()
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_double2),
                                         NULL, &err);
-		//initialize gpu buffer
-		err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
+        //initialize gpu buffer
+        err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE,
                                    buffer_.offA_ * sizeof(cl_double2),
                                    buffer_.lda_ * buffer_.a_num_vectors_ *
                                        sizeof(cl_double2),
@@ -710,20 +734,20 @@ roundtrip_func()
                                    buffer_.ldb_ *buffer_.b_num_vectors_ *
                                        sizeof(cl_double2),
                                    buffer_.b_, 0, NULL, NULL);
-		//call_func
-		    clblasZtrmm(order_, buffer_.side_, buffer_.uplo_,
+        //call_func
+            clblasZtrmm(order_, buffer_.side_, buffer_.uplo_,
                      buffer_.trans_a_, buffer_.diag_,
                      buffer_.m_, buffer_.n_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      numQueues, queues_, 0, NULL, NULL);
-		//read gpu buffer
-			err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ *
+        //read gpu buffer
+            err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE,
+                                  buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ *
                                        sizeof(cl_double2),
-								  buffer_.b_, 0, NULL, &event_);
-			clWaitForEvents(1, &event_);
-			timer.Stop(timer_id);
+                                  buffer_.b_, 0, NULL, &event_);
+            clWaitForEvents(1, &event_);
+            timer.Stop(timer_id);
 
 }
 
@@ -790,5 +814,89 @@ gflops_formula()
     }
 }
 
+#if defined ( _WIN32 ) || defined ( _WIN64 )
+#else
+
+template<>
+void
+xTrmm<cl_float>::
+validation()
+{
+    cblas_strmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_),
+                clblasToCblas_fill(buffer_.uplo_),
+                clblasToCblas_operation(buffer_.trans_a_),
+		clblasToCblas_diag(buffer_.diag_),
+                buffer_.m_, buffer_.n_, buffer_.alpha_,
+                buffer_.a_ + buffer_.offA_, buffer_.lda_,
+                buffer_.b_copy + buffer_.offB_, buffer_.ldb_);
+
+    cblas_saxpy(buffer_.lda_ * buffer_.n_, -1.0, buffer_.b_, 1, buffer_.b_copy, 1);
+    float norm_error = cblas_snrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/
+                cblas_snrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1);
+    printf("Error of clblas_strmm against cblas_strmm = %f \n", norm_error);
+}
+
+
+template<>
+void
+xTrmm<cl_double>::
+validation()
+{
+    cblas_dtrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_),
+                clblasToCblas_fill(buffer_.uplo_),
+                clblasToCblas_operation(buffer_.trans_a_),
+		clblasToCblas_diag(buffer_.diag_),
+                buffer_.m_, buffer_.n_, buffer_.alpha_,
+                buffer_.a_ + buffer_.offA_, buffer_.lda_,
+                buffer_.b_copy + buffer_.offB_, buffer_.ldb_);
+
+    cblas_daxpy(buffer_.lda_ * buffer_.n_, -1.0, buffer_.b_, 1, buffer_.b_copy, 1);
+    double norm_error = cblas_dnrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/
+                cblas_dnrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1);
+    printf("Error of clblas_dtrmm against cblas_dtrmm = %f \n", norm_error);
+}
+
+template<>
+void
+xTrmm<cl_float2>::
+validation()
+{
+    cblas_ctrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_),
+                clblasToCblas_fill(buffer_.uplo_),
+                clblasToCblas_operation(buffer_.trans_a_),
+		clblasToCblas_diag(buffer_.diag_),
+                buffer_.m_, buffer_.n_, &(buffer_.alpha_),
+                buffer_.a_ + buffer_.offA_, buffer_.lda_,
+                buffer_.b_copy + buffer_.offB_, buffer_.ldb_);
+
+    cl_float2 neg_one = makeScalar<cl_float2>(-1.0);
+    cblas_caxpy(buffer_.lda_ * buffer_.n_, &neg_one, buffer_.b_, 1, buffer_.b_copy, 1);
+    float norm_error = cblas_scnrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/
+                cblas_scnrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1);
+    printf("Error of clblas_ctrmm against cblas_ctrmm = %f \n", norm_error);
+}
+
+
+template<>
+void
+xTrmm<cl_double2>::
+validation()
+{
+    cblas_ztrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_),
+                clblasToCblas_fill(buffer_.uplo_),
+                clblasToCblas_operation(buffer_.trans_a_),
+		clblasToCblas_diag(buffer_.diag_),
+                buffer_.m_, buffer_.n_, &(buffer_.alpha_),
+                buffer_.a_ + buffer_.offA_, buffer_.lda_,
+                buffer_.b_copy + buffer_.offB_, buffer_.ldb_);
+
+    cl_double2 neg_one = makeScalar<cl_double2>(-1.0);
+    cblas_zaxpy(buffer_.lda_ * buffer_.n_, &neg_one, buffer_.b_, 1, buffer_.b_copy, 1);
+    double norm_error = cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/
+                cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1);
+    printf("Error of clblas_ztrmm against cblas_ztrmm = %f \n", norm_error);
+}
+
+#endif
 
 #endif // ifndef CLBLAS_BENCHMARK_XTRMM_HXX__
diff --git a/src/client/client.cpp b/src/client/client.cpp
index d067c3d..ba9c5fc 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -46,547 +46,552 @@ namespace po = boost::program_options;
 
 int main(int argc, char *argv[])
 {
-  size_t M;
-  size_t N;
-  size_t K;
-  cl_double alpha;
-  cl_double beta;
-  cl_uint profileCount;
-  cl_uint apiCallCount;
-  cl_uint commandQueueFlags = 0;
-  cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
-  int order_option;
-  //clblasOrder order;
-  //clblasTranspose transA;
-  //clblasTranspose transB;
-  int transA_option;
-  int transB_option;
-  size_t lda;
-  size_t ldb;
-  size_t ldc;
-  size_t offA;
-  size_t offBX;
-  size_t offCY;
-  std::string function;
-  std::string precision;
-  std::string roundtrip;
-  std::string memalloc;
-  int side_option;
-  int uplo_option;
-  int diag_option;
-  unsigned int numQueuesToUse;
+    size_t M;
+    size_t N;
+    size_t K;
+    cl_double alpha;
+    cl_double beta;
+    cl_uint profileCount;
+    cl_uint apiCallCount;
+    cl_uint commandQueueFlags = 0;
+    cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
+    int order_option;
+    //clblasOrder order;
+    //clblasTranspose transA;
+    //clblasTranspose transB;
+    int transA_option;
+    int transB_option;
+    size_t lda;
+    size_t ldb;
+    size_t ldc;
+    size_t offA;
+    size_t offBX;
+    size_t offCY;
+    std::string function;
+    std::string precision;
+    std::string roundtrip;
+    std::string memalloc;
+    int side_option;
+    int uplo_option;
+    int diag_option;
+    unsigned int numQueuesToUse;
+    int validate;
 
-  po::options_description desc( "clBLAS client command line options" );
-  desc.add_options()
-    ( "help,h", "produces this help message" )
-    ( "gpu,g", "Force instantiation of an OpenCL GPU device" )
-    ( "cpu,c", "Force instantiation of an OpenCL CPU device" )
-    ( "all,a", "Force instantiation of all OpenCL devices" )
-    ( "useimages", "Use an image-based kernel" )
-    ( "sizem,m", po::value<size_t>( &M )->default_value(128), "number of rows in A and C" )
-    ( "sizen,n", po::value<size_t>( &N )->default_value(128), "number of columns in B and C" )
-    ( "sizek,k", po::value<size_t>( &K )->default_value(128), "number of columns in A and rows in B" )
-    ( "lda", po::value<size_t>( &lda )->default_value(0), "first dimension of A in memory. if set to 0, lda will default to M (when transposeA is \"no transpose\") or K (otherwise)" )
-    ( "ldb", po::value<size_t>( &ldb )->default_value(0), "first dimension of B in memory. if set to 0, ldb will default to K (when transposeB is \"no transpose\") or N (otherwise)" )
-    ( "ldc", po::value<size_t>( &ldc )->default_value(0), "first dimension of C in memory. if set to 0, ldc will default to M" )
-    ( "offA", po::value<size_t>( &offA )->default_value(0), "offset of the matrix A in memory object" )
-    ( "offBX", po::value<size_t>( &offBX )->default_value(0), "offset of the matrix B or vector X in memory object" )
-    ( "offCY", po::value<size_t>( &offCY )->default_value(0), "offset of the matrix C or vector Y in memory object" )
-    ( "alpha", po::value<cl_double>( &alpha )->default_value(1.0f), "specifies the scalar alpha" )
-    ( "beta", po::value<cl_double>( &beta )->default_value(1.0f), "specifies the scalar beta" )
-    ( "order,o", po::value<int>( &order_option )->default_value(0), "0 = row major, 1 = column major" )
-    ( "transposeA", po::value<int>( &transA_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" )
-    ( "transposeB", po::value<int>( &transB_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" )
-    ( "function,f", po::value<std::string>( &function )->default_value("gemm"), "BLAS function to test. Options: gemm, trsm, trmm, gemv, symv, syrk, syr2k" )
-    ( "precision,r", po::value<std::string>( &precision )->default_value("s"), "Options: s,d,c,z" )
-    ( "side", po::value<int>( &side_option )->default_value(0), "0 = left, 1 = right. only used with [list of function families]" ) // xtrsm xtrmm
-    ( "uplo", po::value<int>( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" )  // xsymv xsyrk xsyr2k xtrsm xtrmm
-    ( "diag", po::value<int>( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm
-    ( "profile,p", po::value<cl_uint>( &profileCount )->default_value(20), "Time and report the kernel speed (default: 20)" )
-	( "apiCallCount", po::value<cl_uint>(&apiCallCount)->default_value(10), "Time and report the kernel speed on counds of API calls (default: 10)")
-	( "numQueues", po::value<unsigned int>(&numQueuesToUse)->default_value(1), "Number of cl_command_queues to use( default: 1)")
-	( "roundtrip", po::value<std::string>( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)")
-	( "memalloc", po::value<std::string>( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd,rect_mem")
-    ;
+    po::options_description desc( "clBLAS client command line options" );
+    desc.add_options()
+        ( "help,h", "produces this help message" )
+        ( "gpu,g", "Force instantiation of an OpenCL GPU device" )
+        ( "cpu,c", "Force instantiation of an OpenCL CPU device" )
+        ( "all,a", "Force instantiation of all OpenCL devices" )
+        ( "useimages", "Use an image-based kernel" )
+        ( "sizem,m", po::value<size_t>( &M )->default_value(128), "number of rows in A and C" )
+        ( "sizen,n", po::value<size_t>( &N )->default_value(128), "number of columns in B and C" )
+        ( "sizek,k", po::value<size_t>( &K )->default_value(128), "number of columns in A and rows in B" )
+        ( "lda", po::value<size_t>( &lda )->default_value(0), "first dimension of A in memory. if set to 0, lda will default to M (when transposeA is \"no transpose\") or K (otherwise)" )
+        ( "ldb", po::value<size_t>( &ldb )->default_value(0), "first dimension of B in memory. if set to 0, ldb will default to K (when transposeB is \"no transpose\") or N (otherwise)" )
+        ( "ldc", po::value<size_t>( &ldc )->default_value(0), "first dimension of C in memory. if set to 0, ldc will default to M" )
+        ( "offA", po::value<size_t>( &offA )->default_value(0), "offset of the matrix A in memory object" )
+        ( "offBX", po::value<size_t>( &offBX )->default_value(0), "offset of the matrix B or vector X in memory object" )
+        ( "offCY", po::value<size_t>( &offCY )->default_value(0), "offset of the matrix C or vector Y in memory object" )
+        ( "alpha", po::value<cl_double>( &alpha )->default_value(1.0f), "specifies the scalar alpha" )
+        ( "beta", po::value<cl_double>( &beta )->default_value(1.0f), "specifies the scalar beta" )
+        ( "order,o", po::value<int>( &order_option )->default_value(1), "0 = row major, 1 = column major" )
+        ( "transposeA", po::value<int>( &transA_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" )
+        ( "transposeB", po::value<int>( &transB_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" )
+        ( "function,f", po::value<std::string>( &function )->default_value("gemm"), "BLAS function to test. Options: gemm, trsm, trmm, gemv, symv, syrk, syr2k" )
+        ( "precision,r", po::value<std::string>( &precision )->default_value("s"), "Options: s,d,c,z" )
+        ( "side", po::value<int>( &side_option )->default_value(0), "0 = left, 1 = right. only used with [list of function families]" ) // xtrsm xtrmm
+        ( "uplo", po::value<int>( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" )    // xsymv xsyrk xsyr2k xtrsm xtrmm
+        ( "diag", po::value<int>( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm
+        ( "profile,p", po::value<cl_uint>( &profileCount )->default_value(20), "Time and report the kernel speed (default: 20)" )
+        ( "apiCallCount", po::value<cl_uint>(&apiCallCount)->default_value(10), "Time and report the kernel speed on counds of API calls (default: 10)")
+        ( "numQueues", po::value<unsigned int>(&numQueuesToUse)->default_value(1), "Number of cl_command_queues to use( default: 1)")
+        ( "roundtrip", po::value<std::string>( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)")
+        ( "memalloc", po::value<std::string>( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd,rect_mem")
+        ( "validate,v", po::value<int>(&validate)->default_value(0), "Validate GPU results with CPU BLAS? 0 = No, 1 = Yes (default: No): currently only available for gemm and trmm")
+        ;
 
-  po::variables_map vm;
-  po::store( po::parse_command_line( argc, argv, desc ), vm );
-  po::notify( vm );
+    po::variables_map vm;
+    po::store( po::parse_command_line( argc, argv, desc ), vm );
+    po::notify( vm );
 
-  if( vm.count( "help" ) )
-  {
-    std::cout << desc << std::endl;
-    return 0;
-  }
+    if( vm.count( "help" ) )
+    {
+        std::cout << desc << std::endl;
+        return 0;
+    }
 
-  if( function != "gemm"
-      && function != "trsm"
-      && function != "trmm"
-      && function != "gemv"
-      && function != "symv"
-      && function != "syrk"
-      && function != "syr2k"
-      && function != "trsv"
-      && function != "trmv"
-      && function != "ger"
-      && function != "syr"
-      && function != "syr2"
-      && function != "geru"
-      && function != "gerc"
-      && function != "her"
-      && function != "her2"
-      && function != "hemv"
-      && function != "hemm"
-      && function != "symm"
-	  && function != "herk"
-	  && function != "her2k"
-      )
-  {
-    std::cerr << "Invalid value for --function" << std::endl;
-    return -1;
-  }
+    if( function != "gemm"
+            && function != "trsm"
+            && function != "trmm"
+            && function != "gemv"
+            && function != "symv"
+            && function != "syrk"
+            && function != "syr2k"
+            && function != "trsv"
+            && function != "trmv"
+            && function != "ger"
+            && function != "syr"
+            && function != "syr2"
+            && function != "geru"
+            && function != "gerc"
+            && function != "her"
+            && function != "her2"
+            && function != "hemv"
+            && function != "hemm"
+            && function != "symm"
+            && function != "herk"
+            && function != "her2k"
+            )
+    {
+        std::cerr << "Invalid value for --function" << std::endl;
+        return -1;
+    }
 
-  if( precision != "s" && precision != "d" && precision != "c" && precision != "z" )
-  {
-    std::cerr << "Invalid value for --precision" << std::endl;
-    return -1;
-  }
+    if( precision != "s" && precision != "d" && precision != "c" && precision != "z" )
+    {
+        std::cerr << "Invalid value for --precision" << std::endl;
+        return -1;
+    }
 
-  size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0)
-    | ((vm.count( "cpu" ) > 0) ? 2 : 0)
-    | ((vm.count( "all" ) > 0) ? 4 : 0);
-  if((mutex & (mutex-1)) != 0) {
-    std::cerr << "You have selected mutually-exclusive OpenCL device options:" << std::endl;
-    if (vm.count ( "gpu" )  > 0) std::cerr << "    gpu,g   Force instantiation of an OpenCL GPU device" << std::endl;
-    if (vm.count ( "cpu" )  > 0) std::cerr << "    cpu,c   Force instantiation of an OpenCL CPU device" << std::endl;
-    if (vm.count ( "all" )  > 0) std::cerr << "    all,a   Force instantiation of all OpenCL devices" << std::endl;
-    return 1;
-  }
+    size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0)
+        | ((vm.count( "cpu" ) > 0) ? 2 : 0)
+        | ((vm.count( "all" ) > 0) ? 4 : 0);
+    if((mutex & (mutex-1)) != 0) {
+        std::cerr << "You have selected mutually-exclusive OpenCL device options:" << std::endl;
+        if (vm.count ( "gpu" )    > 0) std::cerr << "        gpu,g     Force instantiation of an OpenCL GPU device" << std::endl;
+        if (vm.count ( "cpu" )    > 0) std::cerr << "        cpu,c     Force instantiation of an OpenCL CPU device" << std::endl;
+        if (vm.count ( "all" )    > 0) std::cerr << "        all,a     Force instantiation of all OpenCL devices" << std::endl;
+        return 1;
+    }
 
-  if( vm.count( "gpu" ) )
-  {
-    deviceType	= CL_DEVICE_TYPE_GPU;
-  }
+    if( vm.count( "gpu" ) )
+    {
+        deviceType        = CL_DEVICE_TYPE_GPU;
+    }
 
-  if( vm.count( "cpu" ) )
-  {
-    deviceType	= CL_DEVICE_TYPE_CPU;
-  }
+    if( vm.count( "cpu" ) )
+    {
+        deviceType        = CL_DEVICE_TYPE_CPU;
+    }
 
-  if( vm.count( "all" ) )
-  {
-    deviceType	= CL_DEVICE_TYPE_ALL;
-  }
+    if( vm.count( "all" ) )
+    {
+        deviceType        = CL_DEVICE_TYPE_ALL;
+    }
 
-  if( profileCount >= 1 )
-  {
-    commandQueueFlags |= CL_QUEUE_PROFILING_ENABLE;
-  }
+    if( profileCount >= 1 )
+    {
+        commandQueueFlags |= CL_QUEUE_PROFILING_ENABLE;
+    }
 
-  bool useimages;
-  if( vm.count("useimages") )
-    useimages = true;
-  else
-    useimages = false;
+    bool useimages;
+    if( vm.count("useimages") )
+        useimages = true;
+    else
+        useimages = false;
 
-  StatisticalTimer& timer = StatisticalTimer::getInstance( );
-  timer.Reserve( 3, profileCount );
-  timer.setNormalize( true );
+    StatisticalTimer& timer = StatisticalTimer::getInstance( );
+    timer.Reserve( 3, profileCount );
+    timer.setNormalize( true );
 
-  clblasFunc *my_function = NULL;
-  if (function == "gemm")
-  {
-    if (precision == "s")
-      my_function = new xGemm<cl_float>(timer, deviceType, numQueuesToUse);
-    else if (precision == "d")
-      my_function = new xGemm<cl_double>(timer, deviceType, numQueuesToUse);
-    else if (precision == "c")
-      my_function = new xGemm<cl_float2>(timer, deviceType, numQueuesToUse);
-    else if (precision == "z")
-      my_function = new xGemm<cl_double2>(timer, deviceType, numQueuesToUse);
-    else
-    {
-      std::cerr << "Unknown gemm function" << std::endl;
-      return -1;
-    }
-  }
-  else if (function == "trsm")
-  {
-    if (precision == "s")
-      my_function = new xTrsm<cl_float>(timer, deviceType);
-    else if (precision == "d")
-      my_function = new xTrsm<cl_double>(timer, deviceType);
-    else if (precision == "c")
-      my_function = new xTrsm<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xTrsm<cl_double2>(timer, deviceType);
-    else
-    {
-      std::cerr << "Unknown trsm function" << std::endl;
-      return -1;
-    }
-  }
-  else if (function == "trmm")
-  {
-    if (precision == "s")
-      my_function = new xTrmm<cl_float>(timer, deviceType);
-    else if (precision == "d")
-      my_function = new xTrmm<cl_double>(timer, deviceType);
-    else if (precision == "c")
-      my_function = new xTrmm<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xTrmm<cl_double2>(timer, deviceType);
-    else
+    clblasFunc *my_function = NULL;
+    if (function == "gemm")
     {
-      std::cerr << "Unknown trmm function" << std::endl;
-      return -1;
-    }
-  }
-  else if (function == "gemv")
-  {
-    if (precision == "s")
-      my_function = new xGemv<cl_float>(timer, deviceType);
-    else if (precision == "d")
-      my_function = new xGemv<cl_double>(timer, deviceType);
-    else if (precision == "c")
-      my_function = new xGemv<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xGemv<cl_double2>(timer, deviceType);
-    else
+        if (precision == "s")
+            my_function = new xGemm<cl_float>(timer, deviceType, numQueuesToUse);
+        else if (precision == "d")
+            my_function = new xGemm<cl_double>(timer, deviceType, numQueuesToUse);
+        else if (precision == "c")
+            my_function = new xGemm<cl_float2>(timer, deviceType, numQueuesToUse);
+        else if (precision == "z")
+            my_function = new xGemm<cl_double2>(timer, deviceType, numQueuesToUse);
+        else
+        {
+            std::cerr << "Unknown gemm function" << std::endl;
+            return -1;
+        }
+    }
+    else if (function == "trsm")
     {
-      std::cerr << "Unknown gemv function" << std::endl;
-      return -1;
+        if (precision == "s")
+            my_function = new xTrsm<cl_float>(timer, deviceType);
+        else if (precision == "d")
+            my_function = new xTrsm<cl_double>(timer, deviceType);
+        else if (precision == "c")
+            my_function = new xTrsm<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xTrsm<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown trsm function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "symv")
-  {
-    if (precision == "s")
-      my_function = new xSymv<cl_float>(timer, deviceType);
-    else if (precision == "d")
-      my_function = new xSymv<cl_double>(timer, deviceType);
-    else
+    else if (function == "trmm")
     {
-      std::cerr << "Unknown symv function" << std::endl;
-      return -1;
+        if (precision == "s")
+            my_function = new xTrmm<cl_float>(timer, deviceType);
+        else if (precision == "d")
+            my_function = new xTrmm<cl_double>(timer, deviceType);
+        else if (precision == "c")
+            my_function = new xTrmm<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xTrmm<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown trmm function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "syrk")
-  {
-    if (precision == "s")
-      my_function = new xSyrk<cl_float>(timer, deviceType);
-    else if (precision == "d")
-      my_function = new xSyrk<cl_double>(timer, deviceType);
+    else if (function == "gemv")
+    {
+        if (precision == "s")
+            my_function = new xGemv<cl_float>(timer, deviceType);
+        else if (precision == "d")
+            my_function = new xGemv<cl_double>(timer, deviceType);
         else if (precision == "c")
-             my_function = new xSyrk<cl_float2>(timer, deviceType);
+            my_function = new xGemv<cl_float2>(timer, deviceType);
         else if (precision == "z")
-             my_function = new xSyrk<cl_double2>(timer, deviceType);
-    else
+            my_function = new xGemv<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown gemv function" << std::endl;
+            return -1;
+        }
+    }
+    else if (function == "symv")
     {
-      std::cerr << "Unknown syrk function" << std::endl;
-      return -1;
-    }
-  }
-  else if (function == "syr2k")
-  {
-    if (precision == "s")
-      my_function = new xSyr2k<cl_float>(timer, deviceType);
-    else if (precision == "d")
-      my_function = new xSyr2k<cl_double>(timer, deviceType);
-    else if (precision == "c")
-      my_function = new xSyr2k<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xSyr2k<cl_double2>(timer, deviceType);
-    else
+        if (precision == "s")
+            my_function = new xSymv<cl_float>(timer, deviceType);
+        else if (precision == "d")
+            my_function = new xSymv<cl_double>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown symv function" << std::endl;
+            return -1;
+        }
+    }
+    else if (function == "syrk")
     {
-      std::cerr << "Unknown syr2k function" << std::endl;
-      return -1;
-    }
-  }
-  else if (function == "trsv")
-  {
-    if (precision == "s")
-      my_function = new xTrsv<cl_float>(timer, deviceType);
-    else if (precision == "d")
-      my_function = new xTrsv<cl_double>(timer, deviceType);
-    else if (precision == "c")
-      my_function = new xTrsv<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xTrsv<cl_double2>(timer, deviceType);
-    else
+        if (precision == "s")
+            my_function = new xSyrk<cl_float>(timer, deviceType);
+        else if (precision == "d")
+            my_function = new xSyrk<cl_double>(timer, deviceType);
+                else if (precision == "c")
+                         my_function = new xSyrk<cl_float2>(timer, deviceType);
+                else if (precision == "z")
+                         my_function = new xSyrk<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown syrk function" << std::endl;
+            return -1;
+        }
+    }
+    else if (function == "syr2k")
     {
-      std::cerr << "Unknown trsv function" << std::endl;
-      return -1;
-    }
-  }
-  else if (function == "trmv")
-  {
-    if (precision == "s")
-      my_function = new xTrmv<cl_float>(timer, deviceType);
-    else if (precision == "d")
-      my_function = new xTrmv<cl_double>(timer, deviceType);
-    else if (precision == "c")
-      my_function = new xTrmv<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xTrmv<cl_double2>(timer, deviceType);
-    else
+        if (precision == "s")
+            my_function = new xSyr2k<cl_float>(timer, deviceType);
+        else if (precision == "d")
+            my_function = new xSyr2k<cl_double>(timer, deviceType);
+        else if (precision == "c")
+            my_function = new xSyr2k<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xSyr2k<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown syr2k function" << std::endl;
+            return -1;
+        }
+    }
+    else if (function == "trsv")
+    {
+        if (precision == "s")
+            my_function = new xTrsv<cl_float>(timer, deviceType);
+        else if (precision == "d")
+            my_function = new xTrsv<cl_double>(timer, deviceType);
+        else if (precision == "c")
+            my_function = new xTrsv<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xTrsv<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown trsv function" << std::endl;
+            return -1;
+        }
+    }
+    else if (function == "trmv")
     {
-      std::cerr << "Unknown trmv function" << std::endl;
-      return -1;
+        if (precision == "s")
+            my_function = new xTrmv<cl_float>(timer, deviceType);
+        else if (precision == "d")
+            my_function = new xTrmv<cl_double>(timer, deviceType);
+        else if (precision == "c")
+            my_function = new xTrmv<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xTrmv<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown trmv function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "ger")
-  {
-    if (precision == "s")
-      my_function = new xGer<cl_float>(timer, deviceType);
-    else if (precision == "d")
-          my_function = new xGer<cl_double>(timer, deviceType);
-    else
+    else if (function == "ger")
     {
-      std::cerr << "Unknown ger function" << std::endl;
-      return -1;
+        if (precision == "s")
+            my_function = new xGer<cl_float>(timer, deviceType);
+        else if (precision == "d")
+                    my_function = new xGer<cl_double>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown ger function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "syr")
-  {
-    if (precision == "s")
-      my_function = new xSyr<cl_float>(timer, deviceType);
-    else if (precision == "d")
-      my_function = new xSyr<cl_double>(timer, deviceType);
-    else
+    else if (function == "syr")
     {
-      std::cerr << "Unknown syr function" << std::endl;
-      return -1;
+        if (precision == "s")
+            my_function = new xSyr<cl_float>(timer, deviceType);
+        else if (precision == "d")
+            my_function = new xSyr<cl_double>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown syr function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "syr2")
-  {
-    if (precision == "s")
-      my_function = new xSyr2<cl_float>(timer, deviceType);
-    else if (precision == "d")
-      my_function = new xSyr2<cl_double>(timer, deviceType);
-    else
+    else if (function == "syr2")
     {
-      std::cerr << "Unknown syr2 function" << std::endl;
-      return -1;
+        if (precision == "s")
+            my_function = new xSyr2<cl_float>(timer, deviceType);
+        else if (precision == "d")
+            my_function = new xSyr2<cl_double>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown syr2 function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "geru")
-  {
-    if (precision == "c")
-      my_function = new xGeru<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xGeru<cl_double2>(timer, deviceType);
-    else
+    else if (function == "geru")
     {
-      std::cerr << "Unknown geru function" << std::endl;
-      return -1;
+        if (precision == "c")
+            my_function = new xGeru<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xGeru<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown geru function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "gerc")
-  {
-    if (precision == "c")
-      my_function = new xGerc<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xGerc<cl_double2>(timer, deviceType);
-    else
+    else if (function == "gerc")
     {
-      std::cerr << "Unknown gerc function" << std::endl;
-      return -1;
+        if (precision == "c")
+            my_function = new xGerc<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xGerc<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown gerc function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "her")
-  {
-    if (precision == "c")
-      my_function = new xHer<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xHer<cl_double2>(timer, deviceType);
-    else
+    else if (function == "her")
     {
-      std::cerr << "Unknown her function" << std::endl;
-      return -1;
+        if (precision == "c")
+            my_function = new xHer<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xHer<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown her function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "her2")
-  {
-    if (precision == "c")
-      my_function = new xHer2<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xHer2<cl_double2>(timer, deviceType);
-    else
+    else if (function == "her2")
     {
-      std::cerr << "Unknown her2 function" << std::endl;
-      return -1;
+        if (precision == "c")
+            my_function = new xHer2<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xHer2<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown her2 function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "hemv")
-  {
-    if (precision == "c")
-      my_function = new xHemv<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xHemv<cl_double2>(timer, deviceType);
-    else
+    else if (function == "hemv")
     {
-      std::cerr << "Unknown hemv function" << std::endl;
-      return -1;
+        if (precision == "c")
+            my_function = new xHemv<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xHemv<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown hemv function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "hemm")
-  {
-    if (precision == "c")
-      my_function = new xHemm<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xHemm<cl_double2>(timer, deviceType);
-    else
+    else if (function == "hemm")
     {
-      std::cerr << "Unknown hemm function" << std::endl;
-      return -1;
+        if (precision == "c")
+            my_function = new xHemm<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xHemm<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown hemm function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "herk")
-  {
-    if (precision == "c")
-      my_function = new xHerk<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xHerk<cl_double2>(timer, deviceType);
-    else
+    else if (function == "herk")
     {
-      std::cerr << "Unknown her function" << std::endl;
-      return -1;
+        if (precision == "c")
+            my_function = new xHerk<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xHerk<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown her function" << std::endl;
+            return -1;
+        }
     }
-  }
-  else if (function == "her2k")
-  {
-    if (precision == "c")
-      my_function = new xHer2k<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xHer2k<cl_double2>(timer, deviceType);
-    else
+    else if (function == "her2k")
     {
-      std::cerr << "Unknown her2 function" << std::endl;
-      return -1;
-    }
-  }
-  else if (function == "symm")
-  {
-    if (precision == "s")
-      my_function = new xSymm<cl_float>(timer, deviceType);
-    else if (precision == "d")
-      my_function = new xSymm<cl_double>(timer, deviceType);
-    else if (precision == "c")
-      my_function = new xSymm<cl_float2>(timer, deviceType);
-    else if (precision == "z")
-      my_function = new xSymm<cl_double2>(timer, deviceType);
-    else
+        if (precision == "c")
+            my_function = new xHer2k<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xHer2k<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown her2 function" << std::endl;
+            return -1;
+        }
+    }
+    else if (function == "symm")
     {
-      std::cerr << "Unknown symm function" << std::endl;
-      return -1;
+        if (precision == "s")
+            my_function = new xSymm<cl_float>(timer, deviceType);
+        else if (precision == "d")
+            my_function = new xSymm<cl_double>(timer, deviceType);
+        else if (precision == "c")
+            my_function = new xSymm<cl_float2>(timer, deviceType);
+        else if (precision == "z")
+            my_function = new xSymm<cl_double2>(timer, deviceType);
+        else
+        {
+            std::cerr << "Unknown symm function" << std::endl;
+            return -1;
+        }
     }
-  }
-  try
-  {
-      my_function->setup_buffer( order_option, side_option, uplo_option,
-                                 diag_option, transA_option, transB_option,
+    try
+    {
+        my_function->setup_buffer( order_option, side_option, uplo_option,
+                                   diag_option, transA_option, transB_option,
                                    M, N, K, lda, ldb, ldc, offA, offBX, offCY,
                                    alpha, beta );
 
-
-      my_function->initialize_cpu_buffer();
-      my_function->initialize_gpu_buffer();
-	  my_function->setup_apiCallCount(apiCallCount);
-	  my_function->call_func(); // do a calculation first to get any compilation out of the way
-      my_function->reset_gpu_write_buffer(); // reset GPU write buffer
-  }
-  catch( std::exception& exc )
-  {
-      std::cerr << exc.what( ) << std::endl;
-      return 1;
-  }
-  if(roundtrip=="roundtrip"||roundtrip=="both")
-  {
-  timer.Reset();
-  for( cl_uint i = 0; i < profileCount; ++i )
-  {
-    my_function->roundtrip_setup_buffer( order_option, side_option, uplo_option,
-                                 diag_option, transA_option, transB_option,
+        my_function->initialize_cpu_buffer();
+        my_function->initialize_gpu_buffer();
+        my_function->setup_apiCallCount(apiCallCount);
+        my_function->call_func(); // do a calculation first to get any compilation out of the way
+        my_function->reset_gpu_write_buffer(); // reset GPU write buffer
+    }
+    catch( std::exception& exc )
+    {
+        std::cerr << exc.what( ) << std::endl;
+        return 1;
+    }
+    if(roundtrip=="roundtrip"||roundtrip=="both")
+    {
+        timer.Reset();
+        for( cl_uint i = 0; i < profileCount; ++i )
+        {
+            my_function->roundtrip_setup_buffer( order_option, side_option, uplo_option,
+                                   diag_option, transA_option, transB_option,
                                    M, N, K, lda, ldb, ldc, offA, offBX, offCY,
                                    alpha, beta );
 
 
-    my_function->initialize_cpu_buffer();
-    /*my_function->initialize_gpu_buffer();
-    my_function->call_func();
-	my_function->read_gpu_buffer();
-    my_function->reset_gpu_write_buffer();*/
-	
-	if(memalloc=="default")
-	{
-		my_function->roundtrip_func();
-	}
-	else if (memalloc=="alloc_host_ptr")
-	{
-		my_function->allochostptr_roundtrip_func();
-	}
-	else if (memalloc=="use_host_ptr")
-	{
-		my_function->usehostptr_roundtrip_func();
-	}
-	else if (memalloc=="copy_host_ptr")
-	{
-		my_function->copyhostptr_roundtrip_func();
-	}
-	else if (memalloc=="use_persistent_mem_amd")
-	{
-		my_function->usepersismem_roundtrip_func();
-	}
-	else if (memalloc=="rect_mem")
-	{
-		my_function->roundtrip_func_rect();
-	}
-	//my_function->reset_gpu_write_buffer();
-	my_function->releaseGPUBuffer_deleteCPUBuffer();
-  }
+            my_function->initialize_cpu_buffer();
+            /*my_function->initialize_gpu_buffer();
+            my_function->call_func();
+            my_function->read_gpu_buffer();
+            my_function->reset_gpu_write_buffer();*/
+
+            if(memalloc=="default")
+            {
+                    my_function->roundtrip_func();
+            }
+            else if (memalloc=="alloc_host_ptr")
+            {
+                    my_function->allochostptr_roundtrip_func();
+            }
+            else if (memalloc=="use_host_ptr")
+            {
+                    my_function->usehostptr_roundtrip_func();
+            }
+            else if (memalloc=="copy_host_ptr")
+            {
+                    my_function->copyhostptr_roundtrip_func();
+            }
+            else if (memalloc=="use_persistent_mem_amd")
+            {
+                    my_function->usepersismem_roundtrip_func();
+            }
+            else if (memalloc=="rect_mem")
+            {
+                    my_function->roundtrip_func_rect();
+            }
+        //my_function->reset_gpu_write_buffer();
+            my_function->releaseGPUBuffer_deleteCPUBuffer();
+        }
 
-  if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
-  {
-    //std::cout << timer << std::endl;
-    timer.pruneOutliers( 3.0 );
-    std::cout << "BLAS (round trip) execution time < ns >: " << my_function->time_in_ns() << std::endl;
-    std::cout << "BLAS (round trip) execution Gflops < " <<
-      my_function->gflops_formula() << " >: " << my_function->gflops() <<
-      std::endl;
-  }
-  }
-  if(roundtrip=="noroundtrip"||roundtrip=="both")
-  {
-    timer.Reset();
-    my_function->setup_buffer( order_option, side_option, uplo_option,
-                                 diag_option, transA_option, transB_option,
+        if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
+        {
+            //std::cout << timer << std::endl;
+            timer.pruneOutliers( 3.0 );
+            std::cout << "BLAS (round trip) execution time < ns >: " << my_function->time_in_ns() << std::endl;
+            std::cout << "BLAS (round trip) execution Gflops < " <<
+                my_function->gflops_formula() << " >: " << my_function->gflops() <<
+                std::endl;
+        }
+    }
+    if(roundtrip=="noroundtrip"||roundtrip=="both")
+    {
+        timer.Reset();
+        my_function->setup_buffer( order_option, side_option, uplo_option,
+                                   diag_option, transA_option, transB_option,
                                    M, N, K, lda, ldb, ldc, offA, offBX, offCY,
                                    alpha, beta );
 
 
-    my_function->initialize_cpu_buffer();
-    my_function->initialize_gpu_buffer();
-	my_function->setup_apiCallCount( apiCallCount );
+        my_function->initialize_cpu_buffer();
+        my_function->initialize_gpu_buffer();
+        my_function->setup_apiCallCount( apiCallCount );
+        
+        
 	for (cl_uint i = 0; i < profileCount; ++i)
-    {
-		my_function->call_func();
-	}
-	my_function->read_gpu_buffer();
-    //my_function->reset_gpu_write_buffer();
-	my_function->releaseGPUBuffer_deleteCPUBuffer();
+        {
+            my_function->call_func();
+        }
+        my_function->read_gpu_buffer();
 
-  if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
-  {
-    //std::cout << timer << std::endl;
-    timer.pruneOutliers( 3.0 );
-    std::cout << "BLAS kernel execution time < ns >: " << my_function->time_in_ns() / apiCallCount << std::endl;
-    std::cout << "BLAS kernel execution Gflops < " <<
-      my_function->gflops_formula() << " >: " << my_function->gflops() <<
-      std::endl;
-  }
-  }
-  delete my_function;
-  return 0;
-}
+	my_function->validate_with_cblas(validate);
+
+        //my_function->reset_gpu_write_buffer();
+        my_function->releaseGPUBuffer_deleteCPUBuffer();
 
+        if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
+        {
+            //std::cout << timer << std::endl;
+            timer.pruneOutliers( 3.0 );
+            std::cout << "BLAS kernel execution time < ns >: " << my_function->time_in_ns() / apiCallCount << std::endl;
+            std::cout << "BLAS kernel execution Gflops < " <<
+                my_function->gflops_formula() << " >: " << my_function->gflops() <<
+                std::endl;
+        }
+    }
+    delete my_function;
+    return 0;
+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git



More information about the debian-science-commits mailing list