[clblas] 54/125: add memalloc options to client (gemm and trsm)
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Fri May 29 06:57:22 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clblas.
commit d481f72992b552724c3c8b2596263a0ee8bda2a2
Author: Timmy <timmy.liu at amd.com>
Date: Wed Feb 26 15:43:08 2014 -0600
add memalloc options to client (gemm and trsm)
---
src/client/clfunc_common.hpp | 10 +
src/client/clfunc_xgemm.hpp | 490 ++++++++++++++++-----------------
src/client/clfunc_xher2k.hpp | 4 +
src/client/clfunc_xherk.hpp | 4 +
src/client/clfunc_xsymm.hpp | 4 +
src/client/clfunc_xsyr2k.hpp | 4 +
src/client/clfunc_xsyrk.hpp | 4 +
src/client/clfunc_xtrmm.hpp | 6 +-
src/client/clfunc_xtrsm.hpp | 409 ++++++++++++++-------------
src/client/client.cpp | 26 +-
src/scripts/perf/measurePerformance.py | 9 +-
11 files changed, 512 insertions(+), 458 deletions(-)
diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp
index 293a3b6..5f73613 100644
--- a/src/client/clfunc_common.hpp
+++ b/src/client/clfunc_common.hpp
@@ -28,6 +28,11 @@
#include "dis_warning.h"
#include "clBLAS.h"
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/cl_ext.h>
+#else
+#include <CL/cl_ext.h>
+#endif
template<typename T>
static T
@@ -243,6 +248,7 @@ public:
OPENCL_V_THROW(err, "creating context");
queue_ = clCreateCommandQueue(ctx_, device_, 0, &err);
+
timer_id = timer.getUniqueID( "clfunc", 0 );
@@ -307,6 +313,10 @@ public:
virtual void reset_gpu_write_buffer() = 0;
virtual void read_gpu_buffer() = 0;
virtual void roundtrip_func() = 0;
+ virtual void allochostptr_roundtrip_func() {}
+ virtual void usehostptr_roundtrip_func() {}
+ virtual void copyhostptr_roundtrip_func() {}
+ virtual void usepersismem_roundtrip_func() {}
virtual void roundtrip_setup_buffer(int order_option, int side_option,
int uplo_option, int diag_option, int
transA_option, int transB_option,
diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp
index c5f706c..df84392 100644
--- a/src/client/clfunc_xgemm.hpp
+++ b/src/client/clfunc_xgemm.hpp
@@ -66,7 +66,9 @@ public:
void call_func()
{
- std::cout << "xGemm::call_func\n";
+ timer.Start(timer_id);
+ xGemm_Function(true);
+ timer.Stop(timer_id);
}
double gflops()
@@ -411,7 +413,215 @@ public:
void roundtrip_func()
{
- std::cout << "xGemm::roundtrip_func\n";
+ timer.Start(timer_id);
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ NULL, &err);
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+ buffer_.offA_ * sizeof(T),
+ buffer_.lda_ * buffer_.a_num_vectors_ *
+ sizeof(T),
+ buffer_.a_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(T),
+ buffer_.ldb_ * buffer_.b_num_vectors_ *
+ sizeof(T),
+ buffer_.b_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(T),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.c_, 0, NULL, NULL);
+ xGemm_Function(false);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.c_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void allochostptr_roundtrip_func()
+ {
+ timer.Start(timer_id);
+ // Timed round trip: create the buffers, upload A/B/C through mapped pointers, run GEMM, copy C back.
+ cl_int err; // NOTE(review): err is overwritten by every call below and never checked
+ // Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ NULL, &err);
+
+ // map each buffer to a host pointer; every mapping spans exactly the size its buffer was created with
+ T *map_a,*map_b,*map_c;
+ map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldb_*buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldc_*buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ // memcpy the input A, B, C to the host pointers (NOTE(review): starts at element 0, while roundtrip_func writes at off*_ -- confirm)
+ memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+ memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+ memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+ // unmap the buffers
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, NULL);
+ // calling clBLAS (false: the helper does not wait on the GEMM event; the blocking map below does)
+ xGemm_Function(false);
+ // map the C buffer again to read output
+ map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0,
+ (buffer_.ldc_*buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ memcpy( buffer_.c_, map_c, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); // copy the result out of the read-only mapping into host C
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+
+ timer.Stop(timer_id);
+ }
+ void usehostptr_roundtrip_func() // timed round trip with CL_MEM_USE_HOST_PTR buffers created over the host A/B/C arrays
+ {
+ timer.Start(timer_id);
+ cl_int err; // NOTE(review): overwritten by each call below and never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ buffer_.a_, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ buffer_.b_, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ buffer_.c_, &err);
+ xGemm_Function(false); // false: the helper does not wait on the GEMM event itself
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C starting at byte offset offC_ * sizeof(T)
+ buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.c_, 0, NULL, &event_); // NOTE(review): destination is the same buffer_.c_ passed to clCreateBuffer as host_ptr -- confirm intended
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void copyhostptr_roundtrip_func() // timed round trip with buffers initialised from host A/B/C via CL_MEM_COPY_HOST_PTR
+ {
+ timer.Start(timer_id);
+ cl_int err; // NOTE(review): overwritten by each call below and never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ buffer_.a_, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ buffer_.b_, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ buffer_.c_, &err);
+ xGemm_Function(false); // false: the helper does not wait on the GEMM event itself
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C starting at byte offset offC_ * sizeof(T)
+ buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.c_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void usepersismem_roundtrip_func()
+ {
+#if defined(CL_MEM_USE_PERSISTENT_MEM_AMD)
+ timer.Start(timer_id);
+ // Timed round trip: create the buffers, upload A/B/C through mapped pointers, run GEMM, copy C back.
+ cl_int err; // NOTE(review): err is overwritten by every call below and never checked
+
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_PERSISTENT_MEM_AMD,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ NULL, &err);
+
+ // map each buffer to a host pointer; every mapping spans exactly the size its buffer was created with
+ T *map_a,*map_b,*map_c;
+ map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldb_*buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldc_*buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ // memcpy the input A, B, C to the host pointers (NOTE(review): starts at element 0, while roundtrip_func writes at off*_ -- confirm)
+ memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+ memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+ memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+ // unmap the buffers
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, NULL);
+ // calling clBLAS (false: the helper does not wait on the GEMM event; the blocking map below does)
+ xGemm_Function(false);
+ // map the C buffer again to read output
+ map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0,
+ (buffer_.ldc_*buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ memcpy( buffer_.c_, map_c, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); // copy the result out of the read-only mapping into host C
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+
+ timer.Stop(timer_id);
+#else
+ std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<<std::endl;
+#endif
+
}
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
@@ -674,296 +884,86 @@ protected:
private:
xGemmBuffer<T> buffer_;
-
-}; // class xgemm
+ void xGemm_Function(bool flush);
+}; // class xgemm
template<>
-void
+void
xGemm<cl_float>::
-call_func()
+xGemm_Function(bool flush)
{
- timer.Start(timer_id);
-
clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm<cl_float>::
-roundtrip_func()
-{
- timer.Start(timer_id);
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_*buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_float),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_float),
- NULL, &err);
-
- buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
- (buffer_.ldc_ * buffer_.c_num_vectors_ +
- buffer_.offC_) * sizeof(cl_float),
- NULL, &err);
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_float),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_float),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_float),
- buffer_.b_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_float),
- buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_float),
- buffer_.c_, 0, NULL, NULL);
- clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
- buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
- buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
- err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_float), buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_float),
- buffer_.c_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ //flush==true if only the kernel time (library call) is timed
+ //flush==false if memory time is also timed
+ if (flush==true)
+ {
+ clWaitForEvents(1, &event_);
+ }
}
-
template<>
-void
+void
xGemm<cl_double>::
-call_func()
+xGemm_Function(bool flush)
{
- timer.Start(timer_id);
-
clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm<cl_double>::
-roundtrip_func()
+ //flush==true if only the kernel time (library call) is timed
+ //flush==false if memory time is also timed
+ if (flush==true)
{
- timer.Start(timer_id);
- cl_int err;
- //set up buffer
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_*buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_double),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_double),
- NULL, &err);
-
- buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
- (buffer_.ldc_ * buffer_.c_num_vectors_ +
- buffer_.offC_) * sizeof(cl_double),
- NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_double),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_double),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_double),
- buffer_.b_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_double),
- buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_double),
- buffer_.c_, 0, NULL, NULL);
- //call_func
- clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
- buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
- buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_double), buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_double),
- buffer_.c_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ clWaitForEvents(1, &event_);
}
+}
template<>
-void
+void
xGemm<cl_float2>::
-call_func()
+xGemm_Function(bool flush)
{
- timer.Start(timer_id);
-
- clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+ clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm<cl_float2>::
-roundtrip_func()
+ //flush==true if only the kernel time (library call) is timed
+ //flush==false if memory time is also timed
+ if (flush==true)
{
- timer.Start(timer_id);
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_*buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_float2),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_float2),
- NULL, &err);
-
- buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
- (buffer_.ldc_ * buffer_.c_num_vectors_ +
- buffer_.offC_) * sizeof(cl_float2),
- NULL, &err);
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_float2),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_float2),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float2),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_float2),
- buffer_.b_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_float2),
- buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_float2),
- buffer_.c_, 0, NULL, NULL);
- clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
- buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
- buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
- err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_float2), buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_float2),
- buffer_.c_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ clWaitForEvents(1, &event_);
}
+}
template<>
-void
+void
xGemm<cl_double2>::
-call_func()
+xGemm_Function(bool flush)
{
- timer.Start(timer_id);
-
- clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+ clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm<cl_double2>::
-roundtrip_func()
+ //flush==true if only the kernel time (library call) is timed
+ //flush==false if memory time is also timed
+ if (flush==true)
{
- timer.Start(timer_id);
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_*buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_double2),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_double2),
- NULL, &err);
-
- buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
- (buffer_.ldc_ * buffer_.c_num_vectors_ +
- buffer_.offC_) * sizeof(cl_double2),
- NULL, &err);
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_double2),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_double2),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double2),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_double2),
- buffer_.b_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_double2),
- buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_double2),
- buffer_.c_, 0, NULL, NULL);
- clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
- buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
- buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
- err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_double2), buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_double2),
- buffer_.c_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ clWaitForEvents(1, &event_);
}
-
+}
template<>
double
diff --git a/src/client/clfunc_xher2k.hpp b/src/client/clfunc_xher2k.hpp
index 088d928..15095fa 100644
--- a/src/client/clfunc_xher2k.hpp
+++ b/src/client/clfunc_xher2k.hpp
@@ -344,6 +344,10 @@ public:
buffer_.cpuC_, 0, NULL, NULL);
}
void roundtrip_func();
+ void zerocopy_roundtrip_func() // placeholder: only prints this routine's name
+ {
+ std::cout << "xHer2k::zerocopy_roundtrip_func\n";
+ }
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
diff --git a/src/client/clfunc_xherk.hpp b/src/client/clfunc_xherk.hpp
index 110c107..74871a3 100644
--- a/src/client/clfunc_xherk.hpp
+++ b/src/client/clfunc_xherk.hpp
@@ -273,6 +273,10 @@ public:
buffer_.cpuC_, 0, NULL, NULL);
}
void roundtrip_func();
+ void zerocopy_roundtrip_func() // placeholder: only prints this routine's name
+ {
+ std::cout << "xHerk::zerocopy_roundtrip_func\n";
+ }
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
diff --git a/src/client/clfunc_xsymm.hpp b/src/client/clfunc_xsymm.hpp
index 25a2924..a7558e9 100644
--- a/src/client/clfunc_xsymm.hpp
+++ b/src/client/clfunc_xsymm.hpp
@@ -98,6 +98,10 @@ public:
{
std::cout << "xSymm::roundtrip_func\n";
}
+ void zerocopy_roundtrip_func() // placeholder: only prints this routine's name
+ {
+ std::cout << "xSymm::zerocopy_roundtrip_func\n";
+ }
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
diff --git a/src/client/clfunc_xsyr2k.hpp b/src/client/clfunc_xsyr2k.hpp
index 414fa09..ae60f9e 100644
--- a/src/client/clfunc_xsyr2k.hpp
+++ b/src/client/clfunc_xsyr2k.hpp
@@ -364,6 +364,10 @@ public:
void roundtrip_func()
{
}
+ void zerocopy_roundtrip_func() // placeholder: only prints this routine's name
+ {
+ std::cout << "xSyr2k::zerocopy_roundtrip_func\n";
+ }
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
diff --git a/src/client/clfunc_xsyrk.hpp b/src/client/clfunc_xsyrk.hpp
index c04cc1f..e9b6a7a 100644
--- a/src/client/clfunc_xsyrk.hpp
+++ b/src/client/clfunc_xsyrk.hpp
@@ -282,6 +282,10 @@ public:
void roundtrip_func()
{
}
+ void zerocopy_roundtrip_func() // placeholder: only prints this routine's name
+ {
+ std::cout << "xSyrk::zerocopy_roundtrip_func\n";
+ }
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp
index a018e83..2e05300 100644
--- a/src/client/clfunc_xtrmm.hpp
+++ b/src/client/clfunc_xtrmm.hpp
@@ -304,7 +304,11 @@ public:
}
void roundtrip_func()
{
- std::cout << "xGemm::roundtrip_func\n";
+ std::cout << "xTrmm::roundtrip_func\n";
+ }
+ void zerocopy_roundtrip_func() // placeholder: only prints this routine's name
+ {
+ std::cout << "xTrmm::zerocopy_roundtrip_func\n";
}
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
diff --git a/src/client/clfunc_xtrsm.hpp b/src/client/clfunc_xtrsm.hpp
index 456c488..2eb64cf 100644
--- a/src/client/clfunc_xtrsm.hpp
+++ b/src/client/clfunc_xtrsm.hpp
@@ -22,6 +22,7 @@
#include "clfunc_common.hpp"
+
template <typename T>
struct xTrsmBuffer
{
@@ -61,7 +62,9 @@ public:
void call_func()
{
- std::cout << "xtrsm::call_func\n";
+ timer.Start(timer_id);
+ xTrsm_Function(true);
+ timer.Stop(timer_id);
}
double gflops()
@@ -311,7 +314,179 @@ public:
}
void roundtrip_func()
{
- std::cout << "xtrsm::call_func\n";
+ timer.Start(timer_id);
+ //set up buffer
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+ //initialize gpu buffer
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+ buffer_.offA_ * sizeof(T),
+ buffer_.lda_ * buffer_.a_num_vectors_ *
+ sizeof(T),
+ buffer_.a_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(T),
+ buffer_.ldb_ * buffer_.b_num_vectors_ *
+ sizeof(T),
+ buffer_.b_, 0, NULL, NULL);
+ //call func
+ xTrsm_Function(false);
+ //read gpu buffer
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+ sizeof(T),
+ buffer_.b_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void allochostptr_roundtrip_func()
+ {
+ timer.Start(timer_id);
+ // Timed round trip: create CL_MEM_ALLOC_HOST_PTR buffers, upload A/B through mapped pointers, run TRSM, copy B back.
+ cl_int err; // NOTE(review): err is overwritten by every call below and never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+ // Map each buffer to a host pointer; every mapping spans exactly the size its buffer was created with
+ T *map_a,*map_b;
+ map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ // memcpy the input A, B to the mapped regions (NOTE(review): starts at element 0, while roundtrip_func writes at off*_ -- confirm)
+ memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+ memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+ // unmap the buffers
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+ //call func (false: the helper does not wait on the TRSM event; the blocking map below does)
+ xTrsm_Function(false);
+ // map the B buffer again to read the output
+ map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_READ, 0,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ memcpy( buffer_.b_, map_b, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); // copy the result out of the read-only mapping into host B
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, &event_); // capture the unmap event so the wait below covers it
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void usehostptr_roundtrip_func() // timed round trip with CL_MEM_USE_HOST_PTR buffers created over the host A/B arrays
+ {
+ timer.Start(timer_id);
+ //set up buffer
+ cl_int err; // NOTE(review): overwritten by each call below and never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ buffer_.a_, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ buffer_.b_, &err);
+ //call func (false: the helper does not wait on the TRSM event itself)
+ xTrsm_Function(false);
+ //read gpu buffer (blocking read of B starting at byte offset offB_ * sizeof(T))
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+ sizeof(T),
+ buffer_.b_, 0, NULL, &event_); // NOTE(review): destination is the same buffer_.b_ passed to clCreateBuffer as host_ptr -- confirm intended
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void copyhostptr_roundtrip_func() // timed round trip with buffers initialised from host A/B via CL_MEM_COPY_HOST_PTR
+ {
+ timer.Start(timer_id);
+ //set up buffer
+ cl_int err; // NOTE(review): overwritten by each call below and never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ buffer_.a_, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ buffer_.b_, &err);
+ //call func (false: the helper does not wait on the TRSM event itself)
+ xTrsm_Function(false);
+ //read gpu buffer (blocking read of B starting at byte offset offB_ * sizeof(T))
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+ sizeof(T),
+ buffer_.b_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void usepersismem_roundtrip_func()
+ {
+#if defined(CL_MEM_USE_PERSISTENT_MEM_AMD)
+ timer.Start(timer_id);
+ // Timed round trip: create CL_MEM_USE_PERSISTENT_MEM_AMD buffers, upload A/B through mapped pointers, run TRSM, copy B back.
+ cl_int err; // NOTE(review): err is overwritten by every call below and never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_PERSISTENT_MEM_AMD,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+ // Map each buffer to a host pointer; every mapping spans exactly the size its buffer was created with
+ T *map_a,*map_b;
+ map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ // memcpy the input A, B to the mapped regions (NOTE(review): starts at element 0, while roundtrip_func writes at off*_ -- confirm)
+ memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+ memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+ // unmap the buffers
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+ //call func (false: the helper does not wait on the TRSM event; the blocking map below does)
+ xTrsm_Function(false);
+ // map the B buffer again to read the output
+ map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_READ, 0,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ memcpy( buffer_.b_, map_b, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); // copy the result out of the read-only mapping into host B
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, &event_); // capture the unmap event so the wait below covers it
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+#else
+ std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<<std::endl;
+#endif
+ }
+ void zerocopy_roundtrip_func() // placeholder: only prints this routine's name
+ {
+ std::cout << "xTrsm::zerocopy_roundtrip_func\n";
}
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
@@ -470,261 +645,79 @@ protected:
private:
xTrsmBuffer<T> buffer_;
+ void xTrsm_Function(bool flush);
}; // class xtrsm
template<>
void
xTrsm<cl_float>::
-call_func()
+xTrsm_Function(bool flush)
{
- timer.Start(timer_id);
-
clblasStrsm(order_, buffer_.side_, buffer_.uplo_,
buffer_.trans_a_, buffer_.diag_,
buffer_.m_, buffer_.n_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm<cl_float>::
-roundtrip_func()
-{
- timer.Start(timer_id);
- //set up buffer
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_ * buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_float),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_float),
- NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_float),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_float),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_float),
- buffer_.b_, 0, NULL, NULL);
- //call func
- clblasStrsm(order_, buffer_.side_, buffer_.uplo_,
- buffer_.trans_a_, buffer_.diag_,
- buffer_.m_, buffer_.n_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- 1, &queue_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_float),
- buffer_.b_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ if(flush==true)
+ {
+ clWaitForEvents(1, &event_);
+ }
}
template<>
void
xTrsm<cl_double>::
-call_func()
+xTrsm_Function(bool flush)
{
- timer.Start(timer_id);
-
clblasDtrsm(order_, buffer_.side_, buffer_.uplo_,
buffer_.trans_a_, buffer_.diag_,
buffer_.m_, buffer_.n_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm<cl_double>::
-roundtrip_func()
-{
- timer.Start(timer_id);
- //set up buffer
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_ * buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_double),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_double),
- NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_double),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_double),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_double),
- buffer_.b_, 0, NULL, NULL);
- //call func
- clblasDtrsm(order_, buffer_.side_, buffer_.uplo_,
- buffer_.trans_a_, buffer_.diag_,
- buffer_.m_, buffer_.n_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- 1, &queue_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_double),
- buffer_.b_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ if(flush==true)
+ {
+ clWaitForEvents(1, &event_);
+ }
}
template<>
void
xTrsm<cl_float2>::
-call_func()
+xTrsm_Function(bool flush)
{
- timer.Start(timer_id);
-
clblasCtrsm(order_, buffer_.side_, buffer_.uplo_,
buffer_.trans_a_, buffer_.diag_,
buffer_.m_, buffer_.n_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm<cl_float2>::
-roundtrip_func()
-{
- timer.Start(timer_id);
- //set up buffer
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_ * buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_float2),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_float2),
- NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_float2),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_float2),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float2),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_float2),
- buffer_.b_, 0, NULL, NULL);
- //call func
- clblasCtrsm(order_, buffer_.side_, buffer_.uplo_,
- buffer_.trans_a_, buffer_.diag_,
- buffer_.m_, buffer_.n_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- 1, &queue_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_float2),
- buffer_.b_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm<cl_double2>::
-call_func()
-{
- timer.Start(timer_id);
-
- clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
- buffer_.trans_a_, buffer_.diag_,
- buffer_.m_, buffer_.n_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- 1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ if(flush==true)
+ {
+ clWaitForEvents(1, &event_);
+ }
}
template<>
void
xTrsm<cl_double2>::
-roundtrip_func()
+xTrsm_Function(bool flush)
{
- timer.Start(timer_id);
- //set up buffer
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_ * buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_double2),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_double2),
- NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_double2),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_double2),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double2),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_double2),
- buffer_.b_, 0, NULL, NULL);
- //call func
- clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
+ clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
buffer_.trans_a_, buffer_.diag_,
buffer_.m_, buffer_.n_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- 1, &queue_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_double2),
- buffer_.b_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ 1, &queue_, 0, NULL, &event_);
+ if(flush==true)
+ {
+ clWaitForEvents(1, &event_);
+ }
}
+
template<>
double
xTrsm<cl_float2>::
diff --git a/src/client/client.cpp b/src/client/client.cpp
index 4ce3f34..a55def3 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -69,6 +69,7 @@ int main(int argc, char *argv[])
std::string function;
std::string precision;
std::string roundtrip;
+ std::string memalloc;
int side_option;
int uplo_option;
int diag_option;
@@ -100,7 +101,8 @@ int main(int argc, char *argv[])
( "uplo", po::value<int>( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" ) // xsymv xsyrk xsyr2k xtrsm xtrmm
( "diag", po::value<int>( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm
( "profile,p", po::value<cl_uint>( &profileCount )->default_value(20), "Time and report the kernel speed (default: profiling off)" )
- ( "roundtrip", po::value<std::string>( &roundtrip )->default_value("noroundtrip"),"calculate the time for round trips")
+ ( "roundtrip", po::value<std::string>( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)")
+ ( "memalloc", po::value<std::string>( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd")
;
po::variables_map vm;
@@ -511,7 +513,27 @@ int main(int argc, char *argv[])
my_function->call_func();
my_function->read_gpu_buffer();
my_function->reset_gpu_write_buffer();*/
- my_function->roundtrip_func();
+
+ if(memalloc=="default")
+ {
+ my_function->roundtrip_func();
+ }
+ else if (memalloc=="alloc_host_ptr")
+ {
+ my_function->allochostptr_roundtrip_func();
+ }
+ else if (memalloc=="use_host_ptr")
+ {
+ my_function->usehostptr_roundtrip_func();
+ }
+ else if (memalloc=="copy_host_ptr")
+ {
+ my_function->copyhostptr_roundtrip_func();
+ }
+ else if (memalloc=="use_persistent_mem_amd")
+ {
+ my_function->usepersismem_roundtrip_func();
+ }
//my_function->reset_gpu_write_buffer();
my_function->releaseGPUBuffer_deleteCPUBuffer();
}
diff --git a/src/scripts/perf/measurePerformance.py b/src/scripts/perf/measurePerformance.py
index f89674e..8559e66 100644
--- a/src/scripts/perf/measurePerformance.py
+++ b/src/scripts/perf/measurePerformance.py
@@ -45,6 +45,7 @@ diagvalues = ['unit','nonunit']
functionvalues = ['gemm', 'trmm', 'trsm', 'syrk', 'syr2k', 'gemv', 'symv', 'symm', 'hemm', 'herk', 'her2k' ]
precisionvalues = ['s', 'd', 'c', 'z']
roundtripvalues = ['roundtrip','noroundtrip','both']
+memallocvalues = ['default','alloc_host_ptr','use_host_ptr','copy_host_ptr','use_persistent_mem_amd']
parser = argparse.ArgumentParser(description='Measure performance of the clblas library')
parser.add_argument('--device',
@@ -125,6 +126,9 @@ parser.add_argument('--tablefile',
parser.add_argument('--roundtrip',
dest='roundtrip', default='noroundtrip',
help='whether measure the roundtrips or not. choices are ' + str(roundtripvalues) + '. (default noroundtrip); should not be specified when calling ACML')
+parser.add_argument('--memalloc',
+ dest='memalloc', default='default',
+ help='set the flags for OpenCL memory allocation. Choices are ' + str(memallocvalues) + '. (default is default); do not need to set when calling ACML or if roundtrip is not set')
ini_group = parser.add_mutually_exclusive_group()
ini_group.add_argument('--createini',
dest='createIniFilename', default=None, type=argparse.FileType('w'),
@@ -138,6 +142,7 @@ args = parser.parse_args()
label = str(args.label)
roundtrip = str(args.roundtrip)
library = str(args.library)
+memalloc = str(args.memalloc)
subprocess.call('mkdir perfLog', shell = True)
logfile = os.path.join('perfLog', (label+'-'+'blasMeasurePerfLog.txt'))
@@ -145,7 +150,6 @@ logfile = os.path.join('perfLog', (label+'-'+'blasMeasurePerfLog.txt'))
def printLog(txt):
print txt
log(logfile, txt)
-printLog(roundtrip)
printLog("=========================MEASURE PERFORMANCE START===========================")
printLog("Process id of Measure Performance:"+str(os.getpid()))
@@ -449,7 +453,8 @@ for params in test_combinations:
'--function', function,
'--precision', precision,
'-p', '10',
- '--roundtrip', roundtrip]
+ '--roundtrip', roundtrip,
+ '--memalloc', memalloc]
else:
printLog( 'ERROR: unknown library:"' +library+ '" can\'t assemble command')
quit()
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits mailing list