[clblas] 45/125: add support of hemm, herk, her2k, syrk, syr2k to performance test suite(client)
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Fri May 29 06:57:21 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clblas.
commit 7e7fa103ef320169008493fa291f299922b30c04
Author: Timmy <timmy.liu at amd.com>
Date: Wed Dec 11 14:05:06 2013 -0600
add support of hemm, herk, her2k, syrk, syr2k to performance test suite(client)
---
src/client/CMakeLists.txt | 6 +-
src/client/clfunc_xhemm.hpp | 252 +++++++++++--
src/client/clfunc_xher2k.hpp | 672 +++++++++++++++++++++++++++++++++
src/client/clfunc_xherk.hpp | 531 ++++++++++++++++++++++++++
src/client/clfunc_xsymm.hpp | 12 +-
src/client/clfunc_xsyr2k.hpp | 394 ++++++++++++++++++-
src/client/clfunc_xsyrk.hpp | 308 ++++++++++++++-
src/client/clfunc_xtrmm.hpp | 10 +-
src/client/clfunc_xtrsm.hpp | 10 +-
src/client/client.cpp | 28 ++
src/scripts/perf/measurePerformance.py | 2 +-
11 files changed, 2154 insertions(+), 71 deletions(-)
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
index a647da0..2ebebf1 100644
--- a/src/client/CMakeLists.txt
+++ b/src/client/CMakeLists.txt
@@ -26,7 +26,11 @@ set(CLIENT_HEADER
clfunc_xtrmm.hpp
clfunc_xtrsm.hpp
clfunc_xsyrk.hpp
- clfunc_xsyr2k.hpp)
+ clfunc_xsyr2k.hpp
+ clfunc_xhemm.hpp
+ clfunc_xsymm.hpp
+ clfunc_xherk.hpp
+ clfunc_xher2k.hpp)
set(WRAPPER_SRC testPerfWrapper.cpp)
diff --git a/src/client/clfunc_xhemm.hpp b/src/client/clfunc_xhemm.hpp
index 8a0c555..9f4047e 100644
--- a/src/client/clfunc_xhemm.hpp
+++ b/src/client/clfunc_xhemm.hpp
@@ -45,7 +45,7 @@
template <typename T>
struct xHemmBuffer
{
- clblasOrder order;
+ clblasOrder order;
clblasSide side;
clblasUplo uplo;
size_t M;
@@ -78,22 +78,30 @@ public:
~xHemm()
{
- delete buffer.cpuA;
- delete buffer.cpuB;
- delete buffer.cpuC;
- OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
- OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
- OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
}
double gflops()
{
- return (buffer.N*(buffer.N+1))/time_in_ns();
+ if (buffer.side == clblasLeft)
+ {
+ return (8*buffer.M*buffer.M*buffer.N)/time_in_ns();
+ }
+ else
+ {
+ return (8*buffer.N*buffer.N*buffer.M)/time_in_ns();
+ }
}
std::string gflops_formula()
{
- return "M*(M+1)/time";
+ if (buffer.side == clblasLeft)
+ {
+ return "8*M*M*N/time";
+ }
+ else
+ {
+ return "8*N*N*M/time";
+ }
}
void setup_buffer(int order_option, int side_option, int
@@ -106,25 +114,136 @@ public:
void initialize_gpu_buffer();
void reset_gpu_write_buffer();
void call_func();
- void read_gpu_buffer()
+ void read_gpu_buffer()
{
- //cl_int err;
- //to-do need to fill up
+ cl_int err;
+ err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+ buffer.offc * sizeof(T),
+ buffer.ldc*buffer.N*sizeof(T),
+ buffer.cpuC,0,NULL,NULL);
}
- void roundtrip_func()
- {//to-do need to fill up
+ void roundtrip_func()
+ {
+ std::cout << "xHemm::roundtrip_func" <<std::endl;
}
- void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+ void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
- size_t ldc, size_t offA, size_t offBX, size_t offCY,
+ size_t ldc, size_t offA, size_t offB, size_t offC,
double alpha, double beta)
- {}
- void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ initialize_scalars(alpha, beta);
+ buffer.offa = offA;
+ buffer.offb = offB;
+ buffer.offc = offC;
+ buffer.M = M;
+ buffer.N = N;
+ if (order_option == 0)
+ {
+ buffer.order = clblasRowMajor;
+ }
+ else
+ {
+ buffer.order = clblasColumnMajor;
+ }
+ if (uplo_option == 0)
+ {
+ buffer.uplo = clblasUpper;
+ }
+ else
+ {
+ buffer.uplo = clblasLower;
+ }
+ if (side_option == 0)
+ {
+ buffer.side = clblasLeft;
+ buffer.a_num_vectors = M;
+ if (lda == 0)
+ {
+ buffer.lda = buffer.M;
+ }
+ else if (lda < buffer.M)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer.lda = lda;
+ }
+ }
+ else
+ {
+ buffer.side = clblasRight;
+ buffer.a_num_vectors = N;
+ if (lda == 0)
+ {
+ buffer.lda = buffer.N;
+ }
+ else if (lda < buffer.N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer.lda = lda;
+ }
+ }
+ /*}
+ if (lda == 0)
+ {
+ buffer.lda = buffer.M;
+ }
+ else if (lda < buffer.M)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer.lda = lda;
+ }*/
+ if (ldb == 0)
+ {
+ buffer.ldb = buffer.M;
+ }
+ else if (ldb < buffer.M)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer.ldb = ldb;
+ }
+ if (ldc == 0)
+ {
+ buffer.ldc = buffer.M;
+ }
+ else if (ldc < buffer.M)
+ {
+ std::cerr << "ldc:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer.ldc = ldc;
+ }
+ buffer.cpuB = new T[buffer.N * buffer.ldb];
+ buffer.cpuC = new T[buffer.N * buffer.ldc];
+ buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
+ }
+ void releaseGPUBuffer_deleteCPUBuffer()
{
//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
//need to do this before we eventually hit the destructor
- //to do
+ delete buffer.cpuA;
+ delete buffer.cpuB;
+ delete buffer.cpuC;
+ OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+ OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
+ OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
}
protected:
@@ -253,7 +372,7 @@ void xHemm<T>::setup_buffer(int order_option, int side_option, int
buffer.a_num_vectors * buffer.lda*sizeof(T),
NULL, &err);
- buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.N*buffer.ldb*sizeof(T),
NULL, &err);
buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -356,10 +475,12 @@ void xHemm<T>::initialize_gpu_buffer()
buffer.a_num_vectors * buffer.lda*sizeof(T),
buffer.cpuA, 0, NULL, NULL);
- err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE, 0,
+ err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+ buffer.offb * sizeof(T),
buffer.ldb*buffer.N*sizeof(T),
buffer.cpuB, 0, NULL, NULL);
- err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE, 0,
+ err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+ buffer.offc * sizeof(T),
buffer.ldc*buffer.N*sizeof(T),
buffer.cpuC, 0, NULL, NULL);
}
@@ -386,6 +507,50 @@ void xHemm<cl_float2>::call_func()
}
template <>
+void xHemm<cl_float2>::roundtrip_func()
+{
+    // Timed CHEMM round trip: create the device buffers, upload A, B and C,
+    // run clblasChemm, then read C back into buffer.cpuC.
+    timer.Start(timer_id);
+    cl_int err;
+    //create buffer
+    // Every allocation includes the element offset: the transfers below and
+    // clblasChemm address the data buffer.off{a,b,c} elements into the
+    // buffer, so a buffer sized for the matrix alone is overrun as soon as
+    // an offset is non-zero.
+    buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        (buffer.a_num_vectors * buffer.lda + buffer.offa) * sizeof(cl_float2),
+        NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+
+    buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        (buffer.N * buffer.ldb + buffer.offb) * sizeof(cl_float2),
+        NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer B");
+    buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+        (buffer.N * buffer.ldc + buffer.offc) * sizeof(cl_float2),
+        NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer C");
+    //write gpu buffer
+    err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+        buffer.offa * sizeof(cl_float2),
+        buffer.a_num_vectors * buffer.lda * sizeof(cl_float2),
+        buffer.cpuA, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer A");
+
+    err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+        buffer.offb * sizeof(cl_float2),
+        buffer.ldb * buffer.N * sizeof(cl_float2),
+        buffer.cpuB, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer B");
+    err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+        buffer.offc * sizeof(cl_float2),
+        buffer.ldc * buffer.N * sizeof(cl_float2),
+        buffer.cpuC, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer C");
+
+    clblasChemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+        buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+        buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+        0, NULL,NULL);
+    //read gpu buffer
+    err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+        buffer.offc * sizeof(cl_float2),
+        buffer.ldc * buffer.N * sizeof(cl_float2),
+        buffer.cpuC, 0, NULL, &event_);
+    OPENCL_V_THROW(err, "reading buffer C");
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+
+}
+template <>
void xHemm<cl_double2>::call_func()
{
timer.Start(timer_id);
@@ -396,5 +561,48 @@ void xHemm<cl_double2>::call_func()
clWaitForEvents(1, &event_);
timer.Stop(timer_id);
}
+template <>
+void xHemm<cl_double2>::roundtrip_func()
+{
+    // Timed ZHEMM round trip: create the device buffers, upload A, B and C,
+    // run clblasZhemm, then read C back into buffer.cpuC.
+    timer.Start(timer_id);
+    cl_int err;
+    //create buffer
+    // Every allocation includes the element offset: the transfers below and
+    // clblasZhemm address the data buffer.off{a,b,c} elements into the
+    // buffer, so a buffer sized for the matrix alone is overrun as soon as
+    // an offset is non-zero.
+    buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        (buffer.a_num_vectors * buffer.lda + buffer.offa) * sizeof(cl_double2),
+        NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+
+    buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        (buffer.N * buffer.ldb + buffer.offb) * sizeof(cl_double2),
+        NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer B");
+    buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+        (buffer.N * buffer.ldc + buffer.offc) * sizeof(cl_double2),
+        NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer C");
+    //write gpu buffer
+    err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+        buffer.offa * sizeof(cl_double2),
+        buffer.a_num_vectors * buffer.lda * sizeof(cl_double2),
+        buffer.cpuA, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer A");
+
+    err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+        buffer.offb * sizeof(cl_double2),
+        buffer.ldb * buffer.N * sizeof(cl_double2),
+        buffer.cpuB, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer B");
+    err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+        buffer.offc * sizeof(cl_double2),
+        buffer.ldc * buffer.N * sizeof(cl_double2),
+        buffer.cpuC, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer C");
+    clblasZhemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+        buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+        buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+        0, NULL,NULL);
+    //read gpu buffer
+    err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+        buffer.offc * sizeof(cl_double2),
+        buffer.ldc * buffer.N * sizeof(cl_double2),
+        buffer.cpuC, 0, NULL, &event_);
+    OPENCL_V_THROW(err, "reading buffer C");
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+
+}
#endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xher2k.hpp b/src/client/clfunc_xher2k.hpp
new file mode 100644
index 0000000..088d928
--- /dev/null
+++ b/src/client/clfunc_xher2k.hpp
@@ -0,0 +1,672 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XHER2K_HXX__
+#define CLBLAS_BENCHMARK_XHER2K_HXX__
+
+#include "clfunc_common.hpp"
+
+// Problem description, device buffers and host arrays for one HER2K
+// benchmark case. Offsets and leading dimensions are counted in elements of T.
+template <typename T>
+struct xHer2kBuffer
+{
+ clblasOrder order_; // xHer2k assigns the unqualified order_ rather than this field
+ clblasUplo uplo_; // triangle chosen from uplo_option
+ clblasTranspose transA_; // transpose mode chosen from transA_option
+ size_t N_; // N argument handed to clblas{C,Z}her2k
+ size_t K_; // K argument handed to clblas{C,Z}her2k
+ T alpha_; // built with makeScalar<T>(alpha)
+ cl_mem A_; // device buffer holding A
+ size_t offa_; // offset of A inside A_
+ size_t lda_; // leading dimension of A
+ cl_mem B_; // device buffer holding B
+ size_t offb_; // offset of B inside B_
+ size_t ldb_; // leading dimension of B
+ T beta_; // built with makeScalar<T>(beta); only s[0] is passed to the BLAS call
+ cl_mem C_; // device buffer holding C
+ size_t offc_; // offset of C inside C_
+ size_t ldc_; // leading dimension of C
+ size_t a_num_vectors_; // A occupies lda_ * a_num_vectors_ elements
+ size_t b_num_vectors_; // B occupies ldb_ * b_num_vectors_ elements
+ size_t c_num_vectors_; // C occupies ldc_ * c_num_vectors_ elements
+ T* cpuA_; // host array of lda_ * a_num_vectors_ elements
+ T* cpuB_; // host array of ldb_ * b_num_vectors_ elements
+ T* cpuC_; // host array of ldc_ * c_num_vectors_ elements
+}; // struct buffer
+
+template <typename T>
+class xHer2k : public clblasFunc
+{
+public:
+ // Forwards the timer and device type to clblasFunc, then asks the timer
+ // for an ID under the name "clHer2k". The value returned by getUniqueID()
+ // is not stored here.
+ xHer2k(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType)
+ {
+ timer.getUniqueID("clHer2k", 0);
+ }
+
+ // Does nothing: host arrays and device buffers are released by
+ // releaseGPUBuffer_deleteCPUBuffer(), which runs before destruction.
+ ~xHer2k()
+ {
+ }
+
+ // Returns 8*K*N*N / time_in_ns() + 2*N / time_in_ns(), i.e. the two terms
+ // of the operation count are divided by the elapsed time separately and
+ // then added.
+ double gflops()
+ {
+ return static_cast<double>(8*(buffer_.K_ * buffer_.N_ * buffer_.N_)/time_in_ns()+2*buffer_.N_/time_in_ns());
+ }
+
+ // Textual form of the expression evaluated by gflops().
+ std::string gflops_formula()
+ {
+ return "(8*K*N*N+2*N)/time";
+ }
+
+ // Prepares one HER2K benchmark case: records the problem description in
+ // buffer_, validates or defaults the leading dimensions, allocates the
+ // host arrays and creates the device buffers.
+ //
+ // order_option:  0 selects row-major, any other value column-major.
+ // uplo_option:   0 selects the upper triangle, any other value the lower.
+ // transA_option: 0 = no transpose, 1 = transpose, 2 = conjugate transpose.
+ // lda/ldb/ldc:   0 requests the smallest legal leading dimension; a
+ //                non-zero value below that minimum is reported on stderr
+ //                and terminates the program with exit(1).
+ // offA/offB/offC: element offsets of A, B and C inside their buffers.
+ // side_option, diag_option, transB_option and M are only handed to
+ // DUMMY_ARGS_USAGE_4.
+ void setup_buffer(int order_option, int side_option, int
+ uplo_option, int diag_option, int transA_option, int
+ transB_option, size_t M, size_t N, size_t K,
+ size_t lda, size_t ldb, size_t ldc,size_t offA,
+ size_t offB, size_t offC, double alpha,
+ double beta)
+ {
+     DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+
+     initialize_scalars(alpha,beta);
+
+     buffer_.N_ = N;
+     buffer_.K_ = K;
+     buffer_.offa_ = offA;
+     buffer_.offb_ = offB;
+     buffer_.offc_ = offC;
+
+     buffer_.uplo_ = (uplo_option == 0) ? clblasUpper : clblasLower;
+
+     // C always has N vectors, and N is the smallest accepted ldc.
+     if (ldc == 0)
+     {
+         buffer_.ldc_ = N;
+     }
+     else if (ldc < N)
+     {
+         // Stop here exactly like the lda/ldb checks do; carrying on would
+         // size the C buffers from an ldc_ that was never assigned.
+         std::cerr << "ldc:wrong size\n";
+         exit(1);
+     }
+     else
+     {
+         buffer_.ldc_ = ldc;
+     }
+     buffer_.c_num_vectors_ = N;
+
+     const bool rowMajor = (order_option == 0);
+     const bool noTrans = (transA_option == 0);
+     order_ = rowMajor ? clblasRowMajor : clblasColumnMajor;
+
+     // Any transA_option other than 0, 1 or 2 leaves transA_ untouched.
+     if (noTrans)
+     {
+         buffer_.transA_ = clblasNoTrans;
+     }
+     else if (transA_option == 1)
+     {
+         buffer_.transA_ = clblasTrans;
+     }
+     else if (transA_option == 2)
+     {
+         buffer_.transA_ = clblasConjTrans;
+     }
+
+     // A and B share one layout. Row-major/no-transpose and
+     // column-major/transpose use N vectors of at least K elements; the
+     // other two combinations use K vectors of at least N elements.
+     const bool nVectors = (rowMajor == noTrans);
+     const size_t abVectors = nVectors ? N : K;
+     const size_t abMinLd = nVectors ? K : N;
+     buffer_.a_num_vectors_ = abVectors;
+     buffer_.b_num_vectors_ = abVectors;
+
+     if (lda == 0)
+     {
+         buffer_.lda_ = abMinLd;
+     }
+     else if (lda < abMinLd)
+     {
+         std::cerr << "lda:wrong size\n";
+         exit(1);
+     }
+     else
+     {
+         buffer_.lda_ = lda;
+     }
+
+     if (ldb == 0)
+     {
+         buffer_.ldb_ = abMinLd;
+     }
+     else if (ldb < abMinLd)
+     {
+         std::cerr << "ldb:wrong size\n";
+         exit(1);
+     }
+     else
+     {
+         buffer_.ldb_ = ldb;
+     }
+
+     buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+     buffer_.cpuB_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+     buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+     // Device buffers hold the matrix plus its leading offset.
+     cl_int err;
+     buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                 (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                  buffer_.offa_) * sizeof(T),
+                                 NULL, &err);
+     OPENCL_V_THROW(err, "creating buffer A");
+
+     buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                 (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                  buffer_.offb_) * sizeof(T),
+                                 NULL, &err);
+     OPENCL_V_THROW(err, "creating buffer B");
+
+     buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                 (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                  buffer_.offc_) * sizeof(T),
+                                 NULL, &err);
+     OPENCL_V_THROW(err, "creating buffer C");
+ }
+ // Fills the host copies of A, C and B with pseudo-random values drawn
+ // after seeding rand() with a fixed value.
+ void initialize_cpu_buffer()
+ {
+     srand(10);
+     for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+     {
+         for (size_t j = 0; j < buffer_.lda_; ++j)
+         {
+             buffer_.cpuA_[i*buffer_.lda_+j] = random<T>(UPPER_BOUND<T>()) /
+                                               randomScale<T>();
+         }
+     }
+     for (size_t i = 0; i < buffer_.N_; ++i)
+     {
+         for (size_t j = 0; j < buffer_.ldc_; ++j)
+         {
+             buffer_.cpuC_[i*buffer_.ldc_+j] = random<T>(UPPER_BOUND<T>()) /
+                                               randomScale<T>();
+         }
+     }
+     // B is an input of HER2K too and used to be left unfilled. It is
+     // filled last so A and C still receive the same values as before.
+     for (size_t i = 0; i < buffer_.b_num_vectors_; ++i)
+     {
+         for (size_t j = 0; j < buffer_.ldb_; ++j)
+         {
+             buffer_.cpuB_[i*buffer_.ldb_+j] = random<T>(UPPER_BOUND<T>()) /
+                                               randomScale<T>();
+         }
+     }
+ }
+ // Uploads the host copies of A, B and C into their device buffers, each
+ // at its own element offset, using blocking writes.
+ void initialize_gpu_buffer()
+ {
+     cl_int err;
+
+     err = clEnqueueWriteBuffer(queue_, buffer_.A_, CL_TRUE,
+                                buffer_.offa_ * sizeof(T),
+                                buffer_.lda_ * buffer_.a_num_vectors_ *
+                                sizeof(T),
+                                buffer_.cpuA_, 0, NULL, NULL);
+     OPENCL_V_THROW(err, "writing buffer A");
+
+     // B is read by the HER2K call, so it has to be on the device as well.
+     err = clEnqueueWriteBuffer(queue_, buffer_.B_, CL_TRUE,
+                                buffer_.offb_ * sizeof(T),
+                                buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                sizeof(T),
+                                buffer_.cpuB_, 0, NULL, NULL);
+     OPENCL_V_THROW(err, "writing buffer B");
+
+     // C lives at offc_ (not offa_): that is where the BLAS call and the
+     // other C transfers in this class address it.
+     err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+                                buffer_.offc_ * sizeof(T),
+                                buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                sizeof(T),
+                                buffer_.cpuC_, 0, NULL, NULL);
+     OPENCL_V_THROW(err, "writing buffer C");
+ }
+ // Re-uploads the host copy of C to the device with a blocking write.
+ void reset_gpu_write_buffer()
+ {
+     cl_int err;
+
+     err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+                                buffer_.offc_ * sizeof(T),
+                                buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                sizeof(T),
+                                buffer_.cpuC_, 0, NULL, NULL);
+     // The status used to be stored and never inspected.
+     OPENCL_V_THROW(err, "resetting buffer C");
+ }
+ void call_func();
+ // Copies C from the device back into the host array with a blocking read.
+ void read_gpu_buffer()
+ {
+     cl_int err;
+     err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                               buffer_.offc_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T),
+                               buffer_.cpuC_, 0, NULL, NULL);
+     // The status used to be stored and never inspected.
+     OPENCL_V_THROW(err, "reading buffer C");
+ }
+ void roundtrip_func();
+ // Host-side preparation for the round-trip benchmark: records the problem
+ // description in buffer_, validates or defaults the leading dimensions
+ // and allocates the host arrays. No device buffer is created here.
+ //
+ // order_option:  0 selects row-major, any other value column-major.
+ // uplo_option:   0 selects the upper triangle, any other value the lower.
+ // transA_option: 0 = no transpose, 1 = transpose, 2 = conjugate transpose.
+ // lda/ldb/ldc:   0 requests the smallest legal leading dimension; a
+ //                non-zero value below that minimum is reported on stderr
+ //                and terminates the program with exit(1).
+ // offA/offBX/offCY: element offsets of A, B and C inside their buffers.
+ // side_option, diag_option, transB_option and M are only handed to
+ // DUMMY_ARGS_USAGE_4.
+ void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+ int diag_option, int transA_option, int transB_option,
+ size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+ size_t ldc, size_t offA, size_t offBX, size_t offCY,
+ double alpha, double beta)
+ {
+     DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+
+     initialize_scalars(alpha,beta);
+
+     buffer_.N_ = N;
+     buffer_.K_ = K;
+     buffer_.offa_ = offA;
+     buffer_.offb_ = offBX;
+     buffer_.offc_ = offCY;
+
+     buffer_.uplo_ = (uplo_option == 0) ? clblasUpper : clblasLower;
+
+     // C always has N vectors, and N is the smallest accepted ldc.
+     if (ldc == 0)
+     {
+         buffer_.ldc_ = N;
+     }
+     else if (ldc < N)
+     {
+         // Stop here exactly like the lda/ldb checks do; carrying on would
+         // size the C array from an ldc_ that was never assigned.
+         std::cerr << "ldc:wrong size\n";
+         exit(1);
+     }
+     else
+     {
+         buffer_.ldc_ = ldc;
+     }
+     buffer_.c_num_vectors_ = N;
+
+     const bool rowMajor = (order_option == 0);
+     const bool noTrans = (transA_option == 0);
+     order_ = rowMajor ? clblasRowMajor : clblasColumnMajor;
+
+     // Any transA_option other than 0, 1 or 2 leaves transA_ untouched.
+     if (noTrans)
+     {
+         buffer_.transA_ = clblasNoTrans;
+     }
+     else if (transA_option == 1)
+     {
+         buffer_.transA_ = clblasTrans;
+     }
+     else if (transA_option == 2)
+     {
+         buffer_.transA_ = clblasConjTrans;
+     }
+
+     // A and B share one layout. Row-major/no-transpose and
+     // column-major/transpose use N vectors of at least K elements; the
+     // other two combinations use K vectors of at least N elements.
+     const bool nVectors = (rowMajor == noTrans);
+     const size_t abVectors = nVectors ? N : K;
+     const size_t abMinLd = nVectors ? K : N;
+     buffer_.a_num_vectors_ = abVectors;
+     buffer_.b_num_vectors_ = abVectors;
+
+     if (lda == 0)
+     {
+         buffer_.lda_ = abMinLd;
+     }
+     else if (lda < abMinLd)
+     {
+         std::cerr << "lda:wrong size\n";
+         exit(1);
+     }
+     else
+     {
+         buffer_.lda_ = lda;
+     }
+
+     if (ldb == 0)
+     {
+         buffer_.ldb_ = abMinLd;
+     }
+     else if (ldb < abMinLd)
+     {
+         std::cerr << "ldb:wrong size\n";
+         exit(1);
+     }
+     else
+     {
+         buffer_.ldb_ = ldb;
+     }
+
+     buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+     buffer_.cpuB_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+     buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+ }
+ // Frees the host arrays and releases the device buffers of the current
+ // benchmark case.
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+     //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+     //need to do this before we eventually hit the destructor
+     // The host arrays are allocated with new T[...], so they must be
+     // released with delete[]; plain delete on them is undefined behaviour.
+     delete[] buffer_.cpuA_;
+     delete[] buffer_.cpuB_;
+     delete[] buffer_.cpuC_;
+     OPENCL_V_THROW( clReleaseMemObject(buffer_.A_), "releasing buffer A");
+     OPENCL_V_THROW( clReleaseMemObject(buffer_.B_), "releasing buffer B");
+     OPENCL_V_THROW( clReleaseMemObject(buffer_.C_), "releasing buffer C");
+ }
+protected:
+protected:
+ // Converts the double-precision alpha and beta into values of type T via
+ // makeScalar<T>() and stores them in buffer_.
+ void initialize_scalars(double alpha, double beta)
+ {
+ buffer_.alpha_ = makeScalar<T>(alpha);
+ buffer_.beta_ = makeScalar<T>(beta);
+ }
+
+private:
+ xHer2kBuffer<T> buffer_;
+};
+
+// Times one clblasCher2k call: the timer is started before the call is
+// issued and stopped once its completion event has been waited on.
+template<>
+void
+xHer2k<cl_float2>::call_func()
+{
+ timer.Start(timer_id);
+ clblasCher2k(order_, buffer_.uplo_, buffer_.transA_,
+ buffer_.N_, buffer_.K_, buffer_.alpha_,
+ buffer_.A_, buffer_.offa_, buffer_.lda_,
+ buffer_.B_, buffer_.offb_, buffer_.ldb_,
+ buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, // only the first component of beta_ is passed
+ buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+// Timed CHER2K round trip: create the device buffers, upload the host data,
+// run clblasCher2k and read C back into the host array.
+template<>
+void
+xHer2k<cl_float2>::roundtrip_func()
+{
+    timer.Start(timer_id);
+    cl_int err;
+    buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                 buffer_.offa_) * sizeof(cl_float2),
+                                NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+    buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                 buffer_.offb_) * sizeof(cl_float2),
+                                NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer B");
+    buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                 buffer_.offc_) * sizeof(cl_float2),
+                                NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer C");
+    this->initialize_gpu_buffer();
+    clblasCher2k(order_, buffer_.uplo_, buffer_.transA_,
+                 buffer_.N_, buffer_.K_, buffer_.alpha_,
+                 buffer_.A_, buffer_.offa_, buffer_.lda_,
+                 buffer_.B_, buffer_.offb_, buffer_.ldb_,
+                 buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+                 buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+
+    // Read the result back. This used to be a clEnqueueWriteBuffer, which
+    // overwrote the freshly computed C on the device with the stale host
+    // copy and never transferred the result to the host.
+    err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                              buffer_.offc_ * sizeof(cl_float2),
+                              buffer_.ldc_ * buffer_.c_num_vectors_ *
+                              sizeof(cl_float2),
+                              buffer_.cpuC_, 0, NULL, &event_);
+    OPENCL_V_THROW(err, "reading buffer C");
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Times one clblasZher2k call: the timer is started before the call is
+// issued and stopped once its completion event has been waited on.
+template<>
+void
+xHer2k<cl_double2>::call_func()
+{
+ timer.Start(timer_id);
+
+ clblasZher2k(order_, buffer_.uplo_, buffer_.transA_,
+ buffer_.N_, buffer_.K_, buffer_.alpha_,
+ buffer_.A_, buffer_.offa_, buffer_.lda_,
+ buffer_.B_, buffer_.offb_, buffer_.ldb_,
+ buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, // only the first component of beta_ is passed
+ buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+// Timed ZHER2K round trip: create the device buffers, upload the host data,
+// run clblasZher2k and read C back into the host array.
+template<>
+void
+xHer2k<cl_double2>::roundtrip_func()
+{
+    timer.Start(timer_id);
+    cl_int err;
+    buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                 buffer_.offa_) * sizeof(cl_double2),
+                                NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+    buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                 buffer_.offb_) * sizeof(cl_double2),
+                                NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer B");
+    buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                 buffer_.offc_) * sizeof(cl_double2),
+                                NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer C");
+    this->initialize_gpu_buffer();
+
+    clblasZher2k(order_, buffer_.uplo_, buffer_.transA_,
+                 buffer_.N_, buffer_.K_, buffer_.alpha_,
+                 buffer_.A_, buffer_.offa_, buffer_.lda_,
+                 buffer_.B_, buffer_.offb_, buffer_.ldb_,
+                 buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+                 buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+
+    // Read the result back. This used to be a clEnqueueWriteBuffer, which
+    // overwrote the freshly computed C on the device with the stale host
+    // copy and never transferred the result to the host.
+    err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                              buffer_.offc_ * sizeof(cl_double2),
+                              buffer_.ldc_ * buffer_.c_num_vectors_ *
+                              sizeof(cl_double2),
+                              buffer_.cpuC_, 0, NULL, &event_);
+    OPENCL_V_THROW(err, "reading buffer C");
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+#endif // ifndef CLBLAS_BENCHMARK_XHER2K_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xherk.hpp b/src/client/clfunc_xherk.hpp
new file mode 100644
index 0000000..110c107
--- /dev/null
+++ b/src/client/clfunc_xherk.hpp
@@ -0,0 +1,531 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XHERK_HXX__
+#define CLBLAS_BENCHMARK_XHERK_HXX__
+
+#include "clfunc_common.hpp"
+
+// Problem description, device buffers and host arrays for one HERK
+// benchmark case. Offsets and leading dimensions are counted in elements of T.
+template <typename T>
+struct xHerkBuffer
+{
+ clblasOrder order_; // xHerk assigns the unqualified order_ rather than this field
+ clblasUplo uplo_; // triangle chosen from uplo_option
+ clblasTranspose transA_; // transpose mode chosen from transA_option
+ size_t N_; // N argument handed to clblas{C,Z}herk
+ size_t K_; // K argument handed to clblas{C,Z}herk
+ T alpha_; // built with makeScalar<T>(alpha); only s[0] is passed to the BLAS call
+ cl_mem A_; // device buffer holding A
+ size_t offa_; // offset of A inside A_
+ size_t lda_; // leading dimension of A
+ T beta_; // built with makeScalar<T>(beta); only s[0] is passed to the BLAS call
+ cl_mem C_; // device buffer holding C
+ size_t offc_; // offset of C inside C_
+ size_t ldc_; // leading dimension of C
+ size_t a_num_vectors_; // A occupies lda_ * a_num_vectors_ elements
+ size_t c_num_vectors_; // C occupies ldc_ * c_num_vectors_ elements
+ T* cpuA_; // host array of lda_ * a_num_vectors_ elements
+ T* cpuC_; // host array of ldc_ * c_num_vectors_ elements
+}; // struct buffer
+
+template <typename T>
+class xHerk : public clblasFunc
+{
+public:
+ // Forwards the timer and device type to clblasFunc, then asks the timer
+ // for an ID under the name "clHerk". The value returned by getUniqueID()
+ // is not stored here.
+ xHerk(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType)
+ {
+ timer.getUniqueID("clHerk", 0);
+ }
+
+ // Does nothing: host arrays and device buffers are released by
+ // releaseGPUBuffer_deleteCPUBuffer(), which runs before destruction.
+ ~xHerk()
+ {
+ }
+
+ // Returns 4*K*N*(N+1) divided by time_in_ns().
+ double gflops()
+ {
+ return static_cast<double>(4*(buffer_.K_ * buffer_.N_ * (buffer_.N_+1))/time_in_ns());
+ }
+
+ // Textual form of the expression evaluated by gflops().
+ std::string gflops_formula()
+ {
+ return "4*K*N*(N+1)/time";
+ }
+
+ // Prepares one HERK benchmark case: records the problem description in
+ // buffer_, validates or defaults the leading dimensions, allocates the
+ // host arrays and creates the device buffers.
+ //
+ // order_option:  0 selects row-major, any other value column-major.
+ // uplo_option:   0 selects the upper triangle, any other value the lower.
+ // transA_option: 0 = no transpose, 1 = transpose, 2 = conjugate transpose.
+ // lda/ldc:       0 requests the smallest legal leading dimension; a
+ //                non-zero value below that minimum is reported on stderr
+ //                and terminates the program with exit(1).
+ // offA/offC:     element offsets of A and C inside their buffers.
+ // side_option, diag_option, transB_option, M, ldb and offB are only
+ // handed to the DUMMY_ARGS_USAGE macros.
+ void setup_buffer(int order_option, int side_option, int
+ uplo_option, int diag_option, int transA_option, int
+ transB_option, size_t M, size_t N, size_t K,
+ size_t lda, size_t ldb, size_t ldc,size_t offA,
+ size_t offB, size_t offC, double alpha,
+ double beta)
+ {
+     DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+     DUMMY_ARGS_USAGE_2(ldb, offB);
+
+     initialize_scalars(alpha,beta);
+
+     buffer_.N_ = N;
+     buffer_.K_ = K;
+     buffer_.offa_ = offA;
+     buffer_.offc_ = offC;
+
+     buffer_.uplo_ = (uplo_option == 0) ? clblasUpper : clblasLower;
+
+     // C always has N vectors, and N is the smallest accepted ldc.
+     if (ldc == 0)
+     {
+         buffer_.ldc_ = N;
+     }
+     else if (ldc < N)
+     {
+         // Stop here exactly like the lda check does; carrying on would
+         // size the C buffers from an ldc_ that was never assigned.
+         std::cerr << "ldc:wrong size\n";
+         exit(1);
+     }
+     else
+     {
+         buffer_.ldc_ = ldc;
+     }
+     buffer_.c_num_vectors_ = N;
+
+     const bool rowMajor = (order_option == 0);
+     const bool noTrans = (transA_option == 0);
+     order_ = rowMajor ? clblasRowMajor : clblasColumnMajor;
+
+     // Any transA_option other than 0, 1 or 2 leaves transA_ untouched.
+     if (noTrans)
+     {
+         buffer_.transA_ = clblasNoTrans;
+     }
+     else if (transA_option == 1)
+     {
+         buffer_.transA_ = clblasTrans;
+     }
+     else if (transA_option == 2)
+     {
+         buffer_.transA_ = clblasConjTrans;
+     }
+
+     // Row-major/no-transpose and column-major/transpose store A as N
+     // vectors of at least K elements; the other two combinations store it
+     // as K vectors of at least N elements.
+     const bool nVectors = (rowMajor == noTrans);
+     const size_t aMinLd = nVectors ? K : N;
+     buffer_.a_num_vectors_ = nVectors ? N : K;
+
+     if (lda == 0)
+     {
+         buffer_.lda_ = aMinLd;
+     }
+     else if (lda < aMinLd)
+     {
+         std::cerr << "lda:wrong size\n";
+         exit(1);
+     }
+     else
+     {
+         buffer_.lda_ = lda;
+     }
+
+     buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+     buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+     // Device buffers hold the matrix plus its leading offset.
+     cl_int err;
+     buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                 (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                  buffer_.offa_) * sizeof(T),
+                                 NULL, &err);
+     OPENCL_V_THROW(err, "creating buffer A");
+
+     buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                 (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                  buffer_.offc_) * sizeof(T),
+                                 NULL, &err);
+     OPENCL_V_THROW(err, "creating buffer C");
+ }
+ // Fills the host copies of A and then C with pseudo-random values drawn
+ // after seeding rand() with a fixed value.
+ void initialize_cpu_buffer()
+ {
+     srand(10);
+
+     // Walking the flat index visits the elements in the same order as a
+     // vector-by-vector, element-by-element double loop.
+     const size_t aCount = buffer_.a_num_vectors_ * buffer_.lda_;
+     for (size_t idx = 0; idx < aCount; ++idx)
+     {
+         buffer_.cpuA_[idx] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+     }
+
+     const size_t cCount = buffer_.N_ * buffer_.ldc_;
+     for (size_t idx = 0; idx < cCount; ++idx)
+     {
+         buffer_.cpuC_[idx] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+     }
+ }
+ // Uploads the host copies of A and C into their device buffers, each at
+ // its own element offset, using blocking writes.
+ void initialize_gpu_buffer()
+ {
+     cl_int err;
+
+     err = clEnqueueWriteBuffer(queue_, buffer_.A_, CL_TRUE,
+                                buffer_.offa_ * sizeof(T),
+                                buffer_.lda_ * buffer_.a_num_vectors_ *
+                                sizeof(T),
+                                buffer_.cpuA_, 0, NULL, NULL);
+     OPENCL_V_THROW(err, "writing buffer A");
+
+     // C lives at offc_ (not offa_): that is where the BLAS call and the
+     // other C transfers in this class address it.
+     err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+                                buffer_.offc_ * sizeof(T),
+                                buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                sizeof(T),
+                                buffer_.cpuC_, 0, NULL, NULL);
+     OPENCL_V_THROW(err, "writing buffer C");
+ }
+ // Re-uploads the host copy of C to the device with a blocking write.
+ void reset_gpu_write_buffer()
+ {
+     cl_int err;
+
+     err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+                                buffer_.offc_ * sizeof(T),
+                                buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                sizeof(T),
+                                buffer_.cpuC_, 0, NULL, NULL);
+     // The status used to be stored and never inspected.
+     OPENCL_V_THROW(err, "resetting buffer C");
+ }
+ void call_func();
+ // Copies C from the device back into the host array with a blocking read.
+ void read_gpu_buffer()
+ {
+     cl_int err;
+     err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                               buffer_.offc_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T),
+                               buffer_.cpuC_, 0, NULL, NULL);
+     // The status used to be stored and never inspected.
+     OPENCL_V_THROW(err, "reading buffer C");
+ }
+ void roundtrip_func();
+ // Host-side preparation for the round-trip benchmark: records the problem
+ // description in buffer_, validates or defaults the leading dimensions
+ // and allocates the host arrays. No device buffer is created here.
+ //
+ // order_option:  0 selects row-major, any other value column-major.
+ // uplo_option:   0 selects the upper triangle, any other value the lower.
+ // transA_option: 0 = no transpose, 1 = transpose, 2 = conjugate transpose.
+ // lda/ldc:       0 requests the smallest legal leading dimension; a
+ //                non-zero value below that minimum is reported on stderr
+ //                and terminates the program with exit(1).
+ // offA/offCY:    element offsets of A and C inside their buffers.
+ // side_option, diag_option, transB_option, M, ldb and offBX are only
+ // handed to the DUMMY_ARGS_USAGE macros.
+ void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+ int diag_option, int transA_option, int transB_option,
+ size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+ size_t ldc, size_t offA, size_t offBX, size_t offCY,
+ double alpha, double beta)
+ {
+     DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+     DUMMY_ARGS_USAGE_2(ldb, offBX);
+
+     initialize_scalars(alpha,beta);
+
+     buffer_.N_ = N;
+     buffer_.K_ = K;
+     buffer_.offa_ = offA;
+     buffer_.offc_ = offCY;
+
+     buffer_.uplo_ = (uplo_option == 0) ? clblasUpper : clblasLower;
+
+     // C always has N vectors, and N is the smallest accepted ldc.
+     if (ldc == 0)
+     {
+         buffer_.ldc_ = N;
+     }
+     else if (ldc < N)
+     {
+         // Stop here exactly like the lda check does; carrying on would
+         // size the C array from an ldc_ that was never assigned.
+         std::cerr << "ldc:wrong size\n";
+         exit(1);
+     }
+     else
+     {
+         buffer_.ldc_ = ldc;
+     }
+     buffer_.c_num_vectors_ = N;
+
+     const bool rowMajor = (order_option == 0);
+     const bool noTrans = (transA_option == 0);
+     order_ = rowMajor ? clblasRowMajor : clblasColumnMajor;
+
+     // Any transA_option other than 0, 1 or 2 leaves transA_ untouched.
+     if (noTrans)
+     {
+         buffer_.transA_ = clblasNoTrans;
+     }
+     else if (transA_option == 1)
+     {
+         buffer_.transA_ = clblasTrans;
+     }
+     else if (transA_option == 2)
+     {
+         buffer_.transA_ = clblasConjTrans;
+     }
+
+     // Row-major/no-transpose and column-major/transpose store A as N
+     // vectors of at least K elements; the other two combinations store it
+     // as K vectors of at least N elements.
+     const bool nVectors = (rowMajor == noTrans);
+     const size_t aMinLd = nVectors ? K : N;
+     buffer_.a_num_vectors_ = nVectors ? N : K;
+
+     if (lda == 0)
+     {
+         buffer_.lda_ = aMinLd;
+     }
+     else if (lda < aMinLd)
+     {
+         std::cerr << "lda:wrong size\n";
+         exit(1);
+     }
+     else
+     {
+         buffer_.lda_ = lda;
+     }
+
+     buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+     buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+ }
+ // Frees the host arrays and releases the device buffers of the current
+ // benchmark case.
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+     //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+     //need to do this before we eventually hit the destructor
+     // The host arrays are allocated with new T[...], so they must be
+     // released with delete[]; plain delete on them is undefined behaviour.
+     delete[] buffer_.cpuA_;
+     delete[] buffer_.cpuC_;
+     OPENCL_V_THROW( clReleaseMemObject(buffer_.A_), "releasing buffer A");
+     OPENCL_V_THROW( clReleaseMemObject(buffer_.C_), "releasing buffer C");
+ }
+protected:
+protected:
+ // Converts the double-precision alpha and beta into values of type T via
+ // makeScalar<T>() and stores them in buffer_.
+ void initialize_scalars(double alpha, double beta)
+ {
+ buffer_.alpha_ = makeScalar<T>(alpha);
+ buffer_.beta_ = makeScalar<T>(beta);
+ }
+
+private:
+ xHerkBuffer<T> buffer_;
+};
+
+// Times one clblasCherk call: the timer is started before the call is
+// issued and stopped once its completion event has been waited on.
+template<>
+void
+xHerk<cl_float2>::call_func()
+{
+ timer.Start(timer_id);
+
+ clblasCherk(order_, buffer_.uplo_, buffer_.transA_,
+ buffer_.N_, buffer_.K_, buffer_.alpha_.s[0], // only the first component of alpha_ is passed
+ buffer_.A_, buffer_.offa_, buffer_.lda_,
+ buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, // only the first component of beta_ is passed
+ buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+// Timed CHERK round trip: create the device buffers, upload the host data,
+// run clblasCherk and read C back into the host array.
+template<>
+void
+xHerk<cl_float2>::roundtrip_func()
+{
+    timer.Start(timer_id);
+    cl_int err;
+    buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                 buffer_.offa_) * sizeof(cl_float2),
+                                NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+
+    buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                 buffer_.offc_) * sizeof(cl_float2),
+                                NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer C");
+    this->initialize_gpu_buffer();
+
+    clblasCherk(order_, buffer_.uplo_, buffer_.transA_,
+                buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+                buffer_.A_, buffer_.offa_, buffer_.lda_,
+                buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+                buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+
+    // Read the result back. This used to be a clEnqueueWriteBuffer, which
+    // overwrote the freshly computed C on the device with the stale host
+    // copy and never transferred the result to the host.
+    err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                              buffer_.offc_ * sizeof(cl_float2),
+                              buffer_.ldc_ * buffer_.c_num_vectors_ *
+                              sizeof(cl_float2),
+                              buffer_.cpuC_, 0, NULL, &event_);
+    OPENCL_V_THROW(err, "reading buffer C");
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Times one clblasZherk call: the timer is started before the call is
+// issued and stopped once its completion event has been waited on.
+template<>
+void
+xHerk<cl_double2>::call_func()
+{
+ timer.Start(timer_id);
+
+ clblasZherk(order_, buffer_.uplo_, buffer_.transA_,
+ buffer_.N_, buffer_.K_, buffer_.alpha_.s[0], // only the first component of alpha_ is passed
+ buffer_.A_, buffer_.offa_, buffer_.lda_,
+ buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, // only the first component of beta_ is passed
+ buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+// Timed ZHERK round trip: create the device buffers, upload the host data,
+// run clblasZherk and read C back into the host array.
+template<>
+void
+xHerk<cl_double2>::roundtrip_func()
+{
+    timer.Start(timer_id);
+    cl_int err;
+    buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                 buffer_.offa_) * sizeof(cl_double2),
+                                NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+
+    buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                 buffer_.offc_) * sizeof(cl_double2),
+                                NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer C");
+    this->initialize_gpu_buffer();
+
+    clblasZherk(order_, buffer_.uplo_, buffer_.transA_,
+                buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+                buffer_.A_, buffer_.offa_, buffer_.lda_,
+                buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+                buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+
+    // Read the result back. This used to be a clEnqueueWriteBuffer, which
+    // overwrote the freshly computed C on the device with the stale host
+    // copy and never transferred the result to the host.
+    err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                              buffer_.offc_ * sizeof(cl_double2),
+                              buffer_.ldc_ * buffer_.c_num_vectors_ *
+                              sizeof(cl_double2),
+                              buffer_.cpuC_, 0, NULL, &event_);
+    OPENCL_V_THROW(err, "reading buffer C");
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+#endif // ifndef CLBLAS_BENCHMARK_XHERK_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xsymm.hpp b/src/client/clfunc_xsymm.hpp
index d067870..25a2924 100644
--- a/src/client/clfunc_xsymm.hpp
+++ b/src/client/clfunc_xsymm.hpp
@@ -342,7 +342,7 @@ void xSymm<T>::setup_buffer(int order_option, int side_option, int
buffer.a_num_vectors * buffer.lda*sizeof(T),
NULL, &err);
- buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.N*buffer.ldb*sizeof(T),
NULL, &err);
buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -428,7 +428,7 @@ void xSymm<cl_float>::roundtrip_func()
buffer.a_num_vectors * buffer.lda*sizeof(cl_float),
NULL, &err);
- buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.N*buffer.ldb*sizeof(cl_float),
NULL, &err);
buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -481,7 +481,7 @@ void xSymm<cl_double>::roundtrip_func()
buffer.a_num_vectors * buffer.lda*sizeof(cl_double),
NULL, &err);
- buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.N*buffer.ldb*sizeof(cl_double),
NULL, &err);
buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -534,7 +534,7 @@ void xSymm<cl_float2>::roundtrip_func()
buffer.a_num_vectors * buffer.lda*sizeof(cl_float2),
NULL, &err);
- buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.N*buffer.ldb*sizeof(cl_float2),
NULL, &err);
buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -582,12 +582,12 @@ void xSymm<cl_double2>::roundtrip_func()
{
timer.Start(timer_id);
//set up buffer
- cl_int err;
+ cl_int err;
buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.a_num_vectors * buffer.lda*sizeof(cl_double2),
NULL, &err);
- buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.N*buffer.ldb*sizeof(cl_double2),
NULL, &err);
buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
diff --git a/src/client/clfunc_xsyr2k.hpp b/src/client/clfunc_xsyr2k.hpp
index 9fb3381..087329e 100644
--- a/src/client/clfunc_xsyr2k.hpp
+++ b/src/client/clfunc_xsyr2k.hpp
@@ -61,15 +61,6 @@ public:
~xSyr2k()
{
- delete buffer_.a_;
- delete buffer_.b_;
- delete buffer_.c_;
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
- "releasing buffer A");
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
- "releasing buffer B");
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
- "releasing buffer C");
}
void call_func()
@@ -293,7 +284,7 @@ public:
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(T),
NULL, &err);
- buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldc_ * buffer_.c_num_vectors_ +
buffer_.offC_) * sizeof(T),
NULL, &err);
@@ -364,23 +355,227 @@ public:
}
 void read_gpu_buffer()
 {
- //cl_int err;
- //to-do need to fill up
+ cl_int err; // NOTE(review): assigned below but never inspected
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // CL_TRUE: blocking read
+ buffer_.offC_ * sizeof(T), // byte offset into buf_c_
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T), // byte count: ldc_ * c_num_vectors_ elements of T
+ buffer_.c_, 0, NULL, NULL); // destination is the host array c_
 }
 void roundtrip_func()
- {//to-do need to fill up
+ { // empty in this generic body; the per-type roundtrip_func specializations added by this patch do the timed work
 }
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
- {}
+ {
+ DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+
+ initialize_scalars(alpha, beta);
+
+ buffer_.n_ = N;
+ buffer_.k_ = K;
+ buffer_.offA_ = offA;
+ buffer_.offB_ = offBX;
+ buffer_.offC_ = offCY;
+
+ if (uplo_option == 0)
+ {
+ buffer_.uplo_ = clblasUpper;
+ }
+ else
+ {
+ buffer_.uplo_ = clblasLower;
+ }
+
+
+ if (ldc == 0)
+ {
+ buffer_.ldc_ = N;
+ }
+ else if (ldc < N)
+ {
+ std::cerr << "ldc:wrong size\n";
+ }
+ else
+ {
+ buffer_.ldc_ = ldc;
+ }
+ buffer_.c_num_vectors_ = N;
+
+ if (order_option == 0)
+ {
+ order_ = clblasRowMajor;
+ if (transA_option == 0)
+ {
+ buffer_.trans_ = clblasNoTrans;
+ buffer_.a_num_vectors_ = N;
+ buffer_.b_num_vectors_ = N;
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = K;
+ }
+ else if (ldb < K)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = K;
+ buffer_.b_num_vectors_ = K;
+ if (transA_option == 1)
+ {
+ buffer_.trans_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.trans_ = clblasConjTrans;
+ }
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = N;
+ }
+ else if (ldb < N)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ }
+ else
+ {
+ order_ = clblasColumnMajor;
+ if (transA_option == 0)
+ {
+ buffer_.a_num_vectors_ = K;
+ buffer_.b_num_vectors_ = K;
+ buffer_.trans_ = clblasNoTrans;
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = N;
+ }
+ else if (ldb < N)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = N;
+ buffer_.b_num_vectors_ = N;
+ if (transA_option == 1)
+ {
+ buffer_.trans_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.trans_ = clblasConjTrans;
+ }
+
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = K;
+ }
+ else if (ldb < K)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ }
+
+ buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+ buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+ buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+ }
void releaseGPUBuffer_deleteCPUBuffer()
{
//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
//need to do this before we eventually hit the destructor
- //to-do
+ delete buffer_.a_;
+ delete buffer_.b_;
+ delete buffer_.c_;
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+ "releasing buffer A");
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
+ "releasing buffer B");
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
+ "releasing buffer C");
}
protected:
void initialize_scalars(double alpha, double beta)
@@ -413,6 +608,41 @@ call_func()
 template<>
 void
+xSyr2k<float>::
+roundtrip_func() // float specialization: timed region covers buffer creation, initialize_gpu_buffer(), clblasSsyr2k and the read-back
+{
+ timer.Start(timer_id);
+ cl_int err; // NOTE(review): reassigned by each OpenCL call below, never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(float),
+ NULL, &err);
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(float),
+ NULL, &err);
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(float),
+ NULL, &err);
+ // Each device buffer holds ld * num_vectors + offset elements.
+ this->initialize_gpu_buffer();
+ clblasSsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+ buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into host array c_
+ buffer_.offC_ * sizeof(float),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(float),
+ buffer_.c_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+void
xSyr2k<double>::
call_func()
{
@@ -430,6 +660,41 @@ call_func()
 template<>
 void
+xSyr2k<double>::
+roundtrip_func() // double specialization: timed region covers buffer creation, initialize_gpu_buffer(), clblasDsyr2k and the read-back
+{
+ timer.Start(timer_id);
+ cl_int err; // NOTE(review): reassigned by each OpenCL call below, never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(double),
+ NULL, &err);
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(double),
+ NULL, &err);
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(double),
+ NULL, &err);
+ // Each device buffer holds ld * num_vectors + offset elements.
+ this->initialize_gpu_buffer();
+ clblasDsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+ buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into host array c_
+ buffer_.offC_ * sizeof(double),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(double),
+ buffer_.c_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+void
xSyr2k<cl_float2>::
call_func()
{
@@ -447,6 +712,56 @@ call_func()
 template<>
 void
+xSyr2k<cl_float2>::
+roundtrip_func() // cl_float2 specialization: timed region covers buffer creation, initialize_gpu_buffer(), clblasCsyr2k and the read-back
+{
+ timer.Start(timer_id);
+ cl_int err; // NOTE(review): reassigned by each OpenCL call below, never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(cl_float2),
+ NULL, &err);
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(cl_float2),
+ NULL, &err);
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(cl_float2),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+ // The BLAS call is enqueued without an output event (last argument NULL).
+ clblasCsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+ buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into host array c_
+ buffer_.offC_ * sizeof(cl_float2),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(cl_float2),
+ buffer_.c_, 0, NULL, &event_);
+ // Wait on the read's event, then stop the timer.
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+double
+xSyr2k<cl_float2>::gflops() // operation count 8*N*(N+1)*K (as printed by gflops_formula) over elapsed nanoseconds
+{
+    return 8.0 * buffer_.n_ * (buffer_.n_ + 1) * buffer_.k_ / time_in_ns(); // 8.0 keeps the product in floating point
+}
+
+template<>
+std::string
+xSyr2k<cl_float2>::gflops_formula() // text form of the rate formula for the cl_float2 specialization
+{
+ return "(8*N*(N+1)*K)/time";
+}
+
+template<>
+void
xSyr2k<cl_double2>::
call_func()
{
@@ -462,4 +777,53 @@ call_func()
timer.Stop(timer_id);
}
+template<>
+void
+xSyr2k<cl_double2>::
+roundtrip_func() // cl_double2 specialization: timed region covers buffer creation, initialize_gpu_buffer(), clblasZsyr2k and the read-back
+{
+ timer.Start(timer_id);
+ cl_int err; // NOTE(review): reassigned by each OpenCL call below, never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(cl_double2),
+ NULL, &err);
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(cl_double2),
+ NULL, &err);
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(cl_double2),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+ clblasZsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+ buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into host array c_
+ buffer_.offC_ * sizeof(cl_double2),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(cl_double2),
+ buffer_.c_, 0, NULL, &event_);
+ // Wait on the read's event, then stop the timer.
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+double
+xSyr2k<cl_double2>::gflops() // operation count 8*N*(N+1)*K (as printed by gflops_formula) over elapsed nanoseconds
+{
+    return 8.0 * buffer_.n_ * (buffer_.n_ + 1) * buffer_.k_ / time_in_ns(); // 8.0 keeps the product in floating point
+}
+
+template<>
+std::string
+xSyr2k<cl_double2>::gflops_formula() // text form of the rate formula for the cl_double2 specialization
+{
+ return "(8*N*(N+1)*K)/time";
+}
+
#endif // ifndef CLBLAS_BENCHMARK_XSYR2K_HXX__
diff --git a/src/client/clfunc_xsyrk.hpp b/src/client/clfunc_xsyrk.hpp
index ec842e2..c04cc1f 100644
--- a/src/client/clfunc_xsyrk.hpp
+++ b/src/client/clfunc_xsyrk.hpp
@@ -56,13 +56,7 @@ public:
~xSyrk()
{
- delete buffer_.a_;
- delete buffer_.c_;
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
- "releasing buffer A");
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
- "releasing buffer C");
- }
+ }
void call_func()
{
@@ -70,13 +64,12 @@ public:
double gflops()
{
- return buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns() +
- buffer_.n_*(buffer_.n_+1)/time_in_ns();
+ return buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
}
 std::string gflops_formula()
 {
- return "(N*(N+1)*K+N*(N+1))/time";
+ return "(N*(N+1)*K)/time"; // the former +N*(N+1) term is dropped, in step with gflops()
 }
void setup_buffer(int order_option, int side_option, int uplo_option,
@@ -224,7 +217,7 @@ public:
buffer_.offA_) * sizeof(T),
NULL, &err);
- buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldc_ * buffer_.c_num_vectors_ +
buffer_.offC_) * sizeof(T),
NULL, &err);
@@ -281,23 +274,163 @@ public:
}
 void read_gpu_buffer()
 {
- //cl_int err;
- //to-do need to fill up
+ cl_int err; // NOTE(review): assigned below but never inspected
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // CL_TRUE: blocking read
+ buffer_.offC_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T), // byte offset, then byte count
+ buffer_.c_, 0, NULL, NULL); // destination is the host array c_
 }
 void roundtrip_func()
- {//to-do need to fill up
+ { // empty in this generic body; the per-type roundtrip_func specializations added by this patch do the timed work
 }
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
- {}
+ {
+ DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+ DUMMY_ARGS_USAGE_2(ldb, offBX);
+
+ initialize_scalars(alpha, beta);
+
+ buffer_.n_ = N;
+ buffer_.k_ = K;
+ buffer_.offA_ = offA;
+ buffer_.offC_ = offCY;
+
+ if (uplo_option == 0)
+ {
+ buffer_.uplo_ = clblasUpper;
+ }
+ else
+ {
+ buffer_.uplo_ = clblasLower;
+ }
+
+
+ if (ldc == 0)
+ {
+ buffer_.ldc_ = N;
+ }
+ else if (ldc < N)
+ {
+ std::cerr << "ldc:wrong size\n";
+ }
+ else
+ {
+ buffer_.ldc_ = ldc;
+ }
+ buffer_.c_num_vectors_ = N;
+
+ if (order_option == 0)
+ {
+ order_ = clblasRowMajor;
+ if (transA_option == 0)
+ {
+ buffer_.trans_a_ = clblasNoTrans;
+ buffer_.a_num_vectors_ = N;
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = K;
+ if (transA_option == 1)
+ {
+ buffer_.trans_a_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.trans_a_ = clblasConjTrans;
+ }
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ }
+ else
+ {
+ order_ = clblasColumnMajor;
+ if (transA_option == 0)
+ {
+ buffer_.a_num_vectors_ = K;
+ buffer_.trans_a_ = clblasNoTrans;
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = N;
+ if (transA_option == 1)
+ {
+ buffer_.trans_a_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.trans_a_ = clblasConjTrans;
+ }
+
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ }
+
+ buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+ buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+ }
void releaseGPUBuffer_deleteCPUBuffer()
{
//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
//need to do this before we eventually hit the destructor
- //to-do
+ delete buffer_.a_;
+ delete buffer_.c_;
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+ "releasing buffer A");
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
+ "releasing buffer C");
}
protected:
void initialize_scalars(double alpha, double beta)
@@ -329,6 +462,35 @@ call_func()
 template<>
 void
+xSyrk<float>::roundtrip_func() // float specialization: timed region covers buffer creation, initialize_gpu_buffer(), clblasSsyrk and the read-back
+{
+ timer.Start(timer_id);
+ // Device buffers are created inside the timed region.
+ cl_int err; // NOTE(review): reassigned by each OpenCL call below, never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(float),
+ NULL, &err);
+ // C is created read-write; A above is created read-only.
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(float),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+ clblasSsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into host array c_
+ buffer_.offC_*sizeof(float), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(float),
+ buffer_.c_, 0, NULL, &event_);
+ // Wait on the read's event, then stop the timer.
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+void
xSyrk<double>::
call_func()
{
@@ -345,6 +507,35 @@ call_func()
 template<>
 void
+xSyrk<double>::roundtrip_func() // double specialization: timed region covers buffer creation, initialize_gpu_buffer(), clblasDsyrk and the read-back
+{
+ timer.Start(timer_id);
+ // Device buffers are created inside the timed region.
+ cl_int err; // NOTE(review): reassigned by each OpenCL call below, never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(double),
+ NULL, &err);
+ // C is created read-write; A above is created read-only.
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(double),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+ clblasDsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into host array c_
+ buffer_.offC_*sizeof(double), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(double),
+ buffer_.c_, 0, NULL, &event_);
+ // Wait on the read's event, then stop the timer.
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+void
xSyrk<cl_float2>::
call_func()
{
@@ -361,6 +552,48 @@ call_func()
 template<>
 void
+xSyrk<cl_float2>::roundtrip_func() // cl_float2 specialization: timed region covers buffer creation, initialize_gpu_buffer(), clblasCsyrk and the read-back
+{
+ timer.Start(timer_id);
+ // Device buffers are created inside the timed region.
+ cl_int err; // NOTE(review): reassigned by each OpenCL call below, never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(cl_float2),
+ NULL, &err);
+ // C is created read-write; A above is created read-only.
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(cl_float2),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+ clblasCsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into host array c_
+ buffer_.offC_*sizeof(cl_float2), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(cl_float2),
+ buffer_.c_, 0, NULL, &event_);
+ // Wait on the read's event, then stop the timer.
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+double
+xSyrk<cl_float2>::gflops() // operation count 4*N*(N+1)*K (as printed by gflops_formula) over elapsed nanoseconds
+{
+    return 4.0 * buffer_.n_ * (buffer_.n_ + 1) * buffer_.k_ / time_in_ns(); // 4.0 keeps the product in floating point
+}
+
+template<>
+std::string
+xSyrk<cl_float2>::gflops_formula() // text form of the rate formula for the cl_float2 specialization
+{
+ return "(4*N*(N+1)*K)/time";
+}
+template<>
+void
xSyrk<cl_double2>::
call_func()
{
@@ -375,4 +608,47 @@ call_func()
timer.Stop(timer_id);
}
+template<>
+void
+xSyrk<cl_double2>::roundtrip_func() // cl_double2 specialization: timed region covers buffer creation, initialize_gpu_buffer(), clblasZsyrk and the read-back
+{
+ timer.Start(timer_id);
+ // Device buffers are created inside the timed region.
+ cl_int err; // NOTE(review): reassigned by each OpenCL call below, never checked
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(cl_double2),
+ NULL, &err);
+ // C is created read-write; A above is created read-only.
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(cl_double2),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+ clblasZsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into host array c_
+ buffer_.offC_*sizeof(cl_double2), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(cl_double2),
+ buffer_.c_, 0, NULL, &event_);
+ // Wait on the read's event, then stop the timer.
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+double
+xSyrk<cl_double2>::gflops() // operation count 4*N*(N+1)*K (as printed by gflops_formula) over elapsed nanoseconds
+{
+    return 4.0 * buffer_.n_ * (buffer_.n_ + 1) * buffer_.k_ / time_in_ns(); // 4.0 keeps the product in floating point
+}
+
+template<>
+std::string
+xSyrk<cl_double2>::gflops_formula() // text form of the rate formula for the cl_double2 specialization
+{
+ return "(4*N*(N+1)*K)/time";
+}
+
#endif // ifndef CLBLAS_BENCHMARK_XSYRK_HXX__
diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp
index 6803457..a018e83 100644
--- a/src/client/clfunc_xtrmm.hpp
+++ b/src/client/clfunc_xtrmm.hpp
@@ -232,7 +232,7 @@ public:
buffer_.offA_) * sizeof(T),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(T),
NULL, &err);
@@ -498,7 +498,7 @@ roundtrip_func()
buffer_.offA_) * sizeof(cl_float),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_float),
NULL, &err);
@@ -562,7 +562,7 @@ roundtrip_func()
buffer_.offA_) * sizeof(cl_double),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_double),
NULL, &err);
@@ -626,7 +626,7 @@ roundtrip_func()
buffer_.offA_) * sizeof(cl_float2),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_float2),
NULL, &err);
@@ -690,7 +690,7 @@ roundtrip_func()
buffer_.offA_) * sizeof(cl_double2),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_double2),
NULL, &err);
diff --git a/src/client/clfunc_xtrsm.hpp b/src/client/clfunc_xtrsm.hpp
index 7a86be9..456c488 100644
--- a/src/client/clfunc_xtrsm.hpp
+++ b/src/client/clfunc_xtrsm.hpp
@@ -231,7 +231,7 @@ public:
buffer_.offA_) * sizeof(T),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(T),
NULL, &err);
@@ -504,7 +504,7 @@ roundtrip_func()
buffer_.offA_) * sizeof(cl_float),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_float),
NULL, &err);
@@ -567,7 +567,7 @@ roundtrip_func()
buffer_.offA_) * sizeof(cl_double),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_double),
NULL, &err);
@@ -630,7 +630,7 @@ roundtrip_func()
buffer_.offA_) * sizeof(cl_float2),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_float2),
NULL, &err);
@@ -693,7 +693,7 @@ roundtrip_func()
buffer_.offA_) * sizeof(cl_double2),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_double2),
NULL, &err);
diff --git a/src/client/client.cpp b/src/client/client.cpp
index 74a8eb8..4ce3f34 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -39,6 +39,8 @@
#include "clfunc_xhemv.hpp"
#include "clfunc_xhemm.hpp"
#include "clfunc_xsymm.hpp"
+#include "clfunc_xherk.hpp"
+#include "clfunc_xher2k.hpp"
namespace po = boost::program_options;
@@ -130,6 +132,8 @@ int main(int argc, char *argv[])
&& function != "hemv"
&& function != "hemm"
&& function != "symm"
+ && function != "herk"
+ && function != "her2k"
)
{
std::cerr << "Invalid value for --function" << std::endl;
@@ -432,6 +436,30 @@ int main(int argc, char *argv[])
return -1;
}
}
+ else if (function == "herk")
+ {
+ if (precision == "c")
+ my_function = new xHerk<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xHerk<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown her function" << std::endl;
+ return -1;
+ }
+ }
+ else if (function == "her2k")
+ {
+ if (precision == "c")
+ my_function = new xHer2k<cl_float2>(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xHer2k<cl_double2>(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown her2 function" << std::endl;
+ return -1;
+ }
+ }
else if (function == "symm")
{
if (precision == "s")
diff --git a/src/scripts/perf/measurePerformance.py b/src/scripts/perf/measurePerformance.py
index 659d4ef..f89674e 100644
--- a/src/scripts/perf/measurePerformance.py
+++ b/src/scripts/perf/measurePerformance.py
@@ -42,7 +42,7 @@ transvalues = ['none','transpose','conj']
sidevalues = ['left','right']
uplovalues = ['upper','lower']
diagvalues = ['unit','nonunit']
-functionvalues = ['gemm', 'trmm', 'trsm', 'syrk', 'syr2k', 'gemv', 'symv' ]
+functionvalues = ['gemm', 'trmm', 'trsm', 'syrk', 'syr2k', 'gemv', 'symv', 'symm', 'hemm', 'herk', 'her2k' ]
precisionvalues = ['s', 'd', 'c', 'z']
roundtripvalues = ['roundtrip','noroundtrip','both']
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits
mailing list