[clblas] 60/61: merge develop branch to master branch. Bump master branch version number to 2.6
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Fri Jul 24 22:49:50 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master in repository clblas.
commit 3f032e79c57e6187bd45239d11937ebe550216ac
Merge: a6b3f9d 5005205
Author: Timmy <timmy.liu at amd.com>
Date: Wed Jul 1 02:31:19 2015 -0500
merge develop branch to master branch. Bump master branch version number to 2.6
.gitignore | 3 +
.travis.yml | 22 +-
LICENSE | 25 -
README.md | 65 +-
doc/README-BinaryCacheOnDisk.txt | 69 +
doc/README-FunctorConcepts.txt | 100 +
doc/README-HowToIntroduceFunctors.txt | 402 ++
doc/README-TransformASolverIntoAFunctor.txt | 382 ++
doc/performance/clBLAS_2.6.0/S9150/README.txt | 35 +
doc/performance/clBLAS_2.6.0/S9150/dgemm_32.csv | 181 +
doc/performance/clBLAS_2.6.0/S9150/dgemm_96.csv | 61 +
doc/performance/clBLAS_2.6.0/S9150/dtrsm_192.csv | 31 +
.../clBLAS_2.6.0/S9150/generate_graphs.sh | 92 +
doc/performance/clBLAS_2.6.0/S9150/peak_dp.csv | 181 +
doc/performance/clBLAS_2.6.0/S9150/peak_sp.csv | 181 +
doc/performance/clBLAS_2.6.0/S9150/sgemm_32.csv | 181 +
doc/performance/clBLAS_2.6.0/S9150/zgemm_32.csv | 181 +
doc/performance/clBLAS_2.6.0/S9150/zgemm_64.csv | 91 +
doc/performance/cuBLAS_7.0/Tesla_K40/README.txt | 35 +
doc/performance/cuBLAS_7.0/Tesla_K40/dgemm.csv | 181 +
doc/performance/cuBLAS_7.0/Tesla_K40/dtrsm.csv | 31 +
doc/performance/cuBLAS_7.0/Tesla_K40/peak_dp.csv | 181 +
doc/performance/cuBLAS_7.0/Tesla_K40/peak_sp.csv | 181 +
doc/performance/cuBLAS_7.0/Tesla_K40/sgemm.csv | 181 +
doc/performance/cuBLAS_7.0/Tesla_K40/zgemm.csv | 181 +
src/CMakeLists.txt | 83 +-
src/FindOpenCL.cmake | 3 +-
src/clBLAS.def | 28 +
src/clBLAS.h | 622 ++
src/client/clfunc_common.hpp | 1 +
src/client/clfunc_xgemm.hpp | 53 +-
src/client/clfunc_xtrsm.hpp | 14 +-
src/client/client.cpp | 21 +-
src/flags_public.txt | 4 +
src/include/binary_lookup.h | 273 +
src/include/devinfo.h | 2 +
src/include/md5sum.h | 50 +
src/include/rwlock.h | 117 +
src/library/CMakeLists.txt | 282 +-
src/library/bingen.cmake | 144 +
src/library/blas/fill.cc | 272 +
src/library/blas/functor/bonaire.cc | 90 +
src/library/blas/functor/functor.cc | 117 +
src/library/blas/functor/functor_fill.cc | 156 +
src/library/blas/functor/functor_selector.cc | 344 ++
src/library/blas/functor/functor_xgemm.cc | 323 +
src/library/blas/functor/functor_xscal.cc | 410 ++
src/library/blas/functor/functor_xscal_generic.cc | 439 ++
src/library/blas/functor/functor_xtrsm.cc | 336 ++
src/library/blas/functor/gcn_dgemm.cc | 1035 ++++
src/library/blas/functor/gcn_dgemmCommon.cc | 997 +++
src/library/blas/functor/gcn_dgemmSmallMatrices.cc | 654 ++
src/library/blas/functor/gcn_sgemm.cc | 556 ++
src/library/blas/functor/gcn_sgemmSmallMatrices.cc | 558 ++
src/library/blas/functor/gcn_zgemm.cc | 354 ++
src/library/blas/functor/gpu_dtrsm.cc | 823 +++
src/library/blas/functor/gpu_dtrsm192.cc | 596 ++
src/library/blas/functor/hawaii.cc | 223 +
.../blas/functor/hawaii_dgemmChannelConflict.cc | 159 +
.../blas/functor/hawaii_dgemmSplitKernel.cc | 670 ++
.../blas/functor/hawaii_sgemmBranchKernel.cc | 442 ++
src/library/blas/functor/hawaii_sgemmSplit64_32.cc | 423 ++
.../blas/functor/hawaii_sgemmSplitKernel.cc | 858 +++
src/library/blas/functor/include/BinaryBuild.h | 10 +
src/library/blas/functor/include/atomic_counter.h | 173 +
src/library/blas/functor/include/bonaire.h | 41 +
src/library/blas/functor/include/functor.h | 496 ++
src/library/blas/functor/include/functor_fill.h | 99 +
.../functor/include/functor_hawaii_dgemm_NT_MN48.h | 210 +
.../blas/functor/include/functor_selector.h | 149 +
src/library/blas/functor/include/functor_utils.h | 116 +
src/library/blas/functor/include/functor_xgemm.h | 213 +
src/library/blas/functor/include/functor_xscal.h | 207 +
.../blas/functor/include/functor_xscal_generic.h | 173 +
src/library/blas/functor/include/functor_xtrsm.h | 203 +
src/library/blas/functor/include/gcn_dgemm.h | 59 +
src/library/blas/functor/include/gcn_dgemmCommon.h | 22 +
.../blas/functor/include/gcn_dgemmSmallMatrices.h | 27 +
src/library/blas/functor/include/gcn_sgemm.h | 62 +
.../blas/functor/include/gcn_sgemmSmallMatrices.h | 27 +
src/library/blas/functor/include/gcn_zgemm.h | 62 +
src/library/blas/functor/include/gpu_dtrsm.h | 28 +
src/library/blas/functor/include/gpu_dtrsm192.h | 28 +
src/library/blas/functor/include/hawaii.h | 42 +
.../functor/include/hawaii_dgemmChannelConflict.h | 22 +
.../blas/functor/include/hawaii_dgemmSplitKernel.h | 46 +
.../functor/include/hawaii_sgemmBranchKernel.h | 50 +
.../blas/functor/include/hawaii_sgemmSplit64_32.h | 46 +
.../blas/functor/include/hawaii_sgemmSplitKernel.h | 46 +
src/library/blas/functor/include/tahiti.h | 41 +
src/library/blas/functor/tahiti.cc | 120 +
src/library/blas/generic/binary_lookup.cc | 685 +++
src/library/blas/generic/common.c | 25 +-
src/library/blas/generic/common2.cc | 98 +
src/library/blas/generic/functor_cache.cc | 80 +
src/library/blas/generic/solution_seq_make.c | 4 +-
src/library/blas/gens/blas_kgen.h | 3 -
src/library/blas/gens/blas_subgroup.c | 6 +-
src/library/blas/gens/clTemplates/dgemm_NT_MN48.cl | 347 ++
.../gens/clTemplates/dgemm_gcn_SmallMatrices.cl | 1159 ++++
src/library/blas/gens/clTemplates/dgemm_hawai.cl | 6371 ++++++++++++++++++++
.../clTemplates/dgemm_hawaiiChannelConfilct.cl | 152 +
.../gens/clTemplates/dgemm_hawaiiSplitKernel.cl | 5043 ++++++++++++++++
src/library/blas/gens/clTemplates/dtrsm_gpu.cl | 2004 ++++++
src/library/blas/gens/clTemplates/dtrsm_gpu192.cl | 1031 ++++
src/library/blas/gens/clTemplates/sgemm_gcn.cl | 2083 +++++++
.../gens/clTemplates/sgemm_gcn_SmallMatrices.cl | 1036 ++++
.../gens/clTemplates/sgemm_hawaiiSplit64_32.cl | 530 ++
.../gens/clTemplates/sgemm_hawaiiSplitKernel.cl | 6179 +++++++++++++++++++
src/library/blas/gens/clTemplates/zgemm_gcn.cl | 319 +
src/library/blas/include/clblas-internal.h | 28 +
src/library/blas/init.c | 12 +
src/library/blas/matrix.c | 979 +++
src/library/blas/xgemm.c | 783 ---
src/library/blas/xgemm.cc | 328 +
src/library/blas/xscal.cc | 340 ++
src/library/blas/xtrsm.c | 249 -
src/library/blas/xtrsm.cc | 333 +
src/library/common/devinfo.c | 6 +
src/library/common/md5sum.c | 378 ++
src/library/common/rwlock.c | 172 +
.../perf => library/tools/bingen}/CMakeLists.txt | 23 +-
src/library/tools/bingen/bingen.cpp | 512 ++
src/library/tools/ktest/CMakeLists.txt | 34 +-
src/library/tools/tplgen/tplgen.cpp | 85 +-
src/library/tools/tune/CMakeLists.txt | 33 +-
src/library/tools/tune/tune.c | 5 +-
src/samples/CMakeLists.txt | 21 +-
src/samples/example_csscal.c | 3 +-
src/scripts/perf/CMakeLists.txt | 6 +-
src/scripts/perf/blasPerformanceTesting.py | 4 +-
src/tests/CMakeLists.txt | 28 +-
src/tests/correctness/test-correctness.cpp | 3 +-
src/tests/performance/test-performance.cpp | 5 +-
134 files changed, 48858 insertions(+), 1271 deletions(-)
diff --cc README.md
index 433dde5,9847dd6..0148627
--- a/README.md
+++ b/README.md
@@@ -104,108 -120,96 +119,96 @@@ The simple example below shows how to u
int main( void )
{
- cl_int err;
- cl_platform_id platform = 0;
- cl_device_id device = 0;
- cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
- cl_context ctx = 0;
- cl_command_queue queue = 0;
- cl_mem bufA, bufB, bufC;
- cl_event event = NULL;
- int ret = 0;
-
- /* Setup OpenCL environment. */
- err = clGetPlatformIDs( 1, &platform, NULL );
- err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
-
- props[1] = (cl_context_properties)platform;
- ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
- queue = clCreateCommandQueue( ctx, device, 0, &err );
-
- /* Setup clBLAS */
- err = clblasSetup( );
-
- /* Prepare OpenCL memory objects and place matrices inside them. */
- bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
- NULL, &err );
- bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
- NULL, &err );
- bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
- NULL, &err );
-
- err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
- M * K * sizeof( *A ), A, 0, NULL, NULL );
- err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
- K * N * sizeof( *B ), B, 0, NULL, NULL );
- err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
- M * N * sizeof( *C ), C, 0, NULL, NULL );
-
- /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
- err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
- M, N, K,
- alpha, bufA, 0, lda,
- bufB, 0, ldb, beta,
- bufC, 0, ldc,
- 1, &queue, 0, NULL, &event );
-
- /* Wait for calculations to be finished. */
- err = clWaitForEvents( 1, &event );
-
- /* Fetch results of calculations from GPU memory. */
- err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
- M * N * sizeof(*result),
- result, 0, NULL, NULL );
-
- /* Release OpenCL memory objects. */
- clReleaseMemObject( bufC );
- clReleaseMemObject( bufB );
- clReleaseMemObject( bufA );
-
- /* Finalize work with clBLAS */
- clblasTeardown( );
-
- /* Release OpenCL working objects. */
- clReleaseCommandQueue( queue );
- clReleaseContext( ctx );
-
- return ret;
+ cl_int err;
+ cl_platform_id platform = 0;
+ cl_device_id device = 0;
+ cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+ cl_context ctx = 0;
+ cl_command_queue queue = 0;
+ cl_mem bufA, bufB, bufC;
+ cl_event event = NULL;
+ int ret = 0;
+
+ /* Setup OpenCL environment. */
+ err = clGetPlatformIDs( 1, &platform, NULL );
+ err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
+
+ props[1] = (cl_context_properties)platform;
+ ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
+ queue = clCreateCommandQueue( ctx, device, 0, &err );
+
+ /* Setup clBLAS */
+ err = clblasSetup( );
+
+ /* Prepare OpenCL memory objects and place matrices inside them. */
+ bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
+ NULL, &err );
+ bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
+ NULL, &err );
+ bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
+ NULL, &err );
+
+ err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
+ M * K * sizeof( *A ), A, 0, NULL, NULL );
+ err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
+ K * N * sizeof( *B ), B, 0, NULL, NULL );
+ err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
+ M * N * sizeof( *C ), C, 0, NULL, NULL );
+
+ /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
+ err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
+ M, N, K,
+ alpha, bufA, 0, lda,
+ bufB, 0, ldb, beta,
+ bufC, 0, ldc,
+ 1, &queue, 0, NULL, &event );
+
+ /* Wait for calculations to be finished. */
+ err = clWaitForEvents( 1, &event );
+
+ /* Fetch results of calculations from GPU memory. */
+ err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
+ M * N * sizeof(*result),
+ result, 0, NULL, NULL );
+
+ /* Release OpenCL memory objects. */
+ clReleaseMemObject( bufC );
+ clReleaseMemObject( bufB );
+ clReleaseMemObject( bufA );
+
+ /* Finalize work with clBLAS */
+ clblasTeardown( );
+
+ /* Release OpenCL working objects. */
+ clReleaseCommandQueue( queue );
+ clReleaseContext( ctx );
+
+ return ret;
}
+ ```
## Build dependencies
-
### Library for Windows
-
- - Windows® 7/8
-
- - Visual Studio 2010 SP1, 2012
-
- - An OpenCL SDK, such as APP SDK 2.9
-
- - Latest CMake
+ * Windows® 7/8
+ * Visual Studio 2010 SP1, 2012
+ * An OpenCL SDK, such as APP SDK 2.8
+ * Latest CMake
### Library for Linux
-
- - GCC 4.6 and onwards
-
- - An OpenCL SDK, such as APP SDK 2.9
-
- - Latest CMake
+ * GCC 4.6 and onwards
+ * An OpenCL SDK, such as APP SDK 2.9
+ * Latest CMake
### Library for Mac OSX
-
- - Recommended to generate Unix makefiles with cmake
+ * Recommended to generate Unix makefiles with cmake
### Test infrastructure
-
- - Googletest v1.6
-
- - ACML on windows/linux; Accelerate on Mac OSX
-
- - Latest Boost
+ * Googletest v1.6
+ * ACML on windows/linux; Accelerate on Mac OSX
+ * Latest Boost
### Performance infrastructure
-
- - Python
+ * Python
[Library and API documentation]: http://clmathlibraries.github.io/clBLAS/
[clmath at googlegroups.com]: https://groups.google.com/forum/#!forum/clmath
diff --cc src/CMakeLists.txt
index 389a9a2,146ac7a..d4ee66a
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@@ -51,7 -105,7 +105,7 @@@ if( NOT DEFINED clBLAS_VERSION_MAJOR
endif( )
if( NOT DEFINED clBLAS_VERSION_MINOR )
- set( clBLAS_VERSION_MINOR 4 )
- set( clBLAS_VERSION_MINOR 5 )
++ set( clBLAS_VERSION_MINOR 6 )
endif( )
if( NOT DEFINED clBLAS_VERSION_PATCH )
@@@ -121,14 -189,14 +189,14 @@@ endif(
# TODO: maybe this could be written using the FindBLAS module in the future
if( BUILD_TEST )
if(NOT CORR_TEST_WITH_ACML)
- if(APPLE)
- find_library(BLAS_LIBRARIES Accelerate)
+ if(APPLE)
+ find_library(BLAS_LIBRARIES Accelerate HINTS /System/Library/Frameworks/Accelerate.framework)
MARK_AS_ADVANCED(BLAS_LIBRARIES)
message(STATUS "Using Accelerate framework on Mac OS-X")
- else()
+ else()
find_package( Netlib COMPONENTS BLAS REQUIRED )
- endif()
+ endif()
- else( )
+ else( )
# Find ACML BLAS implementation
# platform dependent ACML subdirectory
if (WIN32)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits mailing list