[clblas] 84/125: Merge branch 'develop' into master Bump version to v2.2
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Fri May 29 06:57:25 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clblas.
commit 54e949e2db8ee49a170b1bbc829ceed07f16533f
Merge: ac0cb67 f2de5e7
Author: Kent Knox <kent.knox at amd>
Date: Thu Jun 19 15:53:25 2014 -0500
Merge branch 'develop' into master
Bump version to v2.2
Conflicts:
README.md
src/CMakeLists.txt
.travis.yml | 44 ++
CHANGELOG | 31 -
CONTRIBUTING.md | 4 +-
README.md | 147 +++--
doc/clBLAS.doxy | 26 +-
src/CMakeLists.txt | 109 ++--
src/FindOpenCL.cmake | 56 +-
src/clAmdBlas.h | 3 -
src/clBLAS.def | 18 +-
src/clBLAS.h | 4 +
src/{version.h.in => clBLAS.version.h.in} | 0
src/client/CMakeLists.txt | 22 +-
src/client/clfunc_common.hpp | 13 +-
src/client/clfunc_xgemm.hpp | 605 +++++++++++--------
src/client/clfunc_xgemv.hpp | 6 +
src/client/clfunc_xger.hpp | 6 +
src/client/clfunc_xgerc.hpp | 7 +-
src/client/clfunc_xgeru.hpp | 7 +-
src/client/clfunc_xhemm.hpp | 254 +++++++-
src/client/clfunc_xhemv.hpp | 7 +-
src/client/clfunc_xher.hpp | 7 +-
src/client/clfunc_xher2.hpp | 6 +
src/client/clfunc_xher2k.hpp | 676 ++++++++++++++++++++++
src/client/clfunc_xherk.hpp | 535 +++++++++++++++++
src/client/clfunc_xsymm.hpp | 33 +-
src/client/clfunc_xsymv.hpp | 6 +
src/client/clfunc_xsyr.hpp | 6 +
src/client/clfunc_xsyr2.hpp | 7 +-
src/client/clfunc_xsyr2k.hpp | 406 ++++++++++++-
src/client/clfunc_xsyrk.hpp | 316 +++++++++-
src/client/clfunc_xtrmm.hpp | 33 +-
src/client/clfunc_xtrmv.hpp | 6 +
src/client/clfunc_xtrsm.hpp | 428 +++++++-------
src/client/clfunc_xtrsv.hpp | 6 +
src/client/client.cpp | 66 ++-
src/include/defbool.h | 6 +-
src/include/kern_cache.h | 1 +
src/include/kerngen.h | 6 +
src/include/trace_malloc.h | 2 +-
src/library/CMakeLists.txt | 108 ++--
src/library/blas/generic/common.c | 24 +-
src/library/blas/generic/kdump.c | 2 +-
src/library/blas/generic/solution_seq_make.c | 9 +-
src/library/blas/gens/asum.cpp | 8 +-
src/library/blas/gens/axpy_reg.cpp | 6 +-
src/library/blas/gens/copy_reg.cpp | 6 +-
src/library/blas/gens/dot.cpp | 6 +-
src/library/blas/gens/gbmv.cpp | 16 +-
src/library/blas/gens/gemm_cached.cpp | 36 +-
src/library/blas/gens/gemm_tail_cached.cpp | 37 +-
src/library/blas/gens/ger_lds.cpp | 6 +-
src/library/blas/gens/her2_lds.cpp | 10 +-
src/library/blas/gens/her_lds.cpp | 10 +-
src/library/blas/gens/iamax.cpp | 8 +-
src/library/blas/gens/kprintf.cpp | 2 +-
src/library/blas/gens/legacy/tests/CMakeLists.txt | 22 +-
src/library/blas/gens/legacy/tests/t_blkmul.c | 4 +
src/library/blas/gens/nrm2.cpp | 12 +-
src/library/blas/gens/reduction.cpp | 16 +-
src/library/blas/gens/rotg_reg.cpp | 4 +-
src/library/blas/gens/rotm_reg.cpp | 8 +-
src/library/blas/gens/rotmg_reg.cpp | 2 +-
src/library/blas/gens/scal_reg.cpp | 4 +-
src/library/blas/gens/swap_reg.cpp | 6 +-
src/library/blas/gens/symm_cached.cpp | 22 +-
src/library/blas/gens/syr2_lds.cpp | 6 +-
src/library/blas/gens/syr_lds.cpp | 6 +-
src/library/blas/gens/syrxk.c | 26 +-
src/library/blas/gens/tests/CMakeLists.txt | 22 +-
src/library/blas/gens/tests/t_tilemul.c | 5 +-
src/library/blas/gens/trmv_reg.cpp | 14 +-
src/library/blas/gens/trsv_gemv.cpp | 14 +-
src/library/blas/gens/trsv_trtri.cpp | 6 +-
src/library/blas/include/clblas-internal.h | 5 +
src/library/blas/init.c | 2 +-
src/library/blas/xaxpy.c | 4 +-
src/library/blas/xcopy.c | 4 +-
src/library/blas/xdot.c | 8 +-
src/library/blas/xgemm2.c | 8 +-
src/library/blas/xger.c | 8 +-
src/library/blas/xhemv.c | 10 +-
src/library/blas/xher.c | 6 +-
src/library/blas/xher2.c | 8 +-
src/library/blas/xher2k.c | 8 +-
src/library/blas/xherk.c | 6 +-
src/library/blas/xhpmv.c | 12 +-
src/library/blas/xrot.c | 4 +-
src/library/blas/xrotg.c | 12 +-
src/library/blas/xrotm.c | 6 +-
src/library/blas/xrotmg.c | 14 +-
src/library/blas/xscal.c | 2 +-
src/library/blas/xshbmv.c | 10 +-
src/library/blas/xspmv.c | 12 +-
src/library/blas/xswap.c | 4 +-
src/library/blas/xsymm.c | 10 +-
src/library/blas/xsymv.c | 10 +-
src/library/blas/xsyr.c | 6 +-
src/library/blas/xsyr2.c | 8 +-
src/library/blas/xsyr2k.c | 8 +-
src/library/blas/xsyrk.c | 6 +-
src/library/blas/xtbmv.c | 8 +-
src/library/blas/xtrmm.c | 10 +-
src/library/blas/xtrmv.c | 8 +-
src/library/blas/xtrsm.c | 10 +-
src/library/blas/xtrsv.c | 6 +-
src/library/clBLAS.pc.in | 12 +
src/library/common/kern_cache.c | 4 +-
src/library/common/tests/CMakeLists.txt | 23 +-
src/library/common/tests/t_gens_cache.c | 4 +
src/library/tools/ktest/CMakeLists.txt | 22 +-
src/library/tools/ktest/step.h | 4 +
src/library/tools/ktest/var.h | 4 +
src/library/tools/tplgen/configure.bat | 14 -
src/library/tools/tplgen/tplgen.cpp | 2 +-
src/library/tools/tune/CMakeLists.txt | 24 +-
src/library/tools/tune/storage_data.h | 5 +-
src/library/tools/tune/storage_io.c | 3 +-
src/library/tools/tune/subdim.c | 2 +-
src/library/tools/tune/toolslib.c | 1 -
src/library/tools/tune/toolslib.h | 4 +
src/library/tools/tune/tune.c | 92 ++-
src/samples/CMakeLists.pack | 18 +-
src/samples/CMakeLists.txt | 57 +-
src/scripts/perf/CMakeLists.txt | 8 +-
src/scripts/perf/measurePerformance.py | 11 +-
src/tests/BlasBase.cpp | 2 +
src/tests/CMakeLists.txt | 114 ++--
src/tests/copyTestDependencies.cmake.in | 97 ++++
src/tests/correctness/blas-lapack.c | 73 ++-
src/tests/correctness/blas-lapack.h | 2 +-
src/tests/correctness/corr-rotg.cpp | 4 -
src/tests/correctness/test-correctness.cpp | 8 +-
src/tests/include/BlasBase.h | 1 +
src/tests/include/blas-math.h | 6 +-
src/tests/include/timer.h | 6 +
src/tests/timer.c | 59 +-
src/wrappers/python/README.txt | 59 ++
src/wrappers/python/pyclBLAS.pxd | 85 +++
src/wrappers/python/pyclBLAS.pyx | 117 ++++
src/wrappers/python/setup.py | 107 ++++
140 files changed, 4417 insertions(+), 1294 deletions(-)
diff --cc README.md
index dfdaa64,728a3c0..5f0338e
--- a/README.md
+++ b/README.md
@@@ -1,162 -1,217 +1,217 @@@
clBLAS
=====
+ [![Build Status](https://travis-ci.org/clMathLibraries/clBLAS.png)](https://travis-ci.org/clMathLibraries/clBLAS)
+
+
+ This repository houses the code for the OpenCL™ BLAS portion of clMath.
+ The complete set of BLAS level 1, 2 & 3 routines is implemented. Please
+ see Netlib BLAS for the list of supported routines. In addition to GPU
+ devices, the library also supports running on CPU devices to facilitate
+ debugging and multicore programming. APPML 1.10 is the most current
+ generally available pre-packaged binary version of the library available
+ for download for both Linux and Windows platforms.
+
+ The primary goal of clBLAS is to make it easier for developers to
+ utilize the inherent performance and power efficiency benefits of
+ heterogeneous computing. clBLAS interfaces do not hide or wrap OpenCL
+ interfaces, but rather leave OpenCL state management to the control of
+ the user to allow for maximum performance and flexibility. The clBLAS
+ library does generate and enqueue optimized OpenCL kernels, relieving
+ the user from the task of writing, optimizing and maintaining kernel
+ code themselves.
- clMATH is a software library containing FFT and BLAS functions written in OpenCL. In addition to GPU devices, the libraries also support running on CPU devices to facilitate debugging and multicore programming.
+ ## clBLAS library user documentation
- <a href="http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-math-libraries/">APPML 1.10</a> is the most current generally available version of the library, and pre-built binaries are available for download on both Linux and Windows platforms.
+ [Library and API documentation][] for developers is available online as
+ a GitHub Pages website.
- This repository houses the code for the OpenCL™ BLAS portion of APPML. The complete set of BLAS level 1, 2 & 3 routines has been implemented. Please see <a href="http://www.netlib.org/blas/index.html"> Netlib BLAS </a> for the list of routines. For more information on supported graphics cards, see the <a href="http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/system-requirements-driver-compatibility/">AMD APP System Requirements</a>.
+ ### Google Groups
- The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing. clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility. The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code [...]
+ Two mailing lists have been created for the clMath projects:
- ## clBLAS library user documentation
- [Library and API documentation]( http://clmathlibraries.github.io/clBLAS/ ) for developers is available online as a GitHub Pages website
+ - [clmath at googlegroups.com][] - group whose focus is to answer
+ questions on using the library or reporting issues
+
+ - [clmath-developers at googlegroups.com][] - group whose focus is for
+ developers interested in contributing to the library code itself
## clBLAS Wiki
- The [project wiki](https://github.com/clMathLibraries/clBLAS/wiki) contains helpful documentation, including a [build primer](https://github.com/clMathLibraries/clBLAS/wiki/Build)
+
+ The [project wiki][] contains helpful documentation, including a [build
+ primer][].
## Contributing code
- Please refer to and read the [Contributing](CONTRIBUTING.md) document for guidelines on how to contribute code to this open source project
+
+ Please refer to and read the [Contributing][] document for guidelines on
+ how to contribute code to this open source project. The code in the
+ /master branch is considered to be stable, and all pull-requests should
+ be made against the /develop branch.
## License
- The source for clFFT is licensed under the [Apache License, Version 2.0]( http://www.apache.org/licenses/LICENSE-2.0 )
+
+ The source for clBLAS is licensed under the [Apache License, Version
+ 2.0][]
## Example
- The simple example below shows how to use clBLAS to compute an OpenCL accelerated SGEMM
- ```c
- #include <sys/types.h>
- #include <stdio.h>
+ The simple example below shows how to use clBLAS to compute an
+ OpenCL-accelerated SGEMM:
- /* Include the clBLAS header. It includes the appropriate OpenCL headers
+ #include <sys/types.h>
+ #include <stdio.h>
+
+ /* Include the clBLAS header. It includes the appropriate OpenCL headers
- */
+ */
- #include <clBLAS.h>
+ #include <clBLAS.h>
- /* This example uses predefined matrices and their characteristics for
+ /* This example uses predefined matrices and their characteristics for
- * simplicity purpose.
- */
+ * simplicity purpose.
+ */
- #define M 4
- #define N 3
- #define K 5
+ #define M 4
+ #define N 3
+ #define K 5
- static const cl_float alpha = 10;
+ static const cl_float alpha = 10;
- static const cl_float A[M*K] = {
+ static const cl_float A[M*K] = {
- 11, 12, 13, 14, 15,
- 21, 22, 23, 24, 25,
- 31, 32, 33, 34, 35,
- 41, 42, 43, 44, 45,
+ 11, 12, 13, 14, 15,
+ 21, 22, 23, 24, 25,
+ 31, 32, 33, 34, 35,
+ 41, 42, 43, 44, 45,
- };
- static const size_t lda = K; /* i.e. lda = K */
+ };
+ static const size_t lda = K; /* i.e. lda = K */
- static const cl_float B[K*N] = {
+ static const cl_float B[K*N] = {
- 11, 12, 13,
- 21, 22, 23,
- 31, 32, 33,
- 41, 42, 43,
- 51, 52, 53,
+ 11, 12, 13,
+ 21, 22, 23,
+ 31, 32, 33,
+ 41, 42, 43,
+ 51, 52, 53,
- };
- static const size_t ldb = N; /* i.e. ldb = N */
+ };
+ static const size_t ldb = N; /* i.e. ldb = N */
- static const cl_float beta = 20;
+ static const cl_float beta = 20;
- static cl_float C[M*N] = {
+ static cl_float C[M*N] = {
- 11, 12, 13,
- 21, 22, 23,
- 31, 32, 33,
- 41, 42, 43,
+ 11, 12, 13,
+ 21, 22, 23,
+ 31, 32, 33,
+ 41, 42, 43,
- };
- static const size_t ldc = N; /* i.e. ldc = N */
+ };
+ static const size_t ldc = N; /* i.e. ldc = N */
- static cl_float result[M*N];
+ static cl_float result[M*N];
- int main( void )
- {
+ int main( void )
+ {
- cl_int err;
- cl_platform_id platform = 0;
- cl_device_id device = 0;
- cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
- cl_context ctx = 0;
- cl_command_queue queue = 0;
- cl_mem bufA, bufB, bufC;
- cl_event event = NULL;
- int ret = 0;
-
- /* Setup OpenCL environment. */
- err = clGetPlatformIDs( 1, &platform, NULL );
- err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
-
- props[1] = (cl_context_properties)platform;
- ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
- queue = clCreateCommandQueue( ctx, device, 0, &err );
-
- /* Setup clBLAS */
- err = clblasSetup( );
-
- /* Prepare OpenCL memory objects and place matrices inside them. */
- bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
- NULL, &err );
- bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
- NULL, &err );
- bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
- NULL, &err );
-
- err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
- M * K * sizeof( *A ), A, 0, NULL, NULL );
- err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
- K * N * sizeof( *B ), B, 0, NULL, NULL );
- err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
- M * N * sizeof( *C ), C, 0, NULL, NULL );
-
- /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
- err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
- M, N, K,
- alpha, bufA, 0, lda,
- bufB, 0, ldb, beta,
- bufC, 0, ldc,
- 1, &queue, 0, NULL, &event );
-
- /* Wait for calculations to be finished. */
- err = clWaitForEvents( 1, &event );
-
- /* Fetch results of calculations from GPU memory. */
- err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
- M * N * sizeof(*result),
- result, 0, NULL, NULL );
-
- /* Release OpenCL memory objects. */
- clReleaseMemObject( bufC );
- clReleaseMemObject( bufB );
- clReleaseMemObject( bufA );
-
- /* Finalize work with clBLAS */
- clblasTeardown( );
-
- /* Release OpenCL working objects. */
- clReleaseCommandQueue( queue );
- clReleaseContext( ctx );
-
- return ret;
+ cl_int err;
+ cl_platform_id platform = 0;
+ cl_device_id device = 0;
+ cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+ cl_context ctx = 0;
+ cl_command_queue queue = 0;
+ cl_mem bufA, bufB, bufC;
+ cl_event event = NULL;
+ int ret = 0;
+
+ /* Setup OpenCL environment. */
+ err = clGetPlatformIDs( 1, &platform, NULL );
+ err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
+
+ props[1] = (cl_context_properties)platform;
+ ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
+ queue = clCreateCommandQueue( ctx, device, 0, &err );
+
+ /* Setup clBLAS */
+ err = clblasSetup( );
+
+ /* Prepare OpenCL memory objects and place matrices inside them. */
+ bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
+ NULL, &err );
+ bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
+ NULL, &err );
+ bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
+ NULL, &err );
+
+ err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
+ M * K * sizeof( *A ), A, 0, NULL, NULL );
+ err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
+ K * N * sizeof( *B ), B, 0, NULL, NULL );
+ err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
+ M * N * sizeof( *C ), C, 0, NULL, NULL );
+
+ /* Call clBLAS extended function. Perform gemm on the full matrices (all offsets are 0). */
+ err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
+ M, N, K,
+ alpha, bufA, 0, lda,
+ bufB, 0, ldb, beta,
+ bufC, 0, ldc,
+ 1, &queue, 0, NULL, &event );
+
+ /* Wait for calculations to be finished. */
+ err = clWaitForEvents( 1, &event );
+
+ /* Fetch results of calculations from GPU memory. */
+ err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
+ M * N * sizeof(*result),
+ result, 0, NULL, NULL );
+
+ /* Release OpenCL memory objects. */
+ clReleaseMemObject( bufC );
+ clReleaseMemObject( bufB );
+ clReleaseMemObject( bufA );
+
+ /* Finalize work with clBLAS */
+ clblasTeardown( );
+
+ /* Release OpenCL working objects. */
+ clReleaseCommandQueue( queue );
+ clReleaseContext( ctx );
+
+ return ret;
- }
- ```
+ }
## Build dependencies
+
### Library for Windows
- * Windows® 7/8
- * Visual Studio 2010 SP1
- * An OpenCL SDK, such as APP SDK 2.8
- * Latest CMake
+
+ - Windows® 7/8
+
+ - Visual Studio 2010 SP1, 2012
+
+ - An OpenCL SDK, such as APP SDK 2.9
+
+ - Latest CMake
### Library for Linux
- * GCC 4.6 and onwards
- * An OpenCL SDK, such as APP SDK 2.8
- * Latest CMake
+
+ - GCC 4.6 and onwards
+
+ - An OpenCL SDK, such as APP SDK 2.9
+
+ - Latest CMake
+
+ ### Library for Mac OSX
+
+ - Recommended to generate Unix makefiles with cmake
### Test infrastructure
- * Latest Googletest
- * Latest ACML
- * Latest Boost
+
+ - Googletest v1.6
+
+ - ACML on windows/linux; Accelerate on Mac OSX
+
+ - Latest Boost
### Performance infrastructure
- * Python
+
+ - Python
+
+ [Library and API documentation]: http://clmathlibraries.github.io/clBLAS/
+ [clmath at googlegroups.com]: https://groups.google.com/forum/#!forum/clmath
+ [clmath-developers at googlegroups.com]: https://groups.google.com/forum/#!forum/clmath-developers
+ [project wiki]: https://github.com/clMathLibraries/clBLAS/wiki
+ [build primer]: https://github.com/clMathLibraries/clBLAS/wiki/Build
+ [Contributing]: CONTRIBUTING.md
+ [Apache License, Version 2.0]: http://www.apache.org/licenses/LICENSE-2.0
diff --cc src/CMakeLists.txt
index 6602b79,4e0cc74..41b54ab
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@@ -41,9 -45,18 +45,18 @@@ else(
endif( )
# Define a version for the code
- set( clBLAS_VERSION_MAJOR 2 )
- set( clBLAS_VERSION_MINOR 0 )
- set( clBLAS_VERSION_PATCH 0 )
+ if( NOT DEFINED clBLAS_VERSION_MAJOR )
+ set( clBLAS_VERSION_MAJOR 2 )
+ endif( )
+
+ if( NOT DEFINED clBLAS_VERSION_MINOR )
- set( clBLAS_VERSION_MINOR 1 )
++ set( clBLAS_VERSION_MINOR 2 )
+ endif( )
+
+ if( NOT DEFINED clBLAS_VERSION_PATCH )
+ set( clBLAS_VERSION_PATCH 0 )
+ endif( )
+
set( clBLAS_VERSION "${clBLAS_VERSION_MAJOR}.${clBLAS_VERSION_MINOR}.${clBLAS_VERSION_PATCH}")
# Increment this if we break backward compatibility.
@@@ -98,19 -121,14 +121,14 @@@ endif(
# TODO: maybe this could be written using the FindBLAS module in the future
if( BUILD_TEST )
if(NOT CORR_TEST_WITH_ACML)
- if( WIN32 )
+ if(APPLE)
+ find_library(BLAS_LIBRARIES Accelerate)
+ MARK_AS_ADVANCED(BLAS_LIBRARIES)
+ message(STATUS "Using Accelerate framework on Mac OS-X")
+ else()
find_package( Netlib COMPONENTS BLAS REQUIRED )
+ endif()
- else( )
+ else( )
- if( $ENV{REFBLAS_ROOT} )
- set( REFBLAS_ROOT $ENV{REFBLAS_ROOT} CACHE PATH "NetLib BLAS root path")
- else( )
- message(FATAL_ERROR "Cannot find reference BLAS, please set REFBLAS_ROOT environment variable")
- endif( )
-
- # Find reference BLAS implementation
- include( ${REFBLAS_ROOT}/package/cmake/exportBLAS.cmake )
- endif( )
- else( )
# Find ACML BLAS implementation
# platform dependent ACML subdirectory
if (WIN32)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits
mailing list