[clblas] 60/61: merge develop branch to master branch. Bump master branch version number to 2.6

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Fri Jul 24 22:49:50 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clblas.

commit 3f032e79c57e6187bd45239d11937ebe550216ac
Merge: a6b3f9d 5005205
Author: Timmy <timmy.liu at amd.com>
Date:   Wed Jul 1 02:31:19 2015 -0500

    merge develop branch to master branch. Bump master branch version number to 2.6

 .gitignore                                         |    3 +
 .travis.yml                                        |   22 +-
 LICENSE                                            |   25 -
 README.md                                          |   65 +-
 doc/README-BinaryCacheOnDisk.txt                   |   69 +
 doc/README-FunctorConcepts.txt                     |  100 +
 doc/README-HowToIntroduceFunctors.txt              |  402 ++
 doc/README-TransformASolverIntoAFunctor.txt        |  382 ++
 doc/performance/clBLAS_2.6.0/S9150/README.txt      |   35 +
 doc/performance/clBLAS_2.6.0/S9150/dgemm_32.csv    |  181 +
 doc/performance/clBLAS_2.6.0/S9150/dgemm_96.csv    |   61 +
 doc/performance/clBLAS_2.6.0/S9150/dtrsm_192.csv   |   31 +
 .../clBLAS_2.6.0/S9150/generate_graphs.sh          |   92 +
 doc/performance/clBLAS_2.6.0/S9150/peak_dp.csv     |  181 +
 doc/performance/clBLAS_2.6.0/S9150/peak_sp.csv     |  181 +
 doc/performance/clBLAS_2.6.0/S9150/sgemm_32.csv    |  181 +
 doc/performance/clBLAS_2.6.0/S9150/zgemm_32.csv    |  181 +
 doc/performance/clBLAS_2.6.0/S9150/zgemm_64.csv    |   91 +
 doc/performance/cuBLAS_7.0/Tesla_K40/README.txt    |   35 +
 doc/performance/cuBLAS_7.0/Tesla_K40/dgemm.csv     |  181 +
 doc/performance/cuBLAS_7.0/Tesla_K40/dtrsm.csv     |   31 +
 doc/performance/cuBLAS_7.0/Tesla_K40/peak_dp.csv   |  181 +
 doc/performance/cuBLAS_7.0/Tesla_K40/peak_sp.csv   |  181 +
 doc/performance/cuBLAS_7.0/Tesla_K40/sgemm.csv     |  181 +
 doc/performance/cuBLAS_7.0/Tesla_K40/zgemm.csv     |  181 +
 src/CMakeLists.txt                                 |   83 +-
 src/FindOpenCL.cmake                               |    3 +-
 src/clBLAS.def                                     |   28 +
 src/clBLAS.h                                       |  622 ++
 src/client/clfunc_common.hpp                       |    1 +
 src/client/clfunc_xgemm.hpp                        |   53 +-
 src/client/clfunc_xtrsm.hpp                        |   14 +-
 src/client/client.cpp                              |   21 +-
 src/flags_public.txt                               |    4 +
 src/include/binary_lookup.h                        |  273 +
 src/include/devinfo.h                              |    2 +
 src/include/md5sum.h                               |   50 +
 src/include/rwlock.h                               |  117 +
 src/library/CMakeLists.txt                         |  282 +-
 src/library/bingen.cmake                           |  144 +
 src/library/blas/fill.cc                           |  272 +
 src/library/blas/functor/bonaire.cc                |   90 +
 src/library/blas/functor/functor.cc                |  117 +
 src/library/blas/functor/functor_fill.cc           |  156 +
 src/library/blas/functor/functor_selector.cc       |  344 ++
 src/library/blas/functor/functor_xgemm.cc          |  323 +
 src/library/blas/functor/functor_xscal.cc          |  410 ++
 src/library/blas/functor/functor_xscal_generic.cc  |  439 ++
 src/library/blas/functor/functor_xtrsm.cc          |  336 ++
 src/library/blas/functor/gcn_dgemm.cc              | 1035 ++++
 src/library/blas/functor/gcn_dgemmCommon.cc        |  997 +++
 src/library/blas/functor/gcn_dgemmSmallMatrices.cc |  654 ++
 src/library/blas/functor/gcn_sgemm.cc              |  556 ++
 src/library/blas/functor/gcn_sgemmSmallMatrices.cc |  558 ++
 src/library/blas/functor/gcn_zgemm.cc              |  354 ++
 src/library/blas/functor/gpu_dtrsm.cc              |  823 +++
 src/library/blas/functor/gpu_dtrsm192.cc           |  596 ++
 src/library/blas/functor/hawaii.cc                 |  223 +
 .../blas/functor/hawaii_dgemmChannelConflict.cc    |  159 +
 .../blas/functor/hawaii_dgemmSplitKernel.cc        |  670 ++
 .../blas/functor/hawaii_sgemmBranchKernel.cc       |  442 ++
 src/library/blas/functor/hawaii_sgemmSplit64_32.cc |  423 ++
 .../blas/functor/hawaii_sgemmSplitKernel.cc        |  858 +++
 src/library/blas/functor/include/BinaryBuild.h     |   10 +
 src/library/blas/functor/include/atomic_counter.h  |  173 +
 src/library/blas/functor/include/bonaire.h         |   41 +
 src/library/blas/functor/include/functor.h         |  496 ++
 src/library/blas/functor/include/functor_fill.h    |   99 +
 .../functor/include/functor_hawaii_dgemm_NT_MN48.h |  210 +
 .../blas/functor/include/functor_selector.h        |  149 +
 src/library/blas/functor/include/functor_utils.h   |  116 +
 src/library/blas/functor/include/functor_xgemm.h   |  213 +
 src/library/blas/functor/include/functor_xscal.h   |  207 +
 .../blas/functor/include/functor_xscal_generic.h   |  173 +
 src/library/blas/functor/include/functor_xtrsm.h   |  203 +
 src/library/blas/functor/include/gcn_dgemm.h       |   59 +
 src/library/blas/functor/include/gcn_dgemmCommon.h |   22 +
 .../blas/functor/include/gcn_dgemmSmallMatrices.h  |   27 +
 src/library/blas/functor/include/gcn_sgemm.h       |   62 +
 .../blas/functor/include/gcn_sgemmSmallMatrices.h  |   27 +
 src/library/blas/functor/include/gcn_zgemm.h       |   62 +
 src/library/blas/functor/include/gpu_dtrsm.h       |   28 +
 src/library/blas/functor/include/gpu_dtrsm192.h    |   28 +
 src/library/blas/functor/include/hawaii.h          |   42 +
 .../functor/include/hawaii_dgemmChannelConflict.h  |   22 +
 .../blas/functor/include/hawaii_dgemmSplitKernel.h |   46 +
 .../functor/include/hawaii_sgemmBranchKernel.h     |   50 +
 .../blas/functor/include/hawaii_sgemmSplit64_32.h  |   46 +
 .../blas/functor/include/hawaii_sgemmSplitKernel.h |   46 +
 src/library/blas/functor/include/tahiti.h          |   41 +
 src/library/blas/functor/tahiti.cc                 |  120 +
 src/library/blas/generic/binary_lookup.cc          |  685 +++
 src/library/blas/generic/common.c                  |   25 +-
 src/library/blas/generic/common2.cc                |   98 +
 src/library/blas/generic/functor_cache.cc          |   80 +
 src/library/blas/generic/solution_seq_make.c       |    4 +-
 src/library/blas/gens/blas_kgen.h                  |    3 -
 src/library/blas/gens/blas_subgroup.c              |    6 +-
 src/library/blas/gens/clTemplates/dgemm_NT_MN48.cl |  347 ++
 .../gens/clTemplates/dgemm_gcn_SmallMatrices.cl    | 1159 ++++
 src/library/blas/gens/clTemplates/dgemm_hawai.cl   | 6371 ++++++++++++++++++++
 .../clTemplates/dgemm_hawaiiChannelConfilct.cl     |  152 +
 .../gens/clTemplates/dgemm_hawaiiSplitKernel.cl    | 5043 ++++++++++++++++
 src/library/blas/gens/clTemplates/dtrsm_gpu.cl     | 2004 ++++++
 src/library/blas/gens/clTemplates/dtrsm_gpu192.cl  | 1031 ++++
 src/library/blas/gens/clTemplates/sgemm_gcn.cl     | 2083 +++++++
 .../gens/clTemplates/sgemm_gcn_SmallMatrices.cl    | 1036 ++++
 .../gens/clTemplates/sgemm_hawaiiSplit64_32.cl     |  530 ++
 .../gens/clTemplates/sgemm_hawaiiSplitKernel.cl    | 6179 +++++++++++++++++++
 src/library/blas/gens/clTemplates/zgemm_gcn.cl     |  319 +
 src/library/blas/include/clblas-internal.h         |   28 +
 src/library/blas/init.c                            |   12 +
 src/library/blas/matrix.c                          |  979 +++
 src/library/blas/xgemm.c                           |  783 ---
 src/library/blas/xgemm.cc                          |  328 +
 src/library/blas/xscal.cc                          |  340 ++
 src/library/blas/xtrsm.c                           |  249 -
 src/library/blas/xtrsm.cc                          |  333 +
 src/library/common/devinfo.c                       |    6 +
 src/library/common/md5sum.c                        |  378 ++
 src/library/common/rwlock.c                        |  172 +
 .../perf => library/tools/bingen}/CMakeLists.txt   |   23 +-
 src/library/tools/bingen/bingen.cpp                |  512 ++
 src/library/tools/ktest/CMakeLists.txt             |   34 +-
 src/library/tools/tplgen/tplgen.cpp                |   85 +-
 src/library/tools/tune/CMakeLists.txt              |   33 +-
 src/library/tools/tune/tune.c                      |    5 +-
 src/samples/CMakeLists.txt                         |   21 +-
 src/samples/example_csscal.c                       |    3 +-
 src/scripts/perf/CMakeLists.txt                    |    6 +-
 src/scripts/perf/blasPerformanceTesting.py         |    4 +-
 src/tests/CMakeLists.txt                           |   28 +-
 src/tests/correctness/test-correctness.cpp         |    3 +-
 src/tests/performance/test-performance.cpp         |    5 +-
 134 files changed, 48858 insertions(+), 1271 deletions(-)

diff --cc README.md
index 433dde5,9847dd6..0148627
--- a/README.md
+++ b/README.md
@@@ -104,108 -120,96 +119,96 @@@ The simple example below shows how to u
  
      int main( void )
      {
 -        cl_int err;
 -        cl_platform_id platform = 0;
 -        cl_device_id device = 0;
 -        cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
 -        cl_context ctx = 0;
 -        cl_command_queue queue = 0;
 -        cl_mem bufA, bufB, bufC;
 -        cl_event event = NULL;
 -        int ret = 0;
 -
 -        /* Setup OpenCL environment. */
 -        err = clGetPlatformIDs( 1, &platform, NULL );
 -        err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
 -
 -        props[1] = (cl_context_properties)platform;
 -        ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
 -        queue = clCreateCommandQueue( ctx, device, 0, &err );
 -
 -        /* Setup clBLAS */
 -        err = clblasSetup( );
 -
 -        /* Prepare OpenCL memory objects and place matrices inside them. */
 -        bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
 -                              NULL, &err );
 -        bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
 -                              NULL, &err );
 -        bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
 -                              NULL, &err );
 -
 -        err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
 -            M * K * sizeof( *A ), A, 0, NULL, NULL );
 -        err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
 -            K * N * sizeof( *B ), B, 0, NULL, NULL );
 -        err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
 -            M * N * sizeof( *C ), C, 0, NULL, NULL );
 -
 -        /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
 -        err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans, 
 -                                M, N, K,
 -                                alpha, bufA, 0, lda,
 -                                bufB, 0, ldb, beta,
 -                                bufC, 0, ldc,
 -                                1, &queue, 0, NULL, &event );
 -
 -        /* Wait for calculations to be finished. */
 -        err = clWaitForEvents( 1, &event );
 -
 -        /* Fetch results of calculations from GPU memory. */
 -        err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
 -                                    M * N * sizeof(*result),
 -                                    result, 0, NULL, NULL );
 -
 -        /* Release OpenCL memory objects. */
 -        clReleaseMemObject( bufC );
 -        clReleaseMemObject( bufB );
 -        clReleaseMemObject( bufA );
 -
 -        /* Finalize work with clBLAS */
 -        clblasTeardown( );
 -
 -        /* Release OpenCL working objects. */
 -        clReleaseCommandQueue( queue );
 -        clReleaseContext( ctx );
 -
 -        return ret;
 +    cl_int err;
 +    cl_platform_id platform = 0;
 +    cl_device_id device = 0;
 +    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
 +    cl_context ctx = 0;
 +    cl_command_queue queue = 0;
 +    cl_mem bufA, bufB, bufC;
 +    cl_event event = NULL;
 +    int ret = 0;
 +
 +    /* Setup OpenCL environment. */
 +    err = clGetPlatformIDs( 1, &platform, NULL );
 +    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
 +
 +    props[1] = (cl_context_properties)platform;
 +    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
 +    queue = clCreateCommandQueue( ctx, device, 0, &err );
 +
 +    /* Setup clBLAS */
 +    err = clblasSetup( );
 +
 +    /* Prepare OpenCL memory objects and place matrices inside them. */
 +    bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
 +                          NULL, &err );
 +    bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
 +                          NULL, &err );
 +    bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
 +                          NULL, &err );
 +
 +    err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
 +        M * K * sizeof( *A ), A, 0, NULL, NULL );
 +    err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
 +        K * N * sizeof( *B ), B, 0, NULL, NULL );
 +    err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
 +        M * N * sizeof( *C ), C, 0, NULL, NULL );
 +
 +    /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
 +    err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans, 
 +							M, N, K,
 +							alpha, bufA, 0, lda,
 +							bufB, 0, ldb, beta,
 +							bufC, 0, ldc,
 +							1, &queue, 0, NULL, &event );
 +
 +    /* Wait for calculations to be finished. */
 +    err = clWaitForEvents( 1, &event );
 +
 +    /* Fetch results of calculations from GPU memory. */
 +    err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
 +                                M * N * sizeof(*result),
 +                                result, 0, NULL, NULL );
 +
 +    /* Release OpenCL memory objects. */
 +    clReleaseMemObject( bufC );
 +    clReleaseMemObject( bufB );
 +    clReleaseMemObject( bufA );
 +
 +    /* Finalize work with clBLAS */
 +    clblasTeardown( );
 +
 +    /* Release OpenCL working objects. */
 +    clReleaseCommandQueue( queue );
 +    clReleaseContext( ctx );
 +
 +    return ret;
      }
+ ```
  
  ## Build dependencies
- 
  ### Library for Windows
- 
- -   Windows® 7/8
- 
- -   Visual Studio 2010 SP1, 2012
- 
- -   An OpenCL SDK, such as APP SDK 2.9
- 
- -   Latest CMake
+ *  Windows® 7/8
+ *  Visual Studio 2010 SP1, 2012
+ *  An OpenCL SDK, such as APP SDK 2.8
+ *  Latest CMake
  
  ### Library for Linux
- 
- -   GCC 4.6 and onwards
- 
- -   An OpenCL SDK, such as APP SDK 2.9
- 
- -   Latest CMake
+ *  GCC 4.6 and onwards
+ *  An OpenCL SDK, such as APP SDK 2.9
+ *  Latest CMake
  
  ### Library for Mac OSX
- 
- -   Recommended to generate Unix makefiles with cmake
+ *  Recommended to generate Unix makefiles with cmake
  
  ### Test infrastructure
- 
- -   Googletest v1.6
- 
- -   ACML on windows/linux; Accelerate on Mac OSX
- 
- -   Latest Boost
+ *  Googletest v1.6
+ *  ACML on windows/linux; Accelerate on Mac OSX
+ *  Latest Boost
  
  ### Performance infrastructure
- 
- -   Python
+ * Python
  
    [Library and API documentation]: http://clmathlibraries.github.io/clBLAS/
    [clmath at googlegroups.com]: https://groups.google.com/forum/#!forum/clmath
diff --cc src/CMakeLists.txt
index 389a9a2,146ac7a..d4ee66a
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@@ -51,7 -105,7 +105,7 @@@ if( NOT DEFINED clBLAS_VERSION_MAJOR 
  endif( )
  
  if( NOT DEFINED clBLAS_VERSION_MINOR )
-     set( clBLAS_VERSION_MINOR 4 )
 -    set( clBLAS_VERSION_MINOR 5 )
++    set( clBLAS_VERSION_MINOR 6 )
  endif( )
  
  if( NOT DEFINED clBLAS_VERSION_PATCH )
@@@ -121,14 -189,14 +189,14 @@@ endif(
  # TODO: maybe this could be written using the FindBLAS module in the future
  if( BUILD_TEST )
  	if(NOT CORR_TEST_WITH_ACML)
- 	        if(APPLE)
- 			find_library(BLAS_LIBRARIES Accelerate)
+ 	    if(APPLE)
+ 			find_library(BLAS_LIBRARIES Accelerate HINTS /System/Library/Frameworks/Accelerate.framework)
  		       	MARK_AS_ADVANCED(BLAS_LIBRARIES)
  		       	message(STATUS "Using Accelerate framework on Mac OS-X")
- 	       	else()
+ 	    else()
  			find_package( Netlib COMPONENTS BLAS REQUIRED )
-               	endif()
+         endif()
 -	else( )
 +		else( )
  		# Find ACML BLAS implementation
  		# platform dependent ACML subdirectory
  		if (WIN32)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git



More information about the debian-science-commits mailing list