[clfft] 51/128: Precallback - performance callback client first version
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:37 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 7abe254060c777fe555b13964e425a34178c287f
Author: Pradeep <pradeep.rao at amd.com>
Date: Tue Sep 1 15:15:15 2015 +0530
Precallback - performance callback client first version
---
src/CMakeLists.txt | 4 +-
src/callback-client/CMakeLists.txt | 62 ++++
src/callback-client/callback-client.cpp | 638 ++++++++++++++++++++++++++++++++
src/callback-client/client.h | 70 ++++
src/callback-client/openCL.misc.cpp | 533 ++++++++++++++++++++++++++
src/callback-client/openCL.misc.h | 151 ++++++++
src/callback-client/stdafx.cpp | 25 ++
7 files changed, 1481 insertions(+), 2 deletions(-)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5fa8836..c2e7b4b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -292,8 +292,8 @@ else( )
endif( )
# We only want to build the following if the user options are set
-if( FFT_CALLBACK_CLIENT AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/client-callback" )
- add_subdirectory( client-callback )
+if( FFT_CALLBACK_CLIENT AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/callback-client" )
+ add_subdirectory( callback-client )
else( )
message( "FFT callback client will NOT be built" )
endif( )
diff --git a/src/callback-client/CMakeLists.txt b/src/callback-client/CMakeLists.txt
new file mode 100644
index 0000000..dc63b8e
--- /dev/null
+++ b/src/callback-client/CMakeLists.txt
@@ -0,0 +1,62 @@
+# ########################################################################
+# Copyright 2015 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+
+# client
+set( Client.Source callback-client.cpp
+ openCL.misc.cpp
+ stdafx.cpp )
+
+set( Client.Headers client.h
+ openCL.misc.h
+ ../statTimer/statisticalTimer.extern.h
+ ../include/unicode.compatibility.h
+ ../include/stdafx.h
+ ../include/targetver.h
+ ../include/clFFT.h )
+
+set( Client.Files ${Client.Source} ${Client.Headers} )
+
+set( DL_LIB "" )
+if( WIN32 )
+ add_definitions( "/D_CONSOLE" )
+elseif( APPLE )
+ set( CMAKE_CXX_FLAGS "-std=c++11 -stdlib=libc++ ${CMAKE_CXX_FLAGS}" )
+else( )
+ # To use the dlopen() and dlclose() functions, we should link with libdl
+ set( DL_LIB "-ldl -lrt" )
+endif( )
+
+# Include standard OpenCL headers
+include_directories( ${Boost_INCLUDE_DIRS} ${OPENCL_INCLUDE_DIRS} ${FFTW_INCLUDE_DIRS} ../../../common ${PROJECT_BINARY_DIR}/include ../include )
+
+add_executable( clFFT-callback-client ${Client.Files} )
+
+target_link_libraries( clFFT-callback-client clFFT ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} ${FFTW_LIBRARIES} ${DL_LIB} )
+
+set_target_properties( clFFT-callback-client PROPERTIES VERSION ${CLFFT_VERSION} )
+set_target_properties( clFFT-callback-client PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
+if( APPLE )
+ # properly deal with RPATH on mac
+ set_target_properties( clFFT-callback-client PROPERTIES INSTALL_RPATH "@loader_path/../lib${SUFFIX_LIB}")
+endif()
+
+# CPack configuration; include the executable into the package
+install( TARGETS clFFT-callback-client
+ RUNTIME DESTINATION bin${SUFFIX_BIN}
+ LIBRARY DESTINATION lib${SUFFIX_LIB}
+ ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+ )
diff --git a/src/callback-client/callback-client.cpp b/src/callback-client/callback-client.cpp
new file mode 100644
index 0000000..f4cc982
--- /dev/null
+++ b/src/callback-client/callback-client.cpp
@@ -0,0 +1,638 @@
+#include "stdafx.h"
+#include <functional>
+#include <cmath>
+
+#include "client.h"
+#include "../library/private.h"
+#include "openCL.misc.h"
+#include "../statTimer/statisticalTimer.extern.h"
+#include "../include/sharedLibrary.h"
+#include "../include/unicode.compatibility.h"
+
+#include <fftw3.h>
+
+namespace po = boost::program_options;
+
+enum FFTType
+{
+ FFT_C2C,
+ FFT_R2C,
+ FFT_C2R,
+};
+
+#define ZERO_PAD_C2C __attribute__((always_inline)) \n float2 zeroPad (__global void *input, \n \
+ uint inoffset, \n \
+ __global void *userdata) \n \
+ { \n \
+ float2 scalar = 0.0f; \n \
+ if (inoffset < 512) \n \
+ { \n \
+ scalar = *((__global float2*)userdata + inoffset); \n \
+ } \n \
+ return scalar; \n \
+ } \n
+
+#define ZERO_PAD_C2C_KERNEL __kernel void zeroPad (__global void *input, \n \
+ __global void *userdata, uint batchLength) \n \
+ { \n \
+ uint inoffset = get_global_id(0); \n \
+ float2 scalar = 0.0f; \n \
+ if (inoffset < 512) \n \
+ { \n \
+ scalar = *((__global float2*)userdata + inoffset); \n \
+ } \n \
+ *((__global float2*)input + inoffset) = scalar; \n \
+ } \n
+
+//forward declarations
+
+template < typename T >
+void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize,
+ clfftDim dim, clfftPrecision precision, cl_uint profile_count);
+
+template < typename T >
+void R2C_transform();
+
+template < typename T >
+void C2R_transform();
+
+fftwf_complex* get_fftwf_output(size_t* lengths, size_t fftBatchSize, int batch_size, clfftLayout in_layout,
+ clfftDim dim, clfftDirection dir);
+template < typename T1, typename T2>
+bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
+ size_t length, const float epsilon);
+
+int main(int argc, char **argv)
+{
+ size_t lengths[ 3 ] = {1,1,1};
+ cl_uint profile_count = 0;
+ clfftPrecision precision = CLFFT_SINGLE;
+
+ size_t batchSize = 1; //For simplicity using fixed batch size
+
+ int fftType = 1;
+ FFTType clFFTType = FFT_C2C;
+
+ // Initialize flags for FFT library
+ std::auto_ptr< clfftSetupData > setupData( new clfftSetupData );
+ OPENCL_V_THROW( clfftInitSetupData( setupData.get( ) ),
+ "clfftInitSetupData failed" );
+
+ try
+ {
+ // Declare the supported options.
+ po::options_description desc( "clFFT client command line options" );
+ desc.add_options()
+ ( "help,h", "produces this help message" )
+ ( "dumpKernels,d", "FFT engine will dump generated OpenCL FFT kernels to disk (default: dump off)" )
+ ( "lenX,x", po::value< size_t >( &lengths[ 0 ] )->default_value( 1024 ), "Specify the length of the 1st dimension of a test array" )
+ ( "lenY,y", po::value< size_t >( &lengths[ 1 ] )->default_value( 1 ), "Specify the length of the 2nd dimension of a test array" )
+ ( "lenZ,z", po::value< size_t >( &lengths[ 2 ] )->default_value( 1 ), "Specify the length of the 3rd dimension of a test array" )
+ ( "profile,p", po::value< cl_uint >( &profile_count )->default_value( 10 ), "Time and report the kernel speed of the FFT (default: profiling off)" )
+ ( "type,t", po::value< int >( &fftType )->default_value( 1 ), "Type of FFT:\n1) Complex-Complex\n2) Real-Complex\n3) Complex-Real\n" )
+ ;
+
+ po::variables_map vm;
+ po::store( po::parse_command_line( argc, argv, desc ), vm );
+ po::notify( vm );
+
+ if( vm.count( "help" ) )
+ {
+ std::cout << desc << std::endl;
+ return 0;
+ }
+
+ if( vm.count( "dumpKernels" ) )
+ {
+ setupData->debugFlags |= CLFFT_DUMP_PROGRAMS;
+ }
+
+ switch (fftType)
+ {
+ case 1: clFFTType = FFT_C2C; break;
+ case 2: clFFTType = FFT_R2C; break;
+ case 3: clFFTType = FFT_C2R; break;
+ default:
+ throw std::runtime_error( "Invalid FFT type" );
+ break;
+ }
+
+ clfftDim dim = CLFFT_1D;
+ if( lengths[ 1 ] > 1 )
+ {
+ dim = CLFFT_2D;
+ }
+ if( lengths[ 2 ] > 1 )
+ {
+ dim = CLFFT_3D;
+ }
+
+ if( clFFTType == FFT_C2C) // Complex-Complex cases
+ {
+ C2C_transform<float>(setupData, lengths, batchSize, dim, precision, profile_count);
+ }
+ else if (clFFTType == FFT_R2C) // Complex-Complex cases
+ {
+ R2C_transform<float>();
+ }
+ else // Complex-Complex cases
+ {
+ C2R_transform<float>();
+ }
+ }
+ catch( std::exception& e )
+ {
+ terr << _T( "clFFT error condition reported:" ) << std::endl << e.what() << std::endl;
+ return 1;
+ }
+ return 0;
+}
+
+template < typename T >
+void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize,
+ clfftDim dim, clfftPrecision precision, cl_uint profile_count)
+{
+ // OpenCL state
+ cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+ cl_int deviceId = 0;
+ std::vector< cl_device_id > device_id;
+ cl_int platformId = 0;
+ cl_context context;
+ cl_uint command_queue_flags = 0;
+ command_queue_flags |= CL_QUEUE_PROFILING_ENABLE;
+
+ size_t vectorLength = inlengths[0] * inlengths[1] * inlengths[2];
+ size_t fftLength = vectorLength * batchSize;
+
+ //OpenCL initializations
+ device_id = initializeCL( deviceType, deviceId, platformId, context, false);
+
+ cl_int status = 0;
+
+ cl_command_queue commandQueue = ::clCreateCommandQueue( context, device_id[0], command_queue_flags, &status );
+ OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" );
+
+ //Run clFFT with seaparate Pre-process Kernel
+ runPreprocessKernelFFT<T>(setupData, context, commandQueue, device_id[0], inlengths, dim, precision, batchSize, vectorLength, fftLength, profile_count);
+
+ //Run clFFT using pre-callback
+ runPrecallbackFFT<T>(setupData, context, commandQueue, inlengths, dim, precision, batchSize, vectorLength, fftLength, profile_count);
+
+ OPENCL_V_THROW( clReleaseCommandQueue( commandQueue ), "Error: In clReleaseCommandQueue\n" );
+ OPENCL_V_THROW( clReleaseContext( context ), "Error: In clReleaseContext\n" );
+}
+
+template < typename T >
+void R2C_transform()
+{
+}
+
+template < typename T >
+void C2R_transform()
+{
+}
+
+// Compute reference output using fftw for float type
+fftwf_complex* get_fftwf_output(size_t* lengths, size_t fftbatchLength, int batch_size, clfftLayout in_layout,
+ clfftDim dim, clfftDirection dir)
+{
+ //In FFTW last dimension has the fastest changing index
+ int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
+
+ fftwf_plan refPlan;
+
+ fftwf_complex *refin = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftbatchLength);
+ fftwf_complex *refout = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftbatchLength);
+
+ size_t fftVectorLength = fftbatchLength/batch_size;
+
+ refPlan = fftwf_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size,
+ refin, &fftwLengths[3 - dim], 1, fftVectorLength,
+ refout, &fftwLengths[3 - dim], 1, fftVectorLength,
+ dir, FFTW_ESTIMATE);
+
+ float scalar;
+
+ for( size_t i = 0; i < fftbatchLength; i++)
+ {
+ scalar = 0.0f;
+ switch (in_layout)
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ if ( (i % fftVectorLength) < 512)
+ {
+ scalar = 1.0f;
+ }
+ break;
+ default:
+ break;
+ }
+
+ refin[i][0] = scalar;
+ refin[i][1] = 0;
+ }
+
+ fftwf_execute(refPlan);
+
+ fftw_free(refin);
+
+ fftwf_destroy_plan(refPlan);
+
+ return refout;
+}
+
+template < typename T >
+void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
+ size_t* inlengths, clfftDim dim, clfftPrecision precision,
+ size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
+{
+ cl_int status = 0;
+
+ size_t userdataLengths[ 3 ] = {512,1,1};
+ size_t vectorLength_userdata = userdataLengths[0] * userdataLengths[1] * userdataLengths[2];
+ size_t userdataLength = vectorLength_userdata * batchSize;
+
+ //input/output allocation sizes
+ size_t size_of_buffers = fftLength * sizeof( std::complex< T > );
+ size_t size_of_buffers_userdata = userdataLength * sizeof( std::complex< T > );
+
+ //in-place transform. Same buffer for input and output
+ cl_mem fftbuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE, size_of_buffers, NULL, &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(buffer) )" );
+
+ //Initialize Data
+ std::vector< std::complex< T > > userdata( userdataLength );
+
+ // impulse test case
+ std::complex< T > impulsedata(1,0);
+ for (size_t idx = 0; idx < userdataLength; ++idx)
+ {
+ userdata[idx] = impulsedata;
+ }
+
+ //user data buffer
+ cl_mem userDatabuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size_of_buffers_userdata, &userdata[0], &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(userDatabuffer) )" );
+
+ //clFFT initializations
+
+ // FFT state
+ clfftResultLocation place = CLFFT_INPLACE;
+ clfftLayout inLayout = CLFFT_COMPLEX_INTERLEAVED;
+ clfftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED;
+ clfftDirection dir = CLFFT_FORWARD;
+
+ clfftPlanHandle plan_handle;
+ OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
+ OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, inlengths ), "clfftCreateDefaultPlan failed" );
+
+ //Precallback setup
+ char* precallbackstr = STRINGIFY(ZERO_PAD_C2C);
+
+ //Register the callback
+ OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "zeroPad", precallbackstr, NULL, 0, PRECALLBACK, userDatabuffer), "clFFTSetPlanCallback failed");
+
+ // Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
+ OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
+ OPENCL_V_THROW( clfftSetLayout( plan_handle, inLayout, outLayout ), "clfftSetLayout failed" );
+ OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batchSize ), "clfftSetPlanBatchSize failed" );
+ OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
+
+ //Bake Plan
+ OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &commandQueue, NULL, NULL ), "clfftBakePlan failed" );
+
+ //get the buffersize
+ size_t buffersize=0;
+ OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" );
+
+ //allocate the intermediate buffer
+ cl_mem clMedBuffer=NULL;
+
+ if (buffersize)
+ {
+ cl_int medstatus;
+ clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
+ OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
+ }
+
+ cl_mem * buffersOut = NULL; //NULL for in-place
+
+ Timer tr;
+ double wtime_acc = 0.0;
+
+ // Loop as many times as the user specifies to average out the timings
+ for( cl_uint i = 0; i < profile_count; ++i )
+ {
+ tr.Start();
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
+ &fftbuffer, buffersOut, clMedBuffer ),
+ "clfftEnqueueTransform failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+
+ wtime_acc += tr.Sample();
+
+ //Ignore the first time sample if profiling for more than one iteration
+ if (i == 0 && profile_count > 1) wtime_acc = 0.0;
+ }
+ double wtime = wtime_acc/((double)profile_count);
+ size_t totalLen = 1;
+ for(int i=0; i<dim; i++) totalLen *= inlengths[i];
+ double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+ tout << "\nExecution wall time (with clFFT Pre-callback): " << 1000.0*wtime << " ms" << std::endl;
+ tout << "Execution gflops (with clFFT Pre-callback): " << ((double)batchSize * opsconst)/(1000000000.0*wtime) << std::endl;
+
+ if(clMedBuffer) clReleaseMemObject(clMedBuffer);
+
+ if (profile_count == 1)
+ {
+ std::vector< std::complex< T > > output( fftLength );
+
+ OPENCL_V_THROW( clEnqueueReadBuffer( commandQueue, fftbuffer, CL_TRUE, 0, size_of_buffers, &output[ 0 ],
+ 0, NULL, NULL ), "Reading the result buffer failed" );
+
+ //Reference fftw output
+ fftwf_complex *refout;
+
+ refout = get_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
+
+ /*for( cl_uint i = 0; i < fftLength; i++)
+ {
+ std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+ }*/
+ if (!compare<fftwf_complex, T>(refout, output, fftLength))
+ {
+ std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
+ }
+ else
+ {
+ std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
+ }
+
+ fftwf_free(refout);
+ }
+
+ OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
+ OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );
+
+ //cleanup
+ OPENCL_V_THROW( clReleaseMemObject( fftbuffer ), "Error: In clReleaseMemObject\n" );
+ OPENCL_V_THROW( clReleaseMemObject( userDatabuffer ), "Error: In clReleaseMemObject\n" );
+}
+
+template < typename T >
+void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context,
+ cl_command_queue commandQueue, cl_device_id device_id,
+ size_t* inlengths, clfftDim dim, clfftPrecision precision,
+ size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
+{
+ cl_int status = 0;
+
+ size_t userdataLengths[ 3 ] = {512,1,1};
+ size_t vectorLength_userdata = userdataLengths[0] * userdataLengths[1] * userdataLengths[2];
+ size_t userdataLength = vectorLength_userdata * batchSize;
+
+ //input/output allocation sizes
+ size_t size_of_buffers = fftLength * sizeof( std::complex< T > );
+ size_t size_of_buffers_userdata = userdataLength * sizeof( std::complex< T > );
+
+ //in-place transform. Same buffer for input and output
+ cl_mem fftbuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE, size_of_buffers, NULL, &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(buffer) )" );
+
+ //Initialize Data
+ std::vector< std::complex< T > > userdata( userdataLength );
+
+ // impulse test case
+ std::complex< T > impulsedata(1,0);
+ for (size_t idx = 0; idx < userdataLength; ++idx)
+ {
+ userdata[idx] = impulsedata;
+ }
+
+ //user data buffer
+ cl_mem userdatabuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size_of_buffers_userdata, &userdata[0], &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(userdatabuffer) )" );
+
+ //clFFT initializations
+
+ // FFT state
+ clfftResultLocation place = CLFFT_INPLACE;
+ clfftLayout inLayout = CLFFT_COMPLEX_INTERLEAVED;
+ clfftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED;
+ clfftDirection dir = CLFFT_FORWARD;
+
+ clfftPlanHandle plan_handle;
+ OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
+ OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, inlengths ), "clfftCreateDefaultPlan failed" );
+
+ // Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
+ OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
+ OPENCL_V_THROW( clfftSetLayout( plan_handle, inLayout, outLayout ), "clfftSetLayout failed" );
+ OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batchSize ), "clfftSetPlanBatchSize failed" );
+ OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
+
+ //Bake Plan
+ OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &commandQueue, NULL, NULL ), "clfftBakePlan failed" );
+
+ //get the buffersize
+ size_t buffersize=0;
+ OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" );
+
+ //allocate the intermediate buffer
+ cl_mem clMedBuffer=NULL;
+
+ if (buffersize)
+ {
+ cl_int medstatus;
+ clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
+ OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
+ }
+
+ cl_mem * buffersOut = NULL; //NULL for in-place
+
+ //Pre-process kernel string
+ const char* source = STRINGIFY(ZERO_PAD_C2C_KERNEL);
+
+ double wtime_acc = 0.0;
+ Timer tr;
+
+ // Loop as many times as the user specifies to average out the timings
+ for( cl_uint i = 0; i < profile_count; ++i )
+ {
+ tr.Start();
+ cl_program program = clCreateProgramWithSource( context, 1, &source, NULL, &status );
+ OPENCL_V_THROW( status, "clCreateProgramWithSource failed." );
+
+ status = clBuildProgram( program, 1, &device_id, NULL, NULL, NULL);
+ OPENCL_V_THROW( status, "clBuildProgram failed" );
+
+#if defined( _DEBUG )
+ if( status != CL_SUCCESS )
+ {
+ if( status == CL_BUILD_PROGRAM_FAILURE )
+ {
+ size_t buildLogSize = 0;
+ OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
+ "clGetProgramBuildInfo failed" );
+
+ std::vector< char > buildLog( buildLogSize );
+ ::memset( &buildLog[ 0 ], 0x0, buildLogSize );
+
+ OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
+ "clGetProgramBuildInfo failed" );
+
+ std::cerr << "\n\t\t\tBUILD LOG\n";
+ std::cerr << "************************************************\n";
+ std::cerr << &buildLog[ 0 ] << std::endl;
+ std::cerr << "************************************************\n";
+ }
+
+ OPENCL_V_THROW( status, "clBuildProgram failed" );
+ }
+#endif
+
+ cl_kernel kernel = clCreateKernel( program, "zeroPad", &status );
+ OPENCL_V_THROW( status, "clCreateKernel failed" );
+
+ cl_uint uarg = 0;
+
+ //Buffer to be zero-padded
+ OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
+
+ //originial data
+ OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
+
+ //single batch length
+ OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_uint ), (const void *)&vectorLength ), "clSetKernelArg failed" );
+
+ //Launch pre-process kernel
+ size_t gSize = fftLength;
+ size_t lSize = 64;
+ status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
+ NULL, &gSize, &lSize, 0, NULL, NULL );
+ OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+
+ //Now invoke the clfft execute
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
+ &fftbuffer, buffersOut, clMedBuffer ),
+ "clfftEnqueueTransform failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+
+ wtime_acc += tr.Sample();
+
+ //Ignore the first time sample if profiling for more than one iteration
+ if (i == 0 && profile_count > 1) wtime_acc = 0.0;
+
+ //cleanup preprocess kernel opencl objects
+ OPENCL_V_THROW( clReleaseProgram( program ), "Error: In clReleaseProgram\n" );
+ OPENCL_V_THROW( clReleaseKernel( kernel ), "Error: In clReleaseKernel\n" );
+ }
+
+ double wtime = wtime_acc/((double)profile_count);
+ size_t totalLen = 1;
+ for(int i=0; i<dim; i++) totalLen *= inlengths[i];
+ double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+ tout << "\nExecution wall time (Separate Pre-process Kernel): " << 1000.0*wtime << " ms" << std::endl;
+ tout << "Execution gflops (Separate Pre-process Kernel): " << ((double)batchSize * opsconst)/(1000000000.0*wtime) << std::endl;
+
+ if(clMedBuffer) clReleaseMemObject(clMedBuffer);
+
+ if (profile_count == 1)
+ {
+ std::vector< std::complex< T > > output( fftLength );
+
+ OPENCL_V_THROW( clEnqueueReadBuffer( commandQueue, fftbuffer, CL_TRUE, 0, size_of_buffers, &output[ 0 ],
+ 0, NULL, NULL ), "Reading the result buffer failed" );
+
+ //Reference fftw output
+ fftwf_complex *refout;
+
+ refout = get_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
+
+ /*for( cl_uint i = 0; i < fftLength; i++)
+ {
+ std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+ }*/
+ if (!compare<fftwf_complex, T>(refout, output, fftLength))
+ {
+ std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
+ }
+ else
+ {
+ std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
+ }
+
+ fftwf_free(refout);
+ }
+
+ OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
+ OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );
+
+ //cleanup
+ OPENCL_V_THROW( clReleaseMemObject( fftbuffer ), "Error: In clReleaseMemObject\n" );
+ OPENCL_V_THROW( clReleaseMemObject( userdatabuffer ), "Error: In clReleaseMemObject\n" );
+}
+
+//Compare reference and opencl output
+template < typename T1, typename T2>
+bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
+ size_t length, const float epsilon = 1e-6f)
+{
+ float error = 0.0f;
+ T1 ref;
+ T1 diff;
+ float normRef = 0.0f;
+ float normError = 0.0f;
+
+ for(size_t i = 0; i < length; ++i)
+ {
+ diff[0] = refData[i][0] - data[i].real();
+ error += (float)(diff[0] * diff[0]);
+ ref[0] += refData[i][0] * refData[i][0];
+ }
+ if (error != 0)
+ {
+ normRef =::sqrtf((float) ref[0]);
+ if (::fabs((float) ref[0]) < 1e-7f)
+ {
+ return false;
+ }
+ normError = ::sqrtf((float) error);
+ error = normError / normRef;
+
+ if (error > epsilon)
+ return false;
+ }
+
+ //imag
+ error = 0.0f;
+ ref[1] = 0.0;
+ for(size_t i = 0; i < length; ++i)
+ {
+ diff[1] = refData[i][1] - data[i].imag();
+ error += (float)(diff[1] * diff[1]);
+ ref[1] += refData[i][1] * refData[i][1];
+ }
+
+ if (error == 0)
+ return true;
+
+ normRef =::sqrtf((float) ref[1]);
+ if (::fabs((float) ref[1]) < 1e-7f)
+ {
+ return false;
+ }
+ normError = ::sqrtf((float) error);
+ error = normError / normRef;
+
+ if (error > epsilon)
+ return false;
+
+ return true;
+}
diff --git a/src/callback-client/client.h b/src/callback-client/client.h
new file mode 100644
index 0000000..a1e100d
--- /dev/null
+++ b/src/callback-client/client.h
@@ -0,0 +1,70 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( CLIENT_H )
+#define CLIENT_H
+
+// Boost headers that we want to use
+// #define BOOST_PROGRAM_OPTIONS_DYN_LINK
+#include <boost/program_options.hpp>
+
+#define CALLBCKSTR(...) #__VA_ARGS__
+#define STRINGIFY(...) CALLBCKSTR(__VA_ARGS__)
+
+#ifdef WIN32
+
+struct Timer
+{
+ LARGE_INTEGER start, stop, freq;
+
+public:
+ Timer() { QueryPerformanceFrequency( &freq ); }
+
+ void Start() { QueryPerformanceCounter(&start); }
+ double Sample()
+ {
+ QueryPerformanceCounter ( &stop );
+ double time = (double)(stop.QuadPart-start.QuadPart) / (double)(freq.QuadPart);
+ return time;
+ }
+};
+
+#else
+
+#include <time.h>
+#include <math.h>
+
+struct Timer
+{
+ struct timespec start, end;
+
+public:
+ Timer() { }
+
+ void Start() { clock_gettime(CLOCK_MONOTONIC, &start); }
+ double Sample()
+ {
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ double time = 1000000000L * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec;
+ return time * 1E-9;
+ }
+};
+
+#endif
+
+#endif
diff --git a/src/callback-client/openCL.misc.cpp b/src/callback-client/openCL.misc.cpp
new file mode 100644
index 0000000..e406d71
--- /dev/null
+++ b/src/callback-client/openCL.misc.cpp
@@ -0,0 +1,533 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// clfft.opencl.cpp : Provides functions to set up openCL
+//
+
+#include "stdafx.h"
+#include <stdexcept>
+#include <iomanip>
+#include <sstream>
+#include <cstring>
+#include <vector>
+#include "clFFT.h"
+#include "openCL.misc.h"
+
+
+
+void prettyPrintPlatformInfo( const cl_platform_id& pId )
+{
+ size_t platformProfileSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, 0, NULL, &platformProfileSize ),
+ "Getting CL_PLATFORM_PROFILE Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformProfile( platformProfileSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, platformProfileSize, &szPlatformProfile[ 0 ], NULL),
+ "Getting CL_PLATFORM_PROFILE Platform Info string ( ::clGetPlatformInfo() )" );
+
+ size_t platformVersionSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, 0, NULL, &platformVersionSize ),
+ "Getting CL_PLATFORM_VERSION Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformVersion( platformVersionSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, platformVersionSize, &szPlatformVersion[ 0 ], NULL),
+ "Getting CL_PLATFORM_VERSION Platform Info string ( ::clGetPlatformInfo() )" );
+
+ size_t platformNameSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, 0, NULL, &platformNameSize ),
+ "Getting CL_PLATFORM_NAME Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformName( platformNameSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, platformNameSize, &szPlatformName[ 0 ], NULL),
+ "Getting CL_PLATFORM_NAME Platform Info string ( ::clGetPlatformInfo() )" );
+
+ size_t vendorStringSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, 0, NULL, &vendorStringSize ),
+ "Getting CL_PLATFORM_VENDOR Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformVendor( vendorStringSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, vendorStringSize, &szPlatformVendor[ 0 ], NULL),
+ "Getting CL_PLATFORM_VENDOR Platform Info string ( ::clGetPlatformInfo() )" );
+
+ size_t platformExtensionsSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, 0, NULL, &platformExtensionsSize ),
+ "Getting CL_PLATFORM_EXTENSIONS Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformExtensions( platformExtensionsSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, platformExtensionsSize, &szPlatformExtensions[ 0 ], NULL),
+ "Getting CL_PLATFORM_EXTENSIONS Platform Info string ( ::clGetPlatformInfo() )" );
+
+ const int indent = countOf( " CL_PLATFORM_EXTENSIONS: " );
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_PROFILE: " << &szPlatformProfile[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_VERSION: " << &szPlatformVersion[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_NAME: " << &szPlatformName[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_VENDOR: " << &szPlatformVendor[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_EXTENSIONS: " << &szPlatformExtensions[ 0 ] << std::endl;
+ std::cout << std::right << std::endl;
+}
+
+void prettyPrintDeviceInfo( const cl_device_id& dId )
+{
+ size_t deviceNameSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, 0, NULL, &deviceNameSize ),
+ "Getting CL_DEVICE_NAME Platform Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szDeviceName( deviceNameSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, deviceNameSize, &szDeviceName[ 0 ], NULL ),
+ "Getting CL_DEVICE_NAME Platform Info string ( ::clGetDeviceInfo() )" );
+
+ size_t deviceVersionSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
+ "Getting CL_DEVICE_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szDeviceVersion( deviceVersionSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
+ "Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+ size_t driverVersionSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, 0, NULL, &driverVersionSize ),
+ "Getting CL_DRIVER_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szDriverVersion( driverVersionSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, driverVersionSize, &szDriverVersion[ 0 ], NULL ),
+ "Getting CL_DRIVER_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+ size_t openCLVersionSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &openCLVersionSize ),
+ "Getting CL_DEVICE_OPENCL_C_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szOpenCLVersion( openCLVersionSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, openCLVersionSize, &szOpenCLVersion[ 0 ], NULL ),
+ "Getting CL_DEVICE_OPENCL_C_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+ cl_device_type devType = CL_DEVICE_TYPE_DEFAULT;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_TYPE, sizeof( cl_device_type ), &devType, NULL ),
+ "Getting CL_DEVICE_TYPE device info ( ::clGetDeviceInfo() )" );
+
+ cl_uint devAddrBits = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_ADDRESS_BITS, sizeof( cl_uint ), &devAddrBits, NULL ),
+ "Getting CL_DEVICE_ADDRESS_BITS device info ( ::clGetDeviceInfo() )" );
+
+ cl_uint maxClockFreq = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof( cl_uint ), &maxClockFreq, NULL ),
+ "Getting CL_DEVICE_MAX_CLOCK_FREQUENCY device info ( ::clGetDeviceInfo() )" );
+
+ cl_bool devAvailable = CL_FALSE;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_AVAILABLE, sizeof( cl_bool ), &devAvailable, NULL ),
+ "Getting CL_DEVICE_AVAILABLE device info ( ::clGetDeviceInfo() )" );
+
+ cl_bool devCompAvailable = CL_FALSE;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_COMPILER_AVAILABLE, sizeof( cl_bool ), &devCompAvailable, NULL ),
+ "Getting CL_DEVICE_COMPILER_AVAILABLE device info ( ::clGetDeviceInfo() )" );
+
+ size_t devMaxWorkGroup = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( size_t ), &devMaxWorkGroup, NULL ),
+ "Getting CL_DEVICE_MAX_WORK_GROUP_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ cl_uint devMaxWorkItemDim = CL_FALSE;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( cl_uint ), &devMaxWorkItemDim, NULL ),
+ "Getting CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS device info ( ::clGetDeviceInfo() )" );
+
+ std::vector< size_t > devMaxWorkItemSizes( devMaxWorkItemDim );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof( size_t )*devMaxWorkItemSizes.size( ), &devMaxWorkItemSizes[0], NULL),
+ "Getting CL_DEVICE_MAX_WORK_ITEM_SIZES device info ( ::clGetDeviceInfo() )" );
+
+ cl_bool deviceHostUnified = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof( cl_bool ), &deviceHostUnified, NULL ),
+ "Getting CL_DEVICE_HOST_UNIFIED_MEMORY Platform Info string ( ::clGetDeviceInfo() )" );
+
+ cl_ulong devMaxConstantBuffer = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( cl_ulong ), &devMaxConstantBuffer, NULL ),
+ "Getting CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ cl_ulong devLocalMemSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( cl_ulong ), &devLocalMemSize, NULL ),
+ "Getting CL_DEVICE_LOCAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ cl_ulong deviceGlobalMemSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( cl_ulong ), &deviceGlobalMemSize, NULL ),
+ "Getting CL_DEVICE_GLOBAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ cl_ulong deviceMaxMemAllocSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( cl_ulong ), &deviceMaxMemAllocSize, NULL ),
+ "Getting CL_DEVICE_MAX_MEM_ALLOC_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ size_t deviceExtSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
+ "Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szDeviceExt( deviceExtSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
+ "Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
+
+ const int indent = countOf( " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " );
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_NAME: " << &szDeviceName[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_VERSION: " << &szDeviceVersion[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DRIVER_VERSION: " << &szDriverVersion[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_TYPE: "
+ << (CL_DEVICE_TYPE_DEFAULT & devType ? "default" : "")
+ << (CL_DEVICE_TYPE_CPU & devType ? "CPU" : "")
+ << (CL_DEVICE_TYPE_GPU & devType ? "GPU" : "")
+ << (CL_DEVICE_TYPE_ACCELERATOR & devType ? "Accelerator" : "")
+ << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_CLOCK_FREQUENCY: " << maxClockFreq << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_ADDRESS_BITS: " << devAddrBits << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_AVAILABLE: " << ( devAvailable ? "TRUE": "FALSE") << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_COMPILER_AVAILABLE: " << ( devCompAvailable ? "TRUE": "FALSE") << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_OPENCL_C_VERSION: " << &szOpenCLVersion[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_WORK_GROUP_SIZE: " << devMaxWorkGroup << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " << devMaxWorkItemDim << std::endl;
+ for( cl_uint wis = 0; wis < devMaxWorkItemSizes.size( ); ++wis )
+ {
+ std::stringstream dimString;
+ dimString << "Dimension[ " << wis << " ] ";
+ std::cout << std::right << std::setw( indent ) << dimString.str( ) << devMaxWorkItemSizes[wis] << std::endl;
+ }
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_HOST_UNIFIED_MEMORY: " << ( deviceHostUnified ? "TRUE": "FALSE") << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: " << devMaxConstantBuffer;
+ std::cout << " ( " << devMaxConstantBuffer / 1024 << " KB )" << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_LOCAL_MEM_SIZE: " << devLocalMemSize;
+ std::cout << " ( " << devLocalMemSize / 1024 << " KB )" << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_GLOBAL_MEM_SIZE: " << deviceGlobalMemSize;
+ std::cout << " ( " << deviceGlobalMemSize / 1048576 << " MB )" << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_MEM_ALLOC_SIZE: " << deviceMaxMemAllocSize;
+ std::cout << " ( " << deviceMaxMemAllocSize / 1048576 << " MB )" << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_EXTENSIONS: " << &szDeviceExt[ 0 ] << std::endl;
+
+ std::cout << std::right << std::endl;
+}
+
+void prettyPrintCLPlatforms(std::vector< cl_platform_id >& platforms,
+ std::vector< std::vector< cl_device_id > >& devices)
+{
+ for (unsigned int i = 0; i < platforms.size(); ++i)
+ {
+ std::cout << "OpenCL platform [ " << i << " ]:" << std::endl;
+ prettyPrintPlatformInfo(platforms[i]);
+
+ for (unsigned int n = 0; n < devices[i].size(); ++n)
+ {
+ std::cout << "OpenCL platform [ " << i << " ], device [ " << n << " ]:" << std::endl;
+ prettyPrintDeviceInfo((devices[i])[n]);
+ }
+ }
+
+}
+
+// Verify a failed condition; return true on fail
+inline cl_bool OPENCL_V_FAIL( cl_int res )
+{
+ if( res == CL_SUCCESS )
+ return CL_FALSE;
+ else
+ return CL_TRUE;
+}
+
+std::string prettyPrintclFFTStatus( const cl_int& status )
+{
+ switch( status )
+ {
+ case CLFFT_INVALID_GLOBAL_WORK_SIZE:
+ return "CLFFT_INVALID_GLOBAL_WORK_SIZE";
+ case CLFFT_INVALID_MIP_LEVEL:
+ return "CLFFT_INVALID_MIP_LEVEL";
+ case CLFFT_INVALID_BUFFER_SIZE:
+ return "CLFFT_INVALID_BUFFER_SIZE";
+ case CLFFT_INVALID_GL_OBJECT:
+ return "CLFFT_INVALID_GL_OBJECT";
+ case CLFFT_INVALID_OPERATION:
+ return "CLFFT_INVALID_OPERATION";
+ case CLFFT_INVALID_EVENT:
+ return "CLFFT_INVALID_EVENT";
+ case CLFFT_INVALID_EVENT_WAIT_LIST:
+ return "CLFFT_INVALID_EVENT_WAIT_LIST";
+ case CLFFT_INVALID_GLOBAL_OFFSET:
+ return "CLFFT_INVALID_GLOBAL_OFFSET";
+ case CLFFT_INVALID_WORK_ITEM_SIZE:
+ return "CLFFT_INVALID_WORK_ITEM_SIZE";
+ case CLFFT_INVALID_WORK_GROUP_SIZE:
+ return "CLFFT_INVALID_WORK_GROUP_SIZE";
+ case CLFFT_INVALID_WORK_DIMENSION:
+ return "CLFFT_INVALID_WORK_DIMENSION";
+ case CLFFT_INVALID_KERNEL_ARGS:
+ return "CLFFT_INVALID_KERNEL_ARGS";
+ case CLFFT_INVALID_ARG_SIZE:
+ return "CLFFT_INVALID_ARG_SIZE";
+ case CLFFT_INVALID_ARG_VALUE:
+ return "CLFFT_INVALID_ARG_VALUE";
+ case CLFFT_INVALID_ARG_INDEX:
+ return "CLFFT_INVALID_ARG_INDEX";
+ case CLFFT_INVALID_KERNEL:
+ return "CLFFT_INVALID_KERNEL";
+ case CLFFT_INVALID_KERNEL_DEFINITION:
+ return "CLFFT_INVALID_KERNEL_DEFINITION";
+ case CLFFT_INVALID_KERNEL_NAME:
+ return "CLFFT_INVALID_KERNEL_NAME";
+ case CLFFT_INVALID_PROGRAM_EXECUTABLE:
+ return "CLFFT_INVALID_PROGRAM_EXECUTABLE";
+ case CLFFT_INVALID_PROGRAM:
+ return "CLFFT_INVALID_PROGRAM";
+ case CLFFT_INVALID_BUILD_OPTIONS:
+ return "CLFFT_INVALID_BUILD_OPTIONS";
+ case CLFFT_INVALID_BINARY:
+ return "CLFFT_INVALID_BINARY";
+ case CLFFT_INVALID_SAMPLER:
+ return "CLFFT_INVALID_SAMPLER";
+ case CLFFT_INVALID_IMAGE_SIZE:
+ return "CLFFT_INVALID_IMAGE_SIZE";
+ case CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+ return "CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+ case CLFFT_INVALID_MEM_OBJECT:
+ return "CLFFT_INVALID_MEM_OBJECT";
+ case CLFFT_INVALID_HOST_PTR:
+ return "CLFFT_INVALID_HOST_PTR";
+ case CLFFT_INVALID_COMMAND_QUEUE:
+ return "CLFFT_INVALID_COMMAND_QUEUE";
+ case CLFFT_INVALID_QUEUE_PROPERTIES:
+ return "CLFFT_INVALID_QUEUE_PROPERTIES";
+ case CLFFT_INVALID_CONTEXT:
+ return "CLFFT_INVALID_CONTEXT";
+ case CLFFT_INVALID_DEVICE:
+ return "CLFFT_INVALID_DEVICE";
+ case CLFFT_INVALID_PLATFORM:
+ return "CLFFT_INVALID_PLATFORM";
+ case CLFFT_INVALID_DEVICE_TYPE:
+ return "CLFFT_INVALID_DEVICE_TYPE";
+ case CLFFT_INVALID_VALUE:
+ return "CLFFT_INVALID_VALUE";
+ case CLFFT_MAP_FAILURE:
+ return "CLFFT_MAP_FAILURE";
+ case CLFFT_BUILD_PROGRAM_FAILURE:
+ return "CLFFT_BUILD_PROGRAM_FAILURE";
+ case CLFFT_IMAGE_FORMAT_NOT_SUPPORTED:
+ return "CLFFT_IMAGE_FORMAT_NOT_SUPPORTED";
+ case CLFFT_IMAGE_FORMAT_MISMATCH:
+ return "CLFFT_IMAGE_FORMAT_MISMATCH";
+ case CLFFT_MEM_COPY_OVERLAP:
+ return "CLFFT_MEM_COPY_OVERLAP";
+ case CLFFT_PROFILING_INFO_NOT_AVAILABLE:
+ return "CLFFT_PROFILING_INFO_NOT_AVAILABLE";
+ case CLFFT_OUT_OF_HOST_MEMORY:
+ return "CLFFT_OUT_OF_HOST_MEMORY";
+ case CLFFT_OUT_OF_RESOURCES:
+ return "CLFFT_OUT_OF_RESOURCES";
+ case CLFFT_MEM_OBJECT_ALLOCATION_FAILURE:
+ return "CLFFT_MEM_OBJECT_ALLOCATION_FAILURE";
+ case CLFFT_COMPILER_NOT_AVAILABLE:
+ return "CLFFT_COMPILER_NOT_AVAILABLE";
+ case CLFFT_DEVICE_NOT_AVAILABLE:
+ return "CLFFT_DEVICE_NOT_AVAILABLE";
+ case CLFFT_DEVICE_NOT_FOUND:
+ return "CLFFT_DEVICE_NOT_FOUND";
+ case CLFFT_SUCCESS:
+ return "CLFFT_SUCCESS";
+ case CLFFT_NOTIMPLEMENTED:
+ return "CLFFT_NOTIMPLEMENTED";
+ case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
+ return "CLFFT_TRANSPOSED_NOTIMPLEMENTED";
+ case CLFFT_FILE_NOT_FOUND:
+ return "CLFFT_FILE_NOT_FOUND";
+ case CLFFT_FILE_CREATE_FAILURE:
+ return "CLFFT_FILE_CREATE_FAILURE";
+ case CLFFT_VERSION_MISMATCH:
+ return "CLFFT_VERSION_MISMATCH";
+ case CLFFT_INVALID_PLAN:
+ return "CLFFT_INVALID_PLAN";
+ default:
+ return "Error code not defined";
+ break;
+ }
+}
+
+
+int discoverCLPlatforms( cl_device_type deviceType,
+ std::vector< cl_platform_id >& platforms,
+ std::vector< std::vector< cl_device_id > >& devices )
+{
+ cl_int status = 0;
+
+ /*
+ * Find all OpenCL platforms this system has to offer.
+ */
+
+ cl_uint numPlatforms = 0;
+ cl_platform_id platform = NULL;
+ OPENCL_V_THROW(::clGetPlatformIDs(0, NULL, &numPlatforms),
+ "Getting number of platforms( ::clGetPlatformsIDs() )");
+
+ if (numPlatforms > 0)
+ {
+ platforms.resize( numPlatforms );
+ devices.resize( numPlatforms );
+ OPENCL_V_THROW(::clGetPlatformIDs(numPlatforms, &platforms[0], NULL),
+ "Getting Platform Id's ( ::clGetPlatformsIDs() )");
+
+ if (NULL == platforms[0])
+ {
+ throw std::runtime_error("No appropriate OpenCL platform could be found");
+ }
+
+ /*
+ * Now, for each platform get all available devices matching deviceType.
+ */
+ for (unsigned int i = 0; i < numPlatforms; ++i)
+ {
+ // Get the device list for deviceType.
+ //
+ cl_uint numDevices = 0;
+ OPENCL_V_WARN(::clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices),
+ "Getting OpenCL devices ( ::clGetDeviceIDs() )");
+ if (0 == numDevices)
+ {
+ // OPENCL_V_WARN(CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
+ continue;
+ }
+
+ devices[i].resize(numDevices);
+ OPENCL_V_THROW(::clGetDeviceIDs(platforms[i], deviceType, numDevices, &(devices[i])[0], NULL),
+ "Getting OpenCL deviceIDs ( ::clGetDeviceIDs() )");
+ }
+ }
+
+ return 0;
+}
+
+std::vector< cl_device_id > initializeCL( cl_device_type deviceType,
+ cl_int deviceId,
+ cl_int platformId,
+ cl_context& context,
+ bool printclInfo)
+{
+ cl_int status = 0;
+ cl_platform_id platform = NULL;
+ std::vector< cl_device_id > devices(1);
+ devices[0] = NULL;
+
+ // Have a look at all the available platforms on this system
+ std::vector< cl_platform_id > platformInfos;
+ std::vector< std::vector< cl_device_id > > deviceInfos;
+ discoverCLPlatforms( deviceType, platformInfos, deviceInfos );
+
+
+ for (unsigned int i = 0; i < platformInfos.size(); ++i)
+ {
+ if(i == platformId)
+ {
+ for (unsigned int n = 0; n < deviceInfos[i].size(); ++n)
+ {
+ if (n == deviceId)
+ {
+ platform = platformInfos[i];
+ devices[0] = deviceInfos[i][n];
+
+ if(printclInfo)
+ {
+ prettyPrintPlatformInfo(platform);
+ prettyPrintDeviceInfo(devices[0]);
+ }
+
+ break;
+ }
+ }
+
+ break;
+ }
+ }
+
+
+
+ // Do some error checking if we really selected a valid platform and a valid device
+ if (NULL == devices[0])
+ {
+ OPENCL_V_THROW(CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
+ }
+
+ if (NULL == platform)
+ {
+ throw std::runtime_error("No appropriate OpenCL platform could be found");
+ }
+
+ // Create an OpenCL context
+ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0 };
+ context = clCreateContext(cps,
+ (cl_uint)devices.size(),
+ &devices[0],
+ NULL,
+ NULL,
+ &status);
+ OPENCL_V_THROW(status, "Creating Context ( ::clCreateContextFromType() )");
+
+ return devices;
+}
+
+int cleanupCL( cl_context* context, cl_command_queue* commandQueue,
+ const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[] )
+{
+ releaseOpenCLMemBuffer( numBuffersIn, inputBuffer);
+ releaseOpenCLMemBuffer( numBuffersOut, outputBuffer);
+
+ if( *commandQueue != NULL )
+ OPENCL_V_THROW( clReleaseCommandQueue( *commandQueue ), "Error: In clReleaseCommandQueue\n" );
+
+ if( *context != NULL )
+ OPENCL_V_THROW( clReleaseContext( *context ), "Error: In clReleaseContext\n" );
+
+ return 0;
+}
+
+int createOpenCLMemoryBuffer( cl_context& context, const size_t bufferSizeBytes, const cl_uint numBuffers, cl_mem buffer[], cl_mem_flags accessibility) {
+ cl_int status = 0;
+
+ for( cl_uint i = 0; i < numBuffers; ++i )
+ {
+ buffer[ i ] = ::clCreateBuffer( context, accessibility, bufferSizeBytes, NULL, &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
+ }
+
+ return 0;
+}
+
+int releaseOpenCLMemBuffer( const cl_uint numBuffers, cl_mem buffer[])
+{
+ for( cl_uint i = 0; i < numBuffers; ++i )
+ {
+ if( buffer[ i ] != NULL )
+ OPENCL_V_THROW( clReleaseMemObject( buffer[ i ] ), "Error: In clReleaseMemObject\n" );
+ }
+
+ return 0;
+}
+
+void createOpenCLCommandQueue( cl_context& context,
+ cl_uint commandQueueFlags,
+ cl_command_queue& commandQueue,
+ std::vector< cl_device_id > devices,
+ const size_t bufferSizeBytesIn,
+ const cl_uint numBuffersIn,
+ cl_mem clMemBufferIn[],
+ const size_t bufferSizeBytesOut,
+ const cl_uint numBuffersOut,
+ cl_mem clMemBufferOut[] )
+{
+ cl_int status = 0;
+ commandQueue = ::clCreateCommandQueue( context, devices[0], commandQueueFlags, &status );
+ OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" );
+
+ createOpenCLMemoryBuffer( context, bufferSizeBytesIn, numBuffersIn, clMemBufferIn, CL_MEM_READ_WRITE);
+ createOpenCLMemoryBuffer( context, bufferSizeBytesOut, numBuffersOut, clMemBufferOut, CL_MEM_READ_WRITE);
+}
+
diff --git a/src/callback-client/openCL.misc.h b/src/callback-client/openCL.misc.h
new file mode 100644
index 0000000..67ab537
--- /dev/null
+++ b/src/callback-client/openCL.misc.h
@@ -0,0 +1,151 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( OPENCL_MISC_H )
+#define OPENCL_MISC_H
+#include <memory>
+#include <stdexcept>
+#include "unicode.compatibility.h"
+
+// Creating a portable defintion of countof
+#if defined( _MSC_VER )
+ #define countOf _countof
+#else
+ #define countOf( arr ) ( sizeof( arr ) / sizeof( arr[ 0 ] ) )
+#endif
+
+/*
+ * \brief OpenCL platform and device discovery
+ * Creates a list of OpenCL platforms
+ * and their associated devices
+ */
+int discoverCLPlatforms( cl_device_type deviceType,
+ std::vector< cl_platform_id >& platforms,
+ std::vector< std::vector< cl_device_id > >& devices );
+
+void prettyPrintCLPlatforms(std::vector< cl_platform_id >& platforms,
+ std::vector< std::vector< cl_device_id > >& devices);
+
+/*
+ * \brief OpenCL related initialization
+ * Create Context, Device list
+ * Load CL file, compile, link CL source
+ * Build program and kernel objects
+ */
+std::vector< cl_device_id > initializeCL( cl_device_type deviceType,
+ cl_int deviceId,
+ cl_int platformId,
+ cl_context& context,
+ bool printclInfo );
+
+/*
+ * \brief OpenCL memory buffer creation
+ */
+int createOpenCLMemoryBuffer(
+ cl_context& context,
+ const size_t bufferSizeBytes,
+ const cl_uint numBuffers,
+ cl_mem buffer[],
+ cl_mem_flags accessibility
+ );
+
+/*
+ * \brief OpenCL command queue creation
+ * Create Command Queue
+ * Create OpenCL memory buffer objects
+ */
+void createOpenCLCommandQueue( cl_context& context,
+ cl_uint commandQueueFlags,
+ cl_command_queue& commandQueue,
+ std::vector< cl_device_id > devices,
+ const size_t bufferSizeBytesIn,
+ const cl_uint numBuffersIn,
+ cl_mem clMemBufferIn[],
+ const size_t bufferSizeBytesOut,
+ const cl_uint numBuffersOut,
+ cl_mem clMemBufferOut[] );
+
+/*
+ * \brief release OpenCL memory buffer
+ */
+int releaseOpenCLMemBuffer( const cl_uint numBuffers, cl_mem buffer[] );
+
+std::string prettyPrintclFFTStatus( const cl_int& status );
+
+// This is used to either wrap an OpenCL function call, or to explicitly check a variable for an OpenCL error condition.
+// If an error occurs, we throw.
+// Note: std::runtime_error does not take unicode strings as input, so only strings supported
+inline cl_int OpenCL_V_Throw ( cl_int res, const std::string& msg, size_t lineno )
+{
+ switch( res )
+ {
+ case CL_SUCCESS: /**< No error */
+ break;
+ default:
+ {
+ std::stringstream tmp;
+ tmp << "OPENCL_V_THROWERROR< ";
+ tmp << prettyPrintclFFTStatus( res );
+ tmp << " > (";
+ tmp << lineno;
+ tmp << "): ";
+ tmp << msg;
+ std::string errorm (tmp.str());
+ std::cout << errorm<< std::endl;
+ throw std::runtime_error( errorm );
+ }
+ }
+
+ return res;
+}
+#define OPENCL_V_THROW(_status,_message) OpenCL_V_Throw (_status, _message, __LINE__)
+
+inline cl_int OpenCL_V_Warn(cl_int res, const std::string& msg, size_t lineno)
+{
+ switch (res)
+ {
+ case CL_SUCCESS: /**< No error */
+ break;
+ case CL_DEVICE_NOT_FOUND:
+ // This happens all the time when discovering the OpenCL capabilities of the system,
+ // so do nothing here.
+ break;
+ default:
+ {
+ std::stringstream tmp;
+ tmp << "OPENCL_V_WARN< ";
+ tmp << prettyPrintclFFTStatus(res);
+ tmp << " > (";
+ tmp << lineno;
+ tmp << "): ";
+ tmp << msg;
+ std::string errorm(tmp.str());
+ std::cout << errorm << std::endl;
+ }
+ }
+
+ return res;
+}
+#define OPENCL_V_WARN(_status,_message) OpenCL_V_Warn (_status, _message, __LINE__);
+
+/*
+ * \brief Release OpenCL resources (Context, Memory etc.)
+ */
+int cleanupCL( cl_context* context, cl_command_queue* commandQueue, const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[]);
+
+#endif
diff --git a/src/callback-client/stdafx.cpp b/src/callback-client/stdafx.cpp
new file mode 100644
index 0000000..2587b2c
--- /dev/null
+++ b/src/callback-client/stdafx.cpp
@@ -0,0 +1,25 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// stdafx.cpp : source file that includes just the standard includes
+// clFFT.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+
+#include "stdafx.h"
+
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list