[clfft] 02/128: precallback-initialversion
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:32 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 3b8afc505d9dae90f4e6900cfd84de926b33a2cb
Author: Pradeep <pradeep.rao at amd.com>
Date: Tue Jul 21 16:30:28 2015 +0530
precallback-initialversion
---
src/CMakeLists.txt | 15 +
src/client-callback/CMakeLists.txt | 62 +++
src/client-callback/callback-client.cpp | 744 ++++++++++++++++++++++++++++++++
src/client-callback/client.h | 70 +++
src/client-callback/openCL.misc.cpp | 536 +++++++++++++++++++++++
src/client-callback/openCL.misc.h | 151 +++++++
src/client-callback/stdafx.cpp | 25 ++
src/include/clFFT.h | 24 ++
src/library/accessors.cpp | 35 ++
src/library/action.cpp | 13 +
src/library/generator.stockham.cpp | 220 +++++++++-
src/library/plan.cpp | 8 +
src/library/plan.h | 24 +-
13 files changed, 1910 insertions(+), 17 deletions(-)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index af571ce..4a87888 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -72,6 +72,7 @@ option( BUILD_TEST "Build the library testing suite (dependency on google test,
option( BUILD_LOADLIBRARIES "Build the optional dynamic load libraries that the FFT runtime will search for" ON )
option( BUILD_SHARED_LIBS "Build shared libraries." ON)
option( BUILD_EXAMPLES "Build examples." ON)
+option( BUILD_CALLBACK_CLIENT "Build a command line clFFT client program that tests callback functionality (dependency on Boost)" ON )
# If BOOST_ROOT is defined as an environment value, use that value and cache it so it's visible in the cmake-gui.
# Otherwise, create a sensible default that the user can change
@@ -175,6 +176,13 @@ if( BUILD_TEST )
endif( )
endif( )
+# The callback client is built only when the user requested it and Boost was found;
+# start from "disabled" and switch it on when both conditions hold
+set( FFT_CALLBACK_CLIENT OFF )
+if( BUILD_CALLBACK_CLIENT AND Boost_FOUND )
+    set( FFT_CALLBACK_CLIENT ON )
+endif( )
+
# FFLAGS depend on the compiler, grab the compiler name from the path
get_filename_component( C_COMPILER_NAME ${CMAKE_C_COMPILER} NAME_WE )
# message( "C_COMPILER_NAME: " ${C_COMPILER_NAME} )
@@ -283,6 +291,13 @@ else( )
message( "GoogleTest unit tests will NOT be built" )
endif( )
+# Add the callback client only when FFT_CALLBACK_CLIENT is set and its source directory exists;
+# otherwise tell the user that it is being left out of the build
+if( NOT FFT_CALLBACK_CLIENT OR NOT IS_DIRECTORY "${PROJECT_SOURCE_DIR}/client-callback" )
+    message( "FFT callback client will NOT be built" )
+else( )
+    add_subdirectory( client-callback )
+endif( )
+
if( BUILD_EXAMPLES )
add_subdirectory( examples )
endif()
diff --git a/src/client-callback/CMakeLists.txt b/src/client-callback/CMakeLists.txt
new file mode 100644
index 0000000..268f2b6
--- /dev/null
+++ b/src/client-callback/CMakeLists.txt
@@ -0,0 +1,62 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+
+# client
+# Source and header files that make up the callback test client
+set( Client.Source callback-client.cpp
+    openCL.misc.cpp
+    stdafx.cpp )
+
+set( Client.Headers client.h
+    openCL.misc.h
+    ../statTimer/statisticalTimer.extern.h
+    ../include/unicode.compatibility.h
+    ../include/stdafx.h
+    ../include/targetver.h
+    ../include/clFFT.h )
+
+set( Client.Files ${Client.Source} ${Client.Headers} )
+
+# Platform specific compile flags and system libraries
+set( DL_LIB "" )
+if( WIN32 )
+    add_definitions( "/D_CONSOLE" )
+elseif( APPLE )
+    set( CMAKE_CXX_FLAGS "-std=c++11 -stdlib=libc++ ${CMAKE_CXX_FLAGS}" )
+else( )
+    # dlopen() and dlclose() need the platform's dynamic loading library; CMAKE_DL_LIBS names it
+    # portably instead of hard-coding -ldl. librt is linked for clock_gettime(), used by client.h.
+    set( DL_LIB ${CMAKE_DL_LIBS} rt )
+endif( )
+
+# Header search paths: Boost, OpenCL, generated headers in the build tree and the library's own headers
+include_directories( ${Boost_INCLUDE_DIRS} ${OPENCL_INCLUDE_DIRS} ../../../common ${PROJECT_BINARY_DIR}/include ../include )
+
+add_executable( clFFT-callback ${Client.Files} )
+
+target_link_libraries( clFFT-callback clFFT ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} ${DL_LIB} )
+
+set_target_properties( clFFT-callback PROPERTIES VERSION ${CLFFT_VERSION} )
+set_target_properties( clFFT-callback PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
+if( APPLE )
+    # properly deal with RPATH on mac
+    set_target_properties( clFFT-callback PROPERTIES INSTALL_RPATH "@loader_path/../lib${SUFFIX_LIB}")
+endif()
+
+# CPack configuration; include the executable into the package
+install( TARGETS clFFT-callback
+    RUNTIME DESTINATION bin${SUFFIX_BIN}
+    LIBRARY DESTINATION lib${SUFFIX_LIB}
+    ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+    )
diff --git a/src/client-callback/callback-client.cpp b/src/client-callback/callback-client.cpp
new file mode 100644
index 0000000..fc8de65
--- /dev/null
+++ b/src/client-callback/callback-client.cpp
@@ -0,0 +1,744 @@
+#include "stdafx.h"
+#include <functional>
+#include <cmath>
+
+#include "client.h"
+#include "../library/private.h"
+#include "openCL.misc.h"
+#include "../statTimer/statisticalTimer.extern.h"
+#include "../include/sharedLibrary.h"
+#include "../include/unicode.compatibility.h"
+
+namespace po = boost::program_options;
+
+// Factor that the sample precallbacks multiply every input element by; the output check in
+// transform() expects results scaled by this value when a precallback is active.
+#define SCALAR 100
+// Selects which precallback variant transform() registers (see its switch statements).
+#define PRECALLBACKTYPE 1
+
+// OpenCL source of the precallback used for interleaved complex input. It is turned into a string
+// with STRINGIFY() where the callback is registered. The callback reads an int scalar from
+// userdata and returns the float2 found at in + offset multiplied by that scalar.
+#define MULVAL float2 mulval(__global void* in, int offset, __global void* userdata)\n \
+ { \n \
+ int scalar = *((__global int*)userdata); \n \
+ float2 ret = *((__global float2*)in + offset) * scalar; \n \
+ return ret; \n \
+ }
+
+// OpenCL source of the precallback used for planar complex input. Real and imaginary parts come
+// from separate buffers; the scalar is read from a USER_DATA structure passed through userdata.
+#define MULVAL_PLANAR float2 mulval(__global void* inRe, __global void* inIm, int offset, __global void* userdata)\n \
+ { \n \
+ __global USER_DATA *data = (__global USER_DATA *)userdata; \n \
+ int scalar = (int)data->scalar; \n \
+ float2 ret; \n \
+ ret.x = *((__global float*)inRe + offset) * scalar; \n \
+ ret.y = *((__global float*)inIm + offset) * scalar; \n \
+ return ret; \n \
+ }
+
+// Definition of the USER_DATA structure. Kept in a macro so that the same text can be expanded
+// here for the host code and stringified with STRINGIFY() for the planar callback registration.
+#define STRUCT_USERDATA typedef struct USER_DATA \
+ { \
+ int scalar; \
+ int datalength; \
+ } USER_DATA;
+STRUCT_USERDATA
+
+// Stream extraction for clfftLayout. This is used with the program_options class so that the user
+// can type an integer on the command line and we store it into an enum variable.
+// The layout is only updated when an integer was actually read; after a failed read the stream is
+// returned in its failed state and the previous layout value is left untouched.
+template<class _Elem, class _Traits>
+std::basic_istream<_Elem, _Traits> & operator>> (std::basic_istream<_Elem, _Traits> & stream, clfftLayout & layout)
+{
+    // Initialised so that no indeterminate value is ever converted to the enum
+    cl_uint tmp = 0;
+    if( stream >> tmp )
+    {
+        layout = clfftLayout(tmp);
+    }
+    return stream;
+}
+
+template < typename T >
+int transform( size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
+ clfftLayout in_layout, clfftLayout out_layout,
+ clfftResultLocation place, clfftPrecision precision, clfftDirection dir,
+ cl_device_type deviceType, cl_int deviceId, cl_int platformId, bool printInfo,
+ cl_uint command_queue_flags, cl_uint profile_count,
+ std::auto_ptr< clfftSetupData > setupData,
+ bool hasPrecallback)
+{
+ // Our command line does not specify what dimension FFT we wish to transform; we decode
+ // this from the lengths that the user specifies for X, Y, Z. A length of one means that
+ // The user does not want that dimension.
+
+ const size_t max_dimensions = 3;
+ size_t strides[ 4 ];
+ size_t o_strides[ 4 ];
+ size_t fftVectorSize = 0;
+ size_t fftVectorSizePadded = 0;
+ size_t fftBatchSize = 0;
+ size_t outfftVectorSize = 0;
+ size_t outfftVectorSizePadded = 0;
+ size_t outfftBatchSize = 0;
+ size_t size_of_input_buffers_in_bytes = 0;
+ size_t size_of_output_buffers_in_bytes = 0;
+ cl_uint number_of_output_buffers = 0;
+ clfftDim dim = CLFFT_1D;
+ cl_mem input_cl_mem_buffers [2] = { NULL, NULL };
+ cl_mem output_cl_mem_buffers[2] = { NULL, NULL };
+ std::vector< cl_device_id > device_id;
+ cl_context context;
+ cl_command_queue queue;
+ cl_event outEvent = NULL;
+ clfftPlanHandle plan_handle;
+
+ for (unsigned u = 0; u < max_dimensions; ++u) {
+ if (0 != lengths[u])
+ continue;
+ lengths[u] = 1;
+ }
+
+ if( lengths[ 1 ] > 1 )
+ {
+ dim = CLFFT_2D;
+ }
+ if( lengths[ 2 ] > 1 )
+ {
+ dim = CLFFT_3D;
+ }
+
+ strides[ 0 ] = inStrides[0];
+ strides[ 1 ] = inStrides[1];
+ strides[ 2 ] = inStrides[2];
+ strides[ 3 ] = inStrides[3];
+
+ o_strides[ 0 ] = outStrides[0];
+ o_strides[ 1 ] = outStrides[1];
+ o_strides[ 2 ] = outStrides[2];
+ o_strides[ 3 ] = outStrides[3];
+
+ fftVectorSize = lengths[0] * lengths[1] * lengths[2];
+ fftVectorSizePadded = strides[3];
+ fftBatchSize = fftVectorSizePadded * batch_size;
+
+ size_t Nt = 1 + lengths[0]/2;
+
+ if(place == CLFFT_INPLACE)
+ {
+ outfftVectorSize = fftVectorSize;
+ outfftVectorSizePadded = fftVectorSizePadded;
+ outfftBatchSize = fftBatchSize;
+ }
+ else
+ {
+ outfftVectorSize = lengths[0] * lengths[1] * lengths[2];
+ outfftVectorSizePadded = o_strides[3];
+ outfftBatchSize = outfftVectorSizePadded * batch_size;
+ }
+
+ // Real to complex case
+ if( (in_layout == CLFFT_REAL) || (out_layout == CLFFT_REAL) )
+ {
+ terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
+ return 1;
+ }
+
+ switch( out_layout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ number_of_output_buffers = 1;
+ size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > );
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+ number_of_output_buffers = 2;
+ size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T);
+ break;
+ default:
+ terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
+ return 1;
+ }
+
+ // Fill the input buffers
+ switch( in_layout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ {
+ // This call creates our openCL context and sets up our devices; expected to throw on error
+ size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > );
+
+ device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
+ createOpenCLCommandQueue( context,
+ command_queue_flags, queue,
+ device_id,
+ size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
+ size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
+
+ std::vector< std::complex< T > > input( fftBatchSize );
+
+ // set zero
+ for( cl_uint i = 0; i < fftBatchSize; ++i )
+ {
+ input[ i ] = 0;
+ }
+
+ // impulse test case
+ for(size_t b = 0; b < batch_size; b++)
+ {
+ size_t p3 = b * strides[3];
+ for(size_t k = 0; k < lengths[2]; k++)
+ {
+ size_t p2 = p3 + k * strides[2];
+ for(size_t j = 0; j < lengths[1]; j++)
+ {
+ size_t p1 = p2 + j * strides[1];
+ for(size_t i = 0; i < lengths[0]; i++)
+ {
+ size_t p0 = p1 + i * strides[0];
+ input[p0] = 1;
+ }
+ }
+ }
+ }
+
+
+ OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ],
+ 0, NULL, &outEvent ),
+ "clEnqueueWriteBuffer failed" );
+
+ }
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+ {
+ // This call creates our openCL context and sets up our devices; expected to throw on error
+ size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T );
+
+ device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
+ createOpenCLCommandQueue( context,
+ command_queue_flags, queue,
+ device_id,
+ size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
+ size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
+
+ std::vector< T > real( fftBatchSize );
+ std::vector< T > imag( fftBatchSize );
+
+ // set zero
+ for( cl_uint i = 0; i < fftBatchSize; ++i )
+ {
+ real[ i ] = 0;
+ imag[ i ] = 0;
+ }
+
+ // impulse test case
+ for(size_t b = 0; b < batch_size; b++)
+ {
+ size_t p3 = b * strides[3];
+ for(size_t k = 0; k < lengths[2]; k++)
+ {
+ size_t p2 = p3 + k * strides[2];
+ for(size_t j = 0; j < lengths[1]; j++)
+ {
+ size_t p1 = p2 + j * strides[1];
+ for(size_t i = 0; i < lengths[0]; i++)
+ {
+ size_t p0 = p1 + i * strides[0];
+ real[p0] = 1;
+ }
+ }
+ }
+ }
+
+
+ OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
+ 0, NULL, &outEvent ),
+ "clEnqueueWriteBuffer failed" );
+ OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
+ 0, NULL, &outEvent ),
+ "clEnqueueWriteBuffer failed" );
+ }
+ break;
+ default:
+ terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
+ return 1;
+ }
+
+ // Discover and load the timer module if present
+ void* timerLibHandle = LoadSharedLibrary( "lib", "StatTimer", false );
+ if( timerLibHandle == NULL )
+ {
+ terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl;
+ }
+
+
+ // Timer module discovered and loaded successfully
+ // Initialize function pointers to call into the shared module
+ PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) );
+
+ // Create and initialize our timer class, if the external timer shared library loaded
+ baseStatTimer* timer = NULL;
+ size_t clFFTID = 0;
+ if( get_timer )
+ {
+ timer = get_timer( CLFFT_GPU );
+ timer->Reserve( 1, profile_count );
+ timer->setNormalize( true );
+
+ clFFTID = timer->getUniqueID( "clFFT", 0 );
+ }
+
+ OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
+ OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" );
+
+ // Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
+ OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
+ OPENCL_V_THROW( clfftSetLayout( plan_handle, in_layout, out_layout ), "clfftSetLayout failed" );
+ OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batch_size ), "clfftSetPlanBatchSize failed" );
+ OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
+
+ OPENCL_V_THROW (clfftSetPlanInStride ( plan_handle, dim, strides ), "clfftSetPlanInStride failed" );
+ OPENCL_V_THROW (clfftSetPlanOutStride ( plan_handle, dim, o_strides ), "clfftSetPlanOutStride failed" );
+ OPENCL_V_THROW (clfftSetPlanDistance ( plan_handle, strides[ 3 ], o_strides[ 3 ]), "clfftSetPlanDistance failed" );
+
+ // Set backward scale factor to 1.0 for non real FFTs to do correct output checks
+ if(dir == CLFFT_BACKWARD && in_layout != CLFFT_REAL && out_layout != CLFFT_REAL)
+ OPENCL_V_THROW (clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, (cl_float)1.0f ), "clfftSetPlanScale failed" );
+
+    // Check for Precallback.
+    // Currently the test registers a callback only for 1D transforms. For any other dimension no
+    // callback is attached to the plan, so the flag is cleared to make the output check further
+    // down expect unscaled results instead of reporting a spurious failure.
+    if (hasPrecallback && dim != CLFFT_1D)
+    {
+        terr << _T("Precallback is only exercised for 1D transforms; running without a callback" ) << std::endl;
+        hasPrecallback = false;
+    }
+
+    if (hasPrecallback)
+    {
+        int precallbakType = PRECALLBACKTYPE;
+        // NOTE(review): this buffer is not released anywhere in the visible code; it has to stay
+        // alive while the plan uses it, so releasing it belongs next to the final cleanup.
+        cl_mem userdata = NULL;
+        cl_int userdataStatus = CL_SUCCESS;
+
+        if (in_layout == CLFFT_COMPLEX_INTERLEAVED)
+        {
+            switch (precallbakType)
+            {
+            case 1: //C2C 1D Interleaved without LDS
+                {
+                    char* precallbackstr = STRINGIFY(MULVAL);
+                    int h_userdata[1] = { SCALAR };
+                    userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), (void*)h_userdata, &userdataStatus);
+                    OPENCL_V_THROW( userdataStatus, "Creating precallback userdata buffer failed" );
+
+                    //Register the callback
+                    OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
+                }
+                break;
+            default:
+                break;
+            }
+        }
+
+        if (in_layout == CLFFT_COMPLEX_PLANAR)
+        {
+            switch (precallbakType)
+            {
+            case 1: //C2C 1D PLANAR without LDS
+                {
+                    char* precallbackstr = STRINGIFY(MULVAL_PLANAR);
+                    USER_DATA h_userdata[1];
+                    h_userdata[0].scalar = SCALAR;
+                    // Not read by the callback; set so that no uninitialised bytes are copied to the device
+                    h_userdata[0].datalength = 0;
+                    userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA), (void*)h_userdata, &userdataStatus);
+                    OPENCL_V_THROW( userdataStatus, "Creating precallback userdata buffer failed" );
+
+                    //Register the callback
+                    OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, STRINGIFY(STRUCT_USERDATA), 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
+                }
+                break;
+            default:
+                break;
+            }
+        }
+    }
+
+ OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &queue, NULL, NULL ), "clfftBakePlan failed" );
+
+ //get the buffersize
+ size_t buffersize=0;
+ OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" );
+
+ //allocate the intermediate buffer
+ cl_mem clMedBuffer=NULL;
+
+ if (buffersize)
+ {
+ cl_int medstatus;
+ clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
+ OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
+ }
+
+ if (( place == CLFFT_INPLACE )
+ && ( in_layout != out_layout ))
+ {
+ switch( in_layout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ {
+ if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
+ {
+ throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
+ }
+ break;
+ }
+ case CLFFT_COMPLEX_PLANAR:
+ {
+ if( (out_layout == CLFFT_COMPLEX_INTERLEAVED) || (out_layout == CLFFT_HERMITIAN_INTERLEAVED) )
+ {
+ throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
+ }
+ break;
+ }
+ default:
+ terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
+ return 1;
+ }
+ }
+
+ cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &output_cl_mem_buffers[ 0 ];
+
+ Timer tr;
+ tr.Start();
+
+ // Loop as many times as the user specifies to average out the timings
+ for( cl_uint i = 0; i < profile_count; ++i )
+ {
+ if( timer ) timer->Start( clFFTID );
+
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent,
+ &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ),
+ "clfftEnqueueTransform failed" );
+
+ if( timer ) timer->Stop( clFFTID );
+ }
+ OPENCL_V_THROW( clFinish( queue ), "clFinish failed" );
+ if(clMedBuffer) clReleaseMemObject(clMedBuffer);
+
+ double wtime = tr.Sample()/((double)profile_count);
+ size_t totalLen = 1;
+ for(int i=0; i<dim; i++) totalLen *= lengths[i];
+ double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+ if(profile_count > 1)
+ {
+ tout << "\nExecution wall time: " << 1000.0*wtime << " ms" << std::endl;
+ tout << "Execution gflops: " << ((double)batch_size * opsconst)/(1000000000.0*wtime) << std::endl;
+ }
+
+ if( timer && (command_queue_flags & CL_QUEUE_PROFILING_ENABLE) )
+ {
+ // Remove all timings that are outside of 2 stddev (keeps roughly 95% of normally distributed samples); we ignore outliers to get a more consistent result
+ timer->pruneOutliers( 2.0 );
+ timer->Print( );
+ timer->Reset( );
+ }
+
+ /*****************/
+ FreeSharedLibrary( timerLibHandle );
+
+ // Read and check output data
+ // This check is not valid if the FFT is executed multiple times inplace.
+ //
+ if (( place == CLFFT_OUTOFPLACE )
+ || ( profile_count == 1))
+ {
+ bool checkflag= false;
+ switch( out_layout )
+ {
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_COMPLEX_INTERLEAVED:
+ {
+ std::vector< std::complex< T > > output( outfftBatchSize );
+
+ if( place == CLFFT_INPLACE )
+ {
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &output[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ }
+ else
+ {
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &output[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ }
+
+ //check output data
+ for( cl_uint i = 0; i < outfftBatchSize; ++i )
+ {
+ if (0 == (i % outfftVectorSizePadded))
+ {
+ if (hasPrecallback)
+ {
+ if (output[i].real() != outfftVectorSize * SCALAR)
+ {
+ checkflag = true;
+ break;
+ }
+ }
+ else
+ {
+ if (output[i].real() != outfftVectorSize)
+ {
+ checkflag = true;
+ break;
+ }
+ }
+ }
+ else
+ {
+ if (output[ i ].real() != 0)
+ {
+ checkflag = true;
+ break;
+ }
+ }
+
+ if (output[ i ].imag() != 0)
+ {
+ checkflag = true;
+ break;
+ }
+ //std::cout << i << " real = " << output[i].real() << " img = " << output[ i ].imag() << std::endl;
+ }
+ }
+ break;
+ case CLFFT_HERMITIAN_PLANAR:
+ case CLFFT_COMPLEX_PLANAR:
+ {
+ std::valarray< T > real( outfftBatchSize );
+ std::valarray< T > imag( outfftBatchSize );
+
+ if( place == CLFFT_INPLACE )
+ {
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ }
+ else
+ {
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 1 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &imag[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ }
+
+ // Check output data
+ for( cl_uint i = 0; i < outfftBatchSize; ++i )
+ {
+ if (0 == (i % outfftVectorSizePadded))
+ {
+ if (hasPrecallback)
+ {
+ if (real[i] != outfftVectorSize * SCALAR)
+ {
+ checkflag = true;
+ break;
+ }
+ }
+ else
+ {
+ if (real[i] != outfftVectorSize)
+ {
+ checkflag = true;
+ break;
+ }
+ }
+ }
+ else
+ {
+ if (real[i] != 0)
+ {
+ checkflag = true;
+ break;
+ }
+ }
+
+ if (imag[i] != 0)
+ {
+ checkflag = true;
+ break;
+ }
+ //std::cout << i << " real = " << real[i] << " img = " << imag[ i ] << std::endl;
+ }
+ }
+ break;
+ default:
+ terr << _T("Complex-Real callback cases not yet implemented" ) << std::endl;
+ throw std::runtime_error( "Input layout format not yet supported" );
+ break;
+ }
+
+ if (checkflag)
+ {
+ std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
+ }
+ else
+ {
+ std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
+ }
+ }
+
+ OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
+ OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );
+
+ cleanupCL( &context, &queue, countOf( input_cl_mem_buffers ), input_cl_mem_buffers, countOf( output_cl_mem_buffers ), output_cl_mem_buffers, &outEvent );
+ return 0;
+}
+
+// Entry point of the callback test client.
+// Parses the command line, validates the requested layouts and strides, and then runs one
+// complex-to-complex transform() in the requested precision.
+// Returns 0 on success (or after printing the help text) and 1 when the options are invalid, an
+// exception is reported, or transform() itself reports a failure.
+int main(int argc, char **argv)
+{
+    // OpenCL state
+    cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+    cl_int deviceId = 0;
+    cl_int platformId = 0;
+
+    // FFT state
+
+    clfftResultLocation place = CLFFT_INPLACE;
+    clfftLayout inLayout = CLFFT_COMPLEX_INTERLEAVED;
+    clfftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED;
+    clfftPrecision precision = CLFFT_SINGLE;
+    clfftDirection dir = CLFFT_FORWARD;
+    size_t lengths[ 3 ] = {1,1,1};
+    size_t iStrides[ 4 ] = {0,0,0,0};
+    size_t oStrides[ 4 ] = {0,0,0,0};
+    cl_uint profile_count = 0;
+
+    cl_uint command_queue_flags = 0;
+    size_t batchSize = 1;
+
+    //callback
+    bool hasPrecallback = true;
+
+    // Initialize flags for FFT library
+    std::auto_ptr< clfftSetupData > setupData( new clfftSetupData );
+    OPENCL_V_THROW( clfftInitSetupData( setupData.get( ) ),
+        "clfftInitSetupData failed" );
+
+    try
+    {
+        // Declare the supported options.
+        po::options_description desc( "clFFT client command line options" );
+        desc.add_options()
+            ( "help,h", "produces this help message" )
+            ( "gpu,g", "Force selection of OpenCL GPU devices only" )
+            ( "cpu,c", "Force selection of OpenCL CPU devices only" )
+            ( "all,a", "Force selection of all OpenCL devices (default)" )
+            ( "outPlace,o", "Out of place FFT transform (default: in place)" )
+            ( "double", "Double precision transform (default: single)" )
+            ( "inv", "Backward transform (default: forward)" )
+            ( "dumpKernels,d", "FFT engine will dump generated OpenCL FFT kernels to disk (default: dump off)" )
+            ( "noprecall", "Disable Precallback (default: precallback on)" )
+            ( "lenX,x", po::value< size_t >( &lengths[ 0 ] )->default_value( 1024 ), "Specify the length of the 1st dimension of a test array" )
+            ( "lenY,y", po::value< size_t >( &lengths[ 1 ] )->default_value( 1 ), "Specify the length of the 2nd dimension of a test array" )
+            ( "lenZ,z", po::value< size_t >( &lengths[ 2 ] )->default_value( 1 ), "Specify the length of the 3rd dimension of a test array" )
+            ( "isX", po::value< size_t >( &iStrides[ 0 ] )->default_value( 1 ), "Specify the input stride of the 1st dimension of a test array" )
+            ( "isY", po::value< size_t >( &iStrides[ 1 ] )->default_value( 0 ), "Specify the input stride of the 2nd dimension of a test array" )
+            ( "isZ", po::value< size_t >( &iStrides[ 2 ] )->default_value( 0 ), "Specify the input stride of the 3rd dimension of a test array" )
+            ( "iD", po::value< size_t >( &iStrides[ 3 ] )->default_value( 0 ), "input distance between subsequent sets of data when batch size > 1" )
+            ( "osX", po::value< size_t >( &oStrides[ 0 ] )->default_value( 1 ), "Specify the output stride of the 1st dimension of a test array" )
+            ( "osY", po::value< size_t >( &oStrides[ 1 ] )->default_value( 0 ), "Specify the output stride of the 2nd dimension of a test array" )
+            ( "osZ", po::value< size_t >( &oStrides[ 2 ] )->default_value( 0 ), "Specify the output stride of the 3rd dimension of a test array" )
+            ( "oD", po::value< size_t >( &oStrides[ 3 ] )->default_value( 0 ), "output distance between subsequent sets of data when batch size > 1" )
+            ( "batchSize,b", po::value< size_t >( &batchSize )->default_value( 1 ), "If this value is greater than one, arrays will be used " )
+            ( "profile,p", po::value< cl_uint >( &profile_count )->default_value( 1 ), "Time and report the kernel speed of the FFT (default: profiling off)" )
+            ( "inLayout", po::value< clfftLayout >( &inLayout )->default_value( CLFFT_COMPLEX_INTERLEAVED ), "Layout of input data:\n1) interleaved\n2) planar\n3) hermitian interleaved\n4) hermitian planar\n5) real" )
+            ( "outLayout", po::value< clfftLayout >( &outLayout )->default_value( CLFFT_COMPLEX_INTERLEAVED ), "Layout of output data:\n1) interleaved\n2) planar\n3) hermitian interleaved\n4) hermitian planar\n5) real" )
+            ;
+
+        po::variables_map vm;
+        po::store( po::parse_command_line( argc, argv, desc ), vm );
+        po::notify( vm );
+
+        if( vm.count( "help" ) )
+        {
+            std::cout << desc << std::endl;
+            return 0;
+        }
+
+        // Each device option contributes one bit; more than one bit set means conflicting options
+        size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0)
+            | ((vm.count( "cpu" ) > 0) ? 2 : 0)
+            | ((vm.count( "all" ) > 0) ? 4 : 0);
+        if ((mutex & (mutex-1)) != 0) {
+            terr << _T("You have selected mutually-exclusive OpenCL device options:") << std::endl;
+            if (vm.count ( "gpu" ) > 0) terr << _T(" gpu,g Force selection of OpenCL GPU devices only" ) << std::endl;
+            if (vm.count ( "cpu" ) > 0) terr << _T(" cpu,c Force selection of OpenCL CPU devices only" ) << std::endl;
+            if (vm.count ( "all" ) > 0) terr << _T(" all,a Force selection of all OpenCL devices (default)" ) << std::endl;
+            return 1;
+        }
+
+        if( vm.count( "gpu" ) )
+        {
+            deviceType = CL_DEVICE_TYPE_GPU;
+        }
+
+        if( vm.count( "cpu" ) )
+        {
+            deviceType = CL_DEVICE_TYPE_CPU;
+        }
+
+        if( vm.count( "all" ) )
+        {
+            deviceType = CL_DEVICE_TYPE_ALL;
+        }
+
+        // The following three switches are advertised in the help text and must be honoured
+        if( vm.count( "outPlace" ) )
+        {
+            place = CLFFT_OUTOFPLACE;
+        }
+
+        if( vm.count( "double" ) )
+        {
+            precision = CLFFT_DOUBLE;
+        }
+
+        if( vm.count( "inv" ) )
+        {
+            dir = CLFFT_BACKWARD;
+        }
+
+        if( vm.count( "dumpKernels" ) )
+        {
+            setupData->debugFlags |= CLFFT_DUMP_PROGRAMS;
+        }
+
+        if( vm.count( "noprecall" ) )
+        {
+            hasPrecallback = false;
+        }
+
+        // The sample callbacks (MULVAL / MULVAL_PLANAR) are written in terms of float and float2,
+        // so they can only be attached to single precision plans
+        if( hasPrecallback && (precision != CLFFT_SINGLE) )
+        {
+            terr << _T("Double precision precallback case not yet implemented; use --noprecall" ) << std::endl;
+            return 1;
+        }
+
+        int inL = (int)inLayout;
+        int otL = (int)outLayout;
+
+        // input output layout support matrix
+        int ioLayoutSupport[5][5] = {
+            { 1, 1, 0, 0, 1 },
+            { 1, 1, 0, 0, 1 },
+            { 0, 0, 0, 0, 1 },
+            { 0, 0, 0, 0, 1 },
+            { 1, 1, 1, 1, 0 },
+        };
+
+        if((inL < 1) || (inL > 5)) throw std::runtime_error( "Invalid Input layout format" );
+        if((otL < 1) || (otL > 5)) throw std::runtime_error( "Invalid Output layout format" );
+
+        if(ioLayoutSupport[inL-1][otL-1] == 0) throw std::runtime_error( "Invalid combination of Input/Output layout formats" );
+
+        if( ((inL == 1) || (inL == 2)) && ((otL == 1) || (otL == 2)) ) // Complex-Complex cases
+        {
+            // A stride of zero means "not specified": derive a packed stride from the previous dimension
+            iStrides[1] = iStrides[1] ? iStrides[1] : lengths[0] * iStrides[0];
+            iStrides[2] = iStrides[2] ? iStrides[2] : lengths[1] * iStrides[1];
+            iStrides[3] = iStrides[3] ? iStrides[3] : lengths[2] * iStrides[2];
+
+            if(place == CLFFT_INPLACE)
+            {
+                oStrides[0] = iStrides[0];
+                oStrides[1] = iStrides[1];
+                oStrides[2] = iStrides[2];
+                oStrides[3] = iStrides[3];
+            }
+            else
+            {
+                oStrides[1] = oStrides[1] ? oStrides[1] : lengths[0] * oStrides[0];
+                oStrides[2] = oStrides[2] ? oStrides[2] : lengths[1] * oStrides[1];
+                oStrides[3] = oStrides[3] ? oStrides[3] : lengths[2] * oStrides[2];
+            }
+        }
+        else
+        {
+            terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
+            return 1;
+        }
+
+        // Forward transform()'s status so that a failure it reports becomes the process exit code
+        if( precision == CLFFT_SINGLE )
+            return transform<float>( lengths, iStrides, oStrides, batchSize, inLayout, outLayout, place, precision, dir, deviceType, deviceId, platformId, false, command_queue_flags, profile_count, setupData, hasPrecallback );
+        else
+            return transform<double>( lengths, iStrides, oStrides, batchSize, inLayout, outLayout, place, precision, dir, deviceType, deviceId, platformId, false, command_queue_flags, profile_count, setupData, hasPrecallback );
+    }
+    catch( std::exception& e )
+    {
+        terr << _T( "clFFT error condition reported:" ) << std::endl << e.what() << std::endl;
+        return 1;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/src/client-callback/client.h b/src/client-callback/client.h
new file mode 100644
index 0000000..a1e100d
--- /dev/null
+++ b/src/client-callback/client.h
@@ -0,0 +1,70 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( CLIENT_H )
+#define CLIENT_H
+
+// Boost headers that we want to use
+// #define BOOST_PROGRAM_OPTIONS_DYN_LINK
+#include <boost/program_options.hpp>
+
+#define CALLBCKSTR(...) #__VA_ARGS__
+#define STRINGIFY(...) CALLBCKSTR(__VA_ARGS__)
+
+#ifdef WIN32
+
+// Wall-clock stopwatch built on the Win32 performance counter.
+struct Timer
+{
+    LARGE_INTEGER start, stop, freq;
+
+public:
+    // Queries the counter frequency once; Sample() divides by it to obtain seconds.
+    Timer() { QueryPerformanceFrequency( &freq ); }
+
+    // Records the starting counter value.
+    void Start() { QueryPerformanceCounter(&start); }
+
+    // Returns the time in seconds between the last Start() and this call.
+    double Sample()
+    {
+        QueryPerformanceCounter ( &stop );
+        double time = (double)(stop.QuadPart-start.QuadPart) / (double)(freq.QuadPart);
+        return time;
+    }
+};
+
+#else
+
+#include <time.h>
+#include <math.h>
+
+// Wall-clock stopwatch built on clock_gettime( CLOCK_MONOTONIC ).
+struct Timer
+{
+    struct timespec start, end;
+
+public:
+    Timer() { }
+
+    // Records the starting time stamp.
+    void Start() { clock_gettime(CLOCK_MONOTONIC, &start); }
+
+    // Returns the time in seconds between the last Start() and this call.
+    double Sample()
+    {
+        clock_gettime(CLOCK_MONOTONIC, &end);
+        // Work in double from the start: scaling the seconds difference by 1000000000L in
+        // integer arithmetic overflows after about two seconds where long is 32 bits wide.
+        double seconds = (double)(end.tv_sec - start.tv_sec);
+        double nanoseconds = (double)(end.tv_nsec - start.tv_nsec);
+        return seconds + nanoseconds * 1E-9;
+    }
+};
+
+#endif
+
+#endif
diff --git a/src/client-callback/openCL.misc.cpp b/src/client-callback/openCL.misc.cpp
new file mode 100644
index 0000000..cb5db29
--- /dev/null
+++ b/src/client-callback/openCL.misc.cpp
@@ -0,0 +1,536 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// clfft.opencl.cpp : Provides functions to set up openCL
+//
+
+#include "stdafx.h"
+#include <stdexcept>
+#include <iomanip>
+#include <sstream>
+#include <cstring>
+#include <vector>
+#include "clFFT.h"
+#include "openCL.misc.h"
+
+
+
+void prettyPrintPlatformInfo( const cl_platform_id& pId ) // Writes the profile, version, name, vendor and extensions strings of pId to std::cout
+{
+ size_t platformProfileSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, 0, NULL, &platformProfileSize ),
+ "Getting CL_PLATFORM_PROFILE Platform Info string size ( ::clGetPlatformInfo() )" );
+ // Every string takes two calls: the one above asks only for the size, the one below fills a buffer of that size
+ std::vector< char > szPlatformProfile( platformProfileSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, platformProfileSize, &szPlatformProfile[ 0 ], NULL),
+ "Getting CL_PLATFORM_PROFILE Platform Info string ( ::clGetPlatformInfo() )" );
+ // A failing query makes OPENCL_V_THROW raise std::runtime_error; that macro reports __LINE__, so the line of each call shows up in the error text
+ size_t platformVersionSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, 0, NULL, &platformVersionSize ),
+ "Getting CL_PLATFORM_VERSION Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformVersion( platformVersionSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, platformVersionSize, &szPlatformVersion[ 0 ], NULL),
+ "Getting CL_PLATFORM_VERSION Platform Info string ( ::clGetPlatformInfo() )" );
+
+ size_t platformNameSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, 0, NULL, &platformNameSize ),
+ "Getting CL_PLATFORM_NAME Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformName( platformNameSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, platformNameSize, &szPlatformName[ 0 ], NULL),
+ "Getting CL_PLATFORM_NAME Platform Info string ( ::clGetPlatformInfo() )" );
+
+ size_t vendorStringSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, 0, NULL, &vendorStringSize ),
+ "Getting CL_PLATFORM_VENDOR Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformVendor( vendorStringSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, vendorStringSize, &szPlatformVendor[ 0 ], NULL),
+ "Getting CL_PLATFORM_VENDOR Platform Info string ( ::clGetPlatformInfo() )" );
+
+ size_t platformExtensionsSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, 0, NULL, &platformExtensionsSize ),
+ "Getting CL_PLATFORM_EXTENSIONS Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformExtensions( platformExtensionsSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, platformExtensionsSize, &szPlatformExtensions[ 0 ], NULL),
+ "Getting CL_PLATFORM_EXTENSIONS Platform Info string ( ::clGetPlatformInfo() )" );
+ // Nothing is printed until every query has succeeded; the field width is taken from the longest label
+ const int indent = countOf( " CL_PLATFORM_EXTENSIONS: " );
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_PROFILE: " << &szPlatformProfile[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_VERSION: " << &szPlatformVersion[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_NAME: " << &szPlatformName[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_VENDOR: " << &szPlatformVendor[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_EXTENSIONS: " << &szPlatformExtensions[ 0 ] << std::endl;
+ std::cout << std::right << std::endl; // switch the stream to right alignment and finish with an empty line
+}
+
+void prettyPrintDeviceInfo( const cl_device_id& dId ) // Queries a fixed set of properties of dId and writes them to std::cout
+{
+ size_t deviceNameSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, 0, NULL, &deviceNameSize ),
+ "Getting CL_DEVICE_NAME Device Info string size ( ::clGetDeviceInfo() )" );
+ // String properties take two calls: the first asks only for the size, the second fills a buffer of that size
+ std::vector< char > szDeviceName( deviceNameSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, deviceNameSize, &szDeviceName[ 0 ], NULL ),
+ "Getting CL_DEVICE_NAME Device Info string ( ::clGetDeviceInfo() )" );
+ // A failing query makes OPENCL_V_THROW raise std::runtime_error; the macro reports __LINE__, so the calls keep their lines
+ size_t deviceVersionSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
+ "Getting CL_DEVICE_VERSION Device Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szDeviceVersion( deviceVersionSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
+ "Getting CL_DEVICE_VERSION Device Info string ( ::clGetDeviceInfo() )" );
+
+ size_t driverVersionSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, 0, NULL, &driverVersionSize ),
+ "Getting CL_DRIVER_VERSION Device Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szDriverVersion( driverVersionSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, driverVersionSize, &szDriverVersion[ 0 ], NULL ),
+ "Getting CL_DRIVER_VERSION Device Info string ( ::clGetDeviceInfo() )" );
+
+ size_t openCLVersionSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &openCLVersionSize ),
+ "Getting CL_DEVICE_OPENCL_C_VERSION Device Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szOpenCLVersion( openCLVersionSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, openCLVersionSize, &szOpenCLVersion[ 0 ], NULL ),
+ "Getting CL_DEVICE_OPENCL_C_VERSION Device Info string ( ::clGetDeviceInfo() )" );
+ // Fixed-size properties are read with a single call into a variable of the matching type
+ cl_device_type devType = CL_DEVICE_TYPE_DEFAULT;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_TYPE, sizeof( cl_device_type ), &devType, NULL ),
+ "Getting CL_DEVICE_TYPE device info ( ::clGetDeviceInfo() )" );
+
+ cl_uint devAddrBits = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_ADDRESS_BITS, sizeof( cl_uint ), &devAddrBits, NULL ),
+ "Getting CL_DEVICE_ADDRESS_BITS device info ( ::clGetDeviceInfo() )" );
+
+ cl_uint maxClockFreq = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof( cl_uint ), &maxClockFreq, NULL ),
+ "Getting CL_DEVICE_MAX_CLOCK_FREQUENCY device info ( ::clGetDeviceInfo() )" );
+
+ cl_bool devAvailable = CL_FALSE;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_AVAILABLE, sizeof( cl_bool ), &devAvailable, NULL ),
+ "Getting CL_DEVICE_AVAILABLE device info ( ::clGetDeviceInfo() )" );
+
+ cl_bool devCompAvailable = CL_FALSE;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_COMPILER_AVAILABLE, sizeof( cl_bool ), &devCompAvailable, NULL ),
+ "Getting CL_DEVICE_COMPILER_AVAILABLE device info ( ::clGetDeviceInfo() )" );
+
+ size_t devMaxWorkGroup = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( size_t ), &devMaxWorkGroup, NULL ),
+ "Getting CL_DEVICE_MAX_WORK_GROUP_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ cl_uint devMaxWorkItemDim = 0; // a count, so it starts at 0 rather than at the boolean CL_FALSE
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( cl_uint ), &devMaxWorkItemDim, NULL ),
+ "Getting CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS device info ( ::clGetDeviceInfo() )" );
+ // One size_t per dimension reported above
+ std::vector< size_t > devMaxWorkItemSizes( devMaxWorkItemDim );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof( size_t )*devMaxWorkItemSizes.size( ), &devMaxWorkItemSizes[0], NULL),
+ "Getting CL_DEVICE_MAX_WORK_ITEM_SIZES device info ( ::clGetDeviceInfo() )" );
+
+ cl_bool deviceHostUnified = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof( cl_bool ), &deviceHostUnified, NULL ),
+ "Getting CL_DEVICE_HOST_UNIFIED_MEMORY device info ( ::clGetDeviceInfo() )" );
+
+ cl_ulong devMaxConstantBuffer = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( cl_ulong ), &devMaxConstantBuffer, NULL ),
+ "Getting CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ cl_ulong devLocalMemSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( cl_ulong ), &devLocalMemSize, NULL ),
+ "Getting CL_DEVICE_LOCAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ cl_ulong deviceGlobalMemSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( cl_ulong ), &deviceGlobalMemSize, NULL ),
+ "Getting CL_DEVICE_GLOBAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ cl_ulong deviceMaxMemAllocSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( cl_ulong ), &deviceMaxMemAllocSize, NULL ),
+ "Getting CL_DEVICE_MAX_MEM_ALLOC_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ size_t deviceExtSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
+ "Getting CL_DEVICE_EXTENSIONS Device Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szDeviceExt( deviceExtSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
+ "Getting CL_DEVICE_EXTENSIONS Device Info string ( ::clGetDeviceInfo() )" );
+ // Nothing is printed until every query has succeeded; the field width is taken from the longest label
+ const int indent = countOf( " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " );
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_NAME: " << &szDeviceName[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_VERSION: " << &szDeviceVersion[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DRIVER_VERSION: " << &szDriverVersion[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_TYPE: "
+ << (CL_DEVICE_TYPE_DEFAULT & devType ? "default" : "")
+ << (CL_DEVICE_TYPE_CPU & devType ? "CPU" : "")
+ << (CL_DEVICE_TYPE_GPU & devType ? "GPU" : "")
+ << (CL_DEVICE_TYPE_ACCELERATOR & devType ? "Accelerator" : "")
+ << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_CLOCK_FREQUENCY: " << maxClockFreq << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_ADDRESS_BITS: " << devAddrBits << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_AVAILABLE: " << ( devAvailable ? "TRUE": "FALSE") << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_COMPILER_AVAILABLE: " << ( devCompAvailable ? "TRUE": "FALSE") << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_OPENCL_C_VERSION: " << &szOpenCLVersion[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_WORK_GROUP_SIZE: " << devMaxWorkGroup << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " << devMaxWorkItemDim << std::endl;
+ for( cl_uint wis = 0; wis < devMaxWorkItemSizes.size( ); ++wis )
+ {
+ std::stringstream dimString;
+ dimString << "Dimension[ " << wis << " ] ";
+ std::cout << std::right << std::setw( indent ) << dimString.str( ) << devMaxWorkItemSizes[wis] << std::endl;
+ }
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_HOST_UNIFIED_MEMORY: " << ( deviceHostUnified ? "TRUE": "FALSE") << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: " << devMaxConstantBuffer;
+ std::cout << " ( " << devMaxConstantBuffer / 1024 << " KB )" << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_LOCAL_MEM_SIZE: " << devLocalMemSize;
+ std::cout << " ( " << devLocalMemSize / 1024 << " KB )" << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_GLOBAL_MEM_SIZE: " << deviceGlobalMemSize;
+ std::cout << " ( " << deviceGlobalMemSize / 1048576 << " MB )" << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_MEM_ALLOC_SIZE: " << deviceMaxMemAllocSize;
+ std::cout << " ( " << deviceMaxMemAllocSize / 1048576 << " MB )" << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_EXTENSIONS: " << &szDeviceExt[ 0 ] << std::endl;
+
+ std::cout << std::right << std::endl;
+}
+
+void prettyPrintCLPlatforms(std::vector< cl_platform_id >& platforms,
+ std::vector< std::vector< cl_device_id > >& devices)
+{
+ // Report each platform under its index, followed by the devices stored at the same index of 'devices'
+ for (unsigned int platIdx = 0; platIdx < platforms.size(); ++platIdx)
+ {
+ std::cout << "OpenCL platform [ " << platIdx << " ]:" << std::endl;
+ prettyPrintPlatformInfo(platforms[platIdx]);
+ const std::vector< cl_device_id >& platDevices = devices[platIdx];
+ for (unsigned int devIdx = 0; devIdx < platDevices.size(); ++devIdx)
+ {
+ std::cout << "OpenCL platform [ " << platIdx << " ], device [ " << devIdx << " ]:" << std::endl;
+ prettyPrintDeviceInfo(platDevices[devIdx]);
+ }
+ }
+}
+
+// Turn an OpenCL status code into a failure flag:
+// CL_FALSE for CL_SUCCESS, CL_TRUE for every other code
+inline cl_bool OPENCL_V_FAIL( cl_int res )
+{
+ const bool failed = ( res != CL_SUCCESS );
+
+ return failed ? CL_TRUE : CL_FALSE;
+}
+
+std::string prettyPrintclFFTStatus( const cl_int& status ) // Returns the spelling of the CLFFT_* constant that equals status
+{
+ switch( status )
+ {
+ case CLFFT_INVALID_GLOBAL_WORK_SIZE:
+ return "CLFFT_INVALID_GLOBAL_WORK_SIZE";
+ case CLFFT_INVALID_MIP_LEVEL:
+ return "CLFFT_INVALID_MIP_LEVEL";
+ case CLFFT_INVALID_BUFFER_SIZE:
+ return "CLFFT_INVALID_BUFFER_SIZE";
+ case CLFFT_INVALID_GL_OBJECT:
+ return "CLFFT_INVALID_GL_OBJECT";
+ case CLFFT_INVALID_OPERATION:
+ return "CLFFT_INVALID_OPERATION";
+ case CLFFT_INVALID_EVENT:
+ return "CLFFT_INVALID_EVENT";
+ case CLFFT_INVALID_EVENT_WAIT_LIST:
+ return "CLFFT_INVALID_EVENT_WAIT_LIST";
+ case CLFFT_INVALID_GLOBAL_OFFSET:
+ return "CLFFT_INVALID_GLOBAL_OFFSET";
+ case CLFFT_INVALID_WORK_ITEM_SIZE:
+ return "CLFFT_INVALID_WORK_ITEM_SIZE";
+ case CLFFT_INVALID_WORK_GROUP_SIZE:
+ return "CLFFT_INVALID_WORK_GROUP_SIZE";
+ case CLFFT_INVALID_WORK_DIMENSION:
+ return "CLFFT_INVALID_WORK_DIMENSION";
+ case CLFFT_INVALID_KERNEL_ARGS:
+ return "CLFFT_INVALID_KERNEL_ARGS";
+ case CLFFT_INVALID_ARG_SIZE:
+ return "CLFFT_INVALID_ARG_SIZE";
+ case CLFFT_INVALID_ARG_VALUE:
+ return "CLFFT_INVALID_ARG_VALUE";
+ case CLFFT_INVALID_ARG_INDEX:
+ return "CLFFT_INVALID_ARG_INDEX";
+ case CLFFT_INVALID_KERNEL:
+ return "CLFFT_INVALID_KERNEL";
+ case CLFFT_INVALID_KERNEL_DEFINITION:
+ return "CLFFT_INVALID_KERNEL_DEFINITION";
+ case CLFFT_INVALID_KERNEL_NAME:
+ return "CLFFT_INVALID_KERNEL_NAME";
+ case CLFFT_INVALID_PROGRAM_EXECUTABLE:
+ return "CLFFT_INVALID_PROGRAM_EXECUTABLE";
+ case CLFFT_INVALID_PROGRAM:
+ return "CLFFT_INVALID_PROGRAM";
+ case CLFFT_INVALID_BUILD_OPTIONS:
+ return "CLFFT_INVALID_BUILD_OPTIONS";
+ case CLFFT_INVALID_BINARY:
+ return "CLFFT_INVALID_BINARY";
+ case CLFFT_INVALID_SAMPLER:
+ return "CLFFT_INVALID_SAMPLER";
+ case CLFFT_INVALID_IMAGE_SIZE:
+ return "CLFFT_INVALID_IMAGE_SIZE";
+ case CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+ return "CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+ case CLFFT_INVALID_MEM_OBJECT:
+ return "CLFFT_INVALID_MEM_OBJECT";
+ case CLFFT_INVALID_HOST_PTR:
+ return "CLFFT_INVALID_HOST_PTR";
+ case CLFFT_INVALID_COMMAND_QUEUE:
+ return "CLFFT_INVALID_COMMAND_QUEUE";
+ case CLFFT_INVALID_QUEUE_PROPERTIES:
+ return "CLFFT_INVALID_QUEUE_PROPERTIES";
+ case CLFFT_INVALID_CONTEXT:
+ return "CLFFT_INVALID_CONTEXT";
+ case CLFFT_INVALID_DEVICE:
+ return "CLFFT_INVALID_DEVICE";
+ case CLFFT_INVALID_PLATFORM:
+ return "CLFFT_INVALID_PLATFORM";
+ case CLFFT_INVALID_DEVICE_TYPE:
+ return "CLFFT_INVALID_DEVICE_TYPE";
+ case CLFFT_INVALID_VALUE:
+ return "CLFFT_INVALID_VALUE";
+ case CLFFT_MAP_FAILURE:
+ return "CLFFT_MAP_FAILURE";
+ case CLFFT_BUILD_PROGRAM_FAILURE:
+ return "CLFFT_BUILD_PROGRAM_FAILURE";
+ case CLFFT_IMAGE_FORMAT_NOT_SUPPORTED:
+ return "CLFFT_IMAGE_FORMAT_NOT_SUPPORTED";
+ case CLFFT_IMAGE_FORMAT_MISMATCH:
+ return "CLFFT_IMAGE_FORMAT_MISMATCH";
+ case CLFFT_MEM_COPY_OVERLAP:
+ return "CLFFT_MEM_COPY_OVERLAP";
+ case CLFFT_PROFILING_INFO_NOT_AVAILABLE:
+ return "CLFFT_PROFILING_INFO_NOT_AVAILABLE";
+ case CLFFT_OUT_OF_HOST_MEMORY:
+ return "CLFFT_OUT_OF_HOST_MEMORY";
+ case CLFFT_OUT_OF_RESOURCES:
+ return "CLFFT_OUT_OF_RESOURCES";
+ case CLFFT_MEM_OBJECT_ALLOCATION_FAILURE:
+ return "CLFFT_MEM_OBJECT_ALLOCATION_FAILURE";
+ case CLFFT_COMPILER_NOT_AVAILABLE:
+ return "CLFFT_COMPILER_NOT_AVAILABLE";
+ case CLFFT_DEVICE_NOT_AVAILABLE:
+ return "CLFFT_DEVICE_NOT_AVAILABLE";
+ case CLFFT_DEVICE_NOT_FOUND:
+ return "CLFFT_DEVICE_NOT_FOUND";
+ case CLFFT_SUCCESS:
+ return "CLFFT_SUCCESS";
+ case CLFFT_NOTIMPLEMENTED:
+ return "CLFFT_NOTIMPLEMENTED";
+ case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
+ return "CLFFT_TRANSPOSED_NOTIMPLEMENTED";
+ case CLFFT_FILE_NOT_FOUND:
+ return "CLFFT_FILE_NOT_FOUND";
+ case CLFFT_FILE_CREATE_FAILURE:
+ return "CLFFT_FILE_CREATE_FAILURE";
+ case CLFFT_VERSION_MISMATCH:
+ return "CLFFT_VERSION_MISMATCH";
+ case CLFFT_INVALID_PLAN:
+ return "CLFFT_INVALID_PLAN";
+ default: // any value without a case above
+ return "Error code not defined";
+ break; // never reached: the return above already left the function
+ }
+}
+
+
+int discoverCLPlatforms( cl_device_type deviceType,
+ std::vector< cl_platform_id >& platforms,
+ std::vector< std::vector< cl_device_id > >& devices )
+{
+ platforms.clear( ); // start from empty outputs so that nothing the caller left in the vectors survives
+ devices.clear( );
+ /*
+ * Find all OpenCL platforms this system has to offer.
+ */
+ // On return devices[ i ] lists the devices of platforms[ i ] that match deviceType (possibly none); the result is always 0
+ cl_uint numPlatforms = 0;
+ // A failing platform query throws std::runtime_error through OPENCL_V_THROW
+ OPENCL_V_THROW(::clGetPlatformIDs(0, NULL, &numPlatforms),
+ "Getting number of platforms( ::clGetPlatformIDs() )");
+
+ if (numPlatforms > 0)
+ {
+ platforms.resize( numPlatforms );
+ devices.resize( numPlatforms );
+ OPENCL_V_THROW(::clGetPlatformIDs(numPlatforms, &platforms[0], NULL),
+ "Getting Platform Id's ( ::clGetPlatformIDs() )");
+
+ if (NULL == platforms[0])
+ {
+ throw std::runtime_error("No appropriate OpenCL platform could be found");
+ }
+
+ /*
+ * Now, for each platform get all available devices matching deviceType.
+ */
+ for (unsigned int i = 0; i < numPlatforms; ++i)
+ {
+ // Get the device list for deviceType.
+ // A failure of this count query is only reported, never thrown; numDevices then stays 0
+ cl_uint numDevices = 0;
+ OPENCL_V_WARN(::clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices),
+ "Getting OpenCL devices ( ::clGetDeviceIDs() )");
+ if (0 == numDevices)
+ {
+ // OPENCL_V_WARN(CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
+ continue;
+ }
+
+ devices[i].resize(numDevices);
+ OPENCL_V_THROW(::clGetDeviceIDs(platforms[i], deviceType, numDevices, &(devices[i])[0], NULL),
+ "Getting OpenCL deviceIDs ( ::clGetDeviceIDs() )");
+ }
+ }
+
+ return 0;
+}
+
+std::vector< cl_device_id > initializeCL( cl_device_type deviceType,
+ cl_int deviceId,
+ cl_int platformId,
+ cl_context& context,
+ bool printclInfo )
+{
+ cl_int status = 0;
+ cl_platform_id platform = NULL;
+ std::vector< cl_device_id > devices(1);
+ devices[0] = NULL;
+
+ // Have a look at all the available platforms on this system
+ std::vector< cl_platform_id > platformInfos;
+ std::vector< std::vector< cl_device_id > > deviceInfos;
+ discoverCLPlatforms( deviceType, platformInfos, deviceInfos );
+ // Returns a one-element vector holding the selected device and stores the new context in 'context'.
+ // platformId and deviceId are positions in the discovery results of deviceType.
+ // They select directly into those results; a negative or out-of-range value selects
+ // nothing and is reported by the checks further down
+ const bool platformInRange = ( platformId >= 0 ) &&
+ ( static_cast< size_t >( platformId ) < platformInfos.size( ) );
+
+ if( platformInRange )
+ {
+ const std::vector< cl_device_id >& candidates = deviceInfos[ platformId ];
+
+ if( ( deviceId >= 0 ) && ( static_cast< size_t >( deviceId ) < candidates.size( ) ) )
+ {
+ platform = platformInfos[ platformId ];
+ devices[ 0 ] = candidates[ deviceId ];
+
+ if( printclInfo )
+ {
+ prettyPrintPlatformInfo( platform );
+ prettyPrintDeviceInfo( devices[ 0 ] );
+ }
+ }
+ }
+
+
+
+
+
+
+ // Do some error checking if we really selected a valid platform and a valid device
+ if (NULL == devices[0])
+ {
+ OPENCL_V_THROW(CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
+ }
+
+ if (NULL == platform)
+ {
+ throw std::runtime_error("No appropriate OpenCL platform could be found");
+ }
+
+ // Create an OpenCL context
+ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0 };
+ context = clCreateContext(cps,
+ (cl_uint)devices.size(),
+ &devices[0],
+ NULL,
+ NULL,
+ &status);
+ OPENCL_V_THROW(status, "Creating Context ( ::clCreateContext() )");
+
+ return devices;
+}
+
+int cleanupCL( cl_context* context, cl_command_queue* commandQueue,
+ const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[], cl_event* outEvent )
+{ // Releases, in this order: the event, both buffer arrays, the command queue, the context; returns 0
+ if( *outEvent != NULL )
+ OPENCL_V_THROW( clReleaseEvent( *outEvent ), "Error: In clReleaseEvent\n" );
+ // NOTE(review): a failing release throws, so the releases after it are skipped - confirm this is acceptable
+ releaseOpenCLMemBuffer( numBuffersIn, inputBuffer);
+ releaseOpenCLMemBuffer( numBuffersOut, outputBuffer);
+ // context, commandQueue and outEvent are dereferenced unconditionally, so the pointers themselves must not be NULL
+ if( *commandQueue != NULL )
+ OPENCL_V_THROW( clReleaseCommandQueue( *commandQueue ), "Error: In clReleaseCommandQueue\n" );
+
+ if( *context != NULL )
+ OPENCL_V_THROW( clReleaseContext( *context ), "Error: In clReleaseContext\n" );
+
+ return 0;
+}
+
+int createOpenCLMemoryBuffer( cl_context& context, const size_t bufferSizeBytes, const cl_uint numBuffers, cl_mem buffer[], cl_mem_flags accessibility) {
+ cl_int status = 0;
+ // Fills buffer[ 0 .. numBuffers-1 ] with new buffers of bufferSizeBytes each, created with the 'accessibility' flags and no host pointer
+ for( cl_uint i = 0; i < numBuffers; ++i )
+ {
+ buffer[ i ] = ::clCreateBuffer( context, accessibility, bufferSizeBytes, NULL, &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" ); // throws on the first failure; buffers created before it are not released here
+ }
+
+ return 0; // always 0: failures leave through the exception above
+}
+
+int releaseOpenCLMemBuffer( const cl_uint numBuffers, cl_mem buffer[])
+{ // Releases every non-NULL entry of buffer[ 0 .. numBuffers-1 ]; returns 0
+ for( cl_uint i = 0; i < numBuffers; ++i )
+ {
+ if( buffer[ i ] != NULL ) // NULL entries are skipped; released entries keep their old handle value
+ OPENCL_V_THROW( clReleaseMemObject( buffer[ i ] ), "Error: In clReleaseMemObject\n" );
+ }
+
+ return 0;
+}
+
+void createOpenCLCommandQueue( cl_context& context,
+ cl_uint commandQueueFlags,
+ cl_command_queue& commandQueue,
+ std::vector< cl_device_id > devices,
+ const size_t bufferSizeBytesIn,
+ const cl_uint numBuffersIn,
+ cl_mem clMemBufferIn[],
+ const size_t bufferSizeBytesOut,
+ const cl_uint numBuffersOut,
+ cl_mem clMemBufferOut[] )
+{ // Creates one command queue on devices[ 0 ], then the input and the output buffer arrays
+ cl_int status = 0;
+ commandQueue = ::clCreateCommandQueue( context, devices[0], commandQueueFlags, &status ); // only the first device is used; 'devices' must not be empty
+ OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" );
+ // Both buffer sets are created with CL_MEM_READ_WRITE
+ createOpenCLMemoryBuffer( context, bufferSizeBytesIn, numBuffersIn, clMemBufferIn, CL_MEM_READ_WRITE);
+ createOpenCLMemoryBuffer( context, bufferSizeBytesOut, numBuffersOut, clMemBufferOut, CL_MEM_READ_WRITE);
+}
+
diff --git a/src/client-callback/openCL.misc.h b/src/client-callback/openCL.misc.h
new file mode 100644
index 0000000..07cc416
--- /dev/null
+++ b/src/client-callback/openCL.misc.h
@@ -0,0 +1,151 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( OPENCL_MISC_H )
+#define OPENCL_MISC_H
+#include <memory>
+#include <stdexcept>
+#include "unicode.compatibility.h"
+
+// Creating a portable definition of countof: number of elements of a true array (it must not be applied to a pointer)
+#if defined( _MSC_VER )
+ #define countOf _countof
+#else
+ #define countOf( arr ) ( sizeof( arr ) / sizeof( ( arr )[ 0 ] ) )
+#endif
+
+/*
+ * \brief OpenCL platform and device discovery
+ * Creates a list of OpenCL platforms
+ * and their associated devices
+ */
+int discoverCLPlatforms( cl_device_type deviceType,
+ std::vector< cl_platform_id >& platforms,
+ std::vector< std::vector< cl_device_id > >& devices );
+
+void prettyPrintCLPlatforms(std::vector< cl_platform_id >& platforms,
+ std::vector< std::vector< cl_device_id > >& devices);
+
+/*
+ * \brief OpenCL related initialization
+ * Discover the available platforms and devices
+ * Select the platform and device with the requested indices
+ * Create Context, Device list
+ */
+std::vector< cl_device_id > initializeCL( cl_device_type deviceType,
+ cl_int deviceId,
+ cl_int platformId,
+ cl_context& context,
+ bool printclInfo );
+
+/*
+ * \brief OpenCL memory buffer creation
+ */
+int createOpenCLMemoryBuffer(
+ cl_context& context,
+ const size_t bufferSizeBytes,
+ const cl_uint numBuffers,
+ cl_mem buffer[],
+ cl_mem_flags accessibility
+ );
+
+/*
+ * \brief OpenCL command queue creation
+ * Create Command Queue
+ * Create OpenCL memory buffer objects
+ */
+void createOpenCLCommandQueue( cl_context& context,
+ cl_uint commandQueueFlags,
+ cl_command_queue& commandQueue,
+ std::vector< cl_device_id > devices,
+ const size_t bufferSizeBytesIn,
+ const cl_uint numBuffersIn,
+ cl_mem clMemBufferIn[],
+ const size_t bufferSizeBytesOut,
+ const cl_uint numBuffersOut,
+ cl_mem clMemBufferOut[] );
+
+/*
+ * \brief release OpenCL memory buffer
+ */
+int releaseOpenCLMemBuffer( const cl_uint numBuffers, cl_mem buffer[] );
+
+std::string prettyPrintclFFTStatus( const cl_int& status );
+
+// Status checker behind OPENCL_V_THROW. It can wrap an OpenCL call directly or inspect a status value
+// obtained earlier. CL_SUCCESS is returned to the caller; any other code is formatted, echoed to
+// std::cout and raised as std::runtime_error.
+// Note: std::runtime_error does not take unicode strings as input, so only strings supported
+inline cl_int OpenCL_V_Throw ( cl_int res, const std::string& msg, size_t lineno )
+{
+ if( res == CL_SUCCESS )
+ {
+ return res;
+ }
+
+ // Assemble "OPENCL_V_THROWERROR< <status name> > (<line>): <message>"
+ std::stringstream report;
+ report << "OPENCL_V_THROWERROR< "
+ << prettyPrintclFFTStatus( res )
+ << " > ("
+ << lineno
+ << "): "
+ << msg;
+
+ // The same text is printed and carried by the exception
+ const std::string errorText( report.str( ) );
+ std::cout << errorText << std::endl;
+ throw std::runtime_error( errorText );
+}
+// lineno is filled in from __LINE__ at the place where the macro is used
+#define OPENCL_V_THROW(_status,_message) OpenCL_V_Throw (_status, _message, __LINE__)
+
+inline cl_int OpenCL_V_Warn(cl_int res, const std::string& msg, size_t lineno)
+{
+ // Non-throwing status checker behind OPENCL_V_WARN: prints a report for unexpected codes
+ // and hands the status back unchanged in every case.
+ // CL_SUCCESS needs no report. CL_DEVICE_NOT_FOUND happens all the time when discovering
+ // the OpenCL capabilities of the system, so it is kept silent as well.
+ const bool expected = ( res == CL_SUCCESS ) || ( res == CL_DEVICE_NOT_FOUND );
+
+ if( !expected )
+ {
+ // Assemble "OPENCL_V_WARN< <status name> > (<line>): <message>"
+ std::stringstream report;
+ report << "OPENCL_V_WARN< "
+ << prettyPrintclFFTStatus(res)
+ << " > ("
+ << lineno
+ << "): "
+ << msg;
+
+ const std::string warningText(report.str());
+ std::cout << warningText << std::endl;
+ }
+
+ return res;
+}
+// NOTE(review): the macro body ends in ';', so a use written with its own ';' expands to an extra empty statement
+#define OPENCL_V_WARN(_status,_message) OpenCL_V_Warn (_status, _message, __LINE__);
+
+/*
+ * \brief Release OpenCL resources (Context, Memory etc.)
+ */
+int cleanupCL( cl_context* context, cl_command_queue* commandQueue, const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[], cl_event* outEvent );
+
+#endif
diff --git a/src/client-callback/stdafx.cpp b/src/client-callback/stdafx.cpp
new file mode 100644
index 0000000..2587b2c
--- /dev/null
+++ b/src/client-callback/stdafx.cpp
@@ -0,0 +1,25 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// stdafx.cpp : source file that includes just the standard includes
+// clFFT.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+
+#include "stdafx.h"
+
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/src/include/clFFT.h b/src/include/clFFT.h
index 95ded9f..5de09b1 100644
--- a/src/include/clFFT.h
+++ b/src/include/clFFT.h
@@ -213,6 +213,14 @@ struct clfftSetupData_
};
typedef struct clfftSetupData_ clfftSetupData;
+/*! @brief Type of Callback function.
+*/
+typedef enum clFFTCallbackType_
+{
+ PRECALLBACK, /*!< Callback function will be invoked only once at the beginning of FFT transform for each point of input */
+ POSTCALLBACK /*!< Callback function will be invoked only once at the end of FFT transform for each point of output */
+}clFFTCallbackType;
+
/*! @brief An abstract handle to the object that represents the state of the FFT(s) */
typedef size_t clfftPlanHandle;
@@ -541,6 +549,22 @@ extern "C" {
*/
CLFFTAPI clfftStatus clfftGetTmpBufSize( const clfftPlanHandle plHandle, size_t* buffersize );
+ /*! @brief Register the callback parameters
+ * @details Client can provide a callback function to do custom processing when reading input data and/or
+ * when writing output data. The callback function is provided as a string.
+ * clFFT library incorporates the callback function string into the main FFT kernel. This function is used
+ * by client to set the necessary parameters for callback
+ * @param[in] plHandle Handle to a plan previously created
+ * @param[in] funcName Callback function name
+ * @param[in] funcString Callback function in string form
+ * @param[in] userStructString Optional - Custom data struct in string form used by Callback function. Pass NULL if the callback has no custom data type
+ * @param[in] localMemSize Optional - Local memory size if needed by callback. Pass 0 if local memory is not needed by callback
+ * @param[in] callbackType Type of callback - Pre-Callback or Post-Callback
+ * @param[in] userdata cl_mem object passed as parameter to callback function
+ */
+ CLFFTAPI clfftStatus clFFTSetPlanCallback(clfftPlanHandle plHandle, const char* funcName, const char* funcString, const char* userStructString, int localMemSize, clFFTCallbackType callbackType, void *userdata);
+
+
/*! @brief Enqueue an FFT transform operation, and return immediately (non-blocking)
* @details This transform API is the function that actually computes the FFT transfrom. It is non-blocking as it
* only enqueues the OpenCL kernels for execution. The synchronization step has to be managed by the user.
diff --git a/src/library/accessors.cpp b/src/library/accessors.cpp
index 4b5bb4f..c47075e 100644
--- a/src/library/accessors.cpp
+++ b/src/library/accessors.cpp
@@ -765,4 +765,39 @@ clfftStatus clfftLocalMemSize( const clfftPlanHandle plHandle, cl_ulong* local_m
*local_mem_size = plan->envelope.limit_LocalMemSize;
return CLFFT_SUCCESS;
+}
+
+clfftStatus clFFTSetPlanCallback(clfftPlanHandle plHandle, const char* funcName,
+ const char* funcString, const char* userStructString,
+ int localMemSize, clFFTCallbackType callbackType,
+ void *userdata)
+{
+ FFTRepo& fftRepo = FFTRepo::getInstance( );
+ FFTPlan* fftPlan = NULL;
+ lockRAII* planLock = NULL;
+ // Stores the pre-callback description on the plan behind plHandle, under the plan's lock
+ OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+ scopedLock sLock( *planLock, _T( "clFFTSetPlanCallback" ) );
+
+ // Only PRECALLBACK registers anything; any other callbackType leaves the plan untouched
+ if (callbackType != PRECALLBACK)
+ return CLFFT_SUCCESS;
+
+ // Pre-callbacks are limited to 1D plans reading complex (interleaved or planar) input
+ if (fftPlan->dim != CLFFT_1D || (fftPlan->inputLayout != CLFFT_COMPLEX_INTERLEAVED && fftPlan->inputLayout != CLFFT_COMPLEX_PLANAR))
+ return CLFFT_NOTIMPLEMENTED;
+
+ // Reject a missing name or source string instead of reporting success without registering anything
+ if (funcName == NULL || funcString == NULL)
+ return CLFFT_INVALID_ARG_VALUE;
+
+ fftPlan->hasPreCallback = true;
+ fftPlan->preCallback.funcname = funcName;
+ fftPlan->preCallback.funcstring = funcString;
+ fftPlan->preCallback.userdatastruct = userStructString; // NOTE(review): may be NULL; assumes the field accepts a null pointer - confirm against plan.h
+ fftPlan->preCallback.localMemSize = (localMemSize > 0) ? localMemSize : 0; // negative sizes are stored as 0
+ fftPlan->precallUserData = userdata;
+
+
+ return CLFFT_SUCCESS;
 }
\ No newline at end of file
diff --git a/src/library/action.cpp b/src/library/action.cpp
index e1506ff..194df1c 100644
--- a/src/library/action.cpp
+++ b/src/library/action.cpp
@@ -582,6 +582,19 @@ clfftStatus FFTAction::enqueue(clfftPlanHandle plHandle,
OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&outputBuff[o] ), _T( "clSetKernelArg failed" ) );
}
+ //If pre-callback function is set for the plan, pass the appropriate arguments
+ if (this->plan->hasPreCallback)
+ {
+ OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&this->plan->precallUserData ), _T( "clSetKernelArg failed" ) );
+
+ //Pass LDS size argument if set
+ if (this->plan->preCallback.localMemSize > 0)
+ {
+ //TODO: Check for available LDS beyond what FFT already uses
+ OPENCL_V( clSetKernelArg( kern, uarg++, this->plan->preCallback.localMemSize, NULL ), _T( "clSetKernelArg failed" ) );
+ }
+ }
+
std::vector< size_t > gWorkSize;
std::vector< size_t > lWorkSize;
clfftStatus result = this->getWorkSizes (gWorkSize, lWorkSize);
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 93d073d..5fd0cd9 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -612,6 +612,10 @@ namespace StockhamGenerator
bool halfLds; // only half the LDS of a complex length need to be used
Pass<PR> *nextPass;
+ //callback members
+ bool fft_doPreCallback;
+ clfftCallbackParam fft_preCallback;
+
inline void RegBase(size_t regC, std::string &str) const
{
str += "B";
@@ -875,14 +879,51 @@ namespace StockhamGenerator
regIndexSub += SztToStr(v);
}
+ //get offset
+ std::string bufOffset;
+ bufOffset += offset; bufOffset += " + ( "; bufOffset += SztToStr(numPrev); bufOffset += " + ";
+ bufOffset += "me*"; bufOffset += SztToStr(numButterfly); bufOffset += " + ";
+ bufOffset += SztToStr(i*regC + v); bufOffset += " + ";
+ bufOffset += SztToStr(r*length/radix); bufOffset += " )*";
+ bufOffset += SztToStr(stride);
+
+ //If precallback is set invoke callback function
+ //Invoke callback only once in Planar data layout (i.e. c == 0)
+ if (fft_doPreCallback && c == 0)
+ {
+ passStr += "\n\t";
+ passStr += "retPrecallback["; passStr += SztToStr(v); passStr += "] = "; passStr += fft_preCallback.funcname; passStr += "(";
+ if(interleaved)
+ {
+ passStr += buffer; passStr += ", ";
+ }
+ else
+ {
+ passStr += bufferRe; passStr += ", "; passStr += bufferIm; passStr += ", ";
+ }
+ passStr += bufOffset; passStr += ", userdata";
+ if (fft_preCallback.localMemSize > 0)
+ {
+ passStr += ", localmem";
+ }
+ passStr += ");";
+ }
+
passStr += "\n\t";
passStr += regIndexSub;
- passStr += " = "; passStr += buffer;
- passStr += "["; passStr += offset; passStr += " + ( "; passStr += SztToStr(numPrev); passStr += " + ";
- passStr += "me*"; passStr += SztToStr(numButterfly); passStr += " + ";
- passStr += SztToStr(i*regC + v); passStr += " + ";
- passStr += SztToStr(r*length/radix); passStr += " )*";
- passStr += SztToStr(stride); passStr += "]"; passStr += tail;
+ passStr += " = ";
+
+ //Use the return value from precallback if set
+ if (fft_doPreCallback)
+ {
+ passStr += "retPrecallback["; passStr += SztToStr(v); passStr += "]";
+ passStr += interleaved ? tail : (c == 0) ? ".x;" : ".y;";
+ }
+ else
+ {
+ passStr += buffer;
+ passStr += "["; passStr += bufOffset; passStr += "]"; passStr += tail;
+ }
}
// Since we read real & imag at once, we break the loop
@@ -1484,7 +1525,7 @@ namespace StockhamGenerator
r2c(r2cVal), c2r(c2rVal), rcFull(rcFullVal), rcSimple(rcSimpleVal), realSpecial(realSpecialVal),
enableGrouping(true),
numB1(0), numB2(0), numB4(0),
- nextPass(NULL)
+ nextPass(NULL), fft_doPreCallback(false)
{
assert(radix <= length);
assert(length%radix == 0);
@@ -1528,6 +1569,12 @@ namespace StockhamGenerator
void SetNextPass(Pass<PR> *np) { nextPass = np; }
void SetGrouping(bool grp) { enableGrouping = grp; }
+ void SetPrecallback(bool hasPrecallback, clfftCallbackParam precallbackParam)
+ {
+ fft_doPreCallback = hasPrecallback;
+ fft_preCallback = precallbackParam;
+ }
+
void GeneratePass( bool fwd, std::string &passStr, bool fft_3StepTwiddle,
bool inInterleaved, bool outInterleaved,
bool inReal, bool outReal,
@@ -1688,6 +1735,18 @@ namespace StockhamGenerator
{
passStr += ", "; passStr += IterRegArgs();
}
+
+ //Include callback parameters if callback is set
+ if (fft_doPreCallback)
+ {
+ passStr += ", __global void* userdata";
+
+ if (fft_preCallback.localMemSize > 0)
+ {
+ passStr += ", __local void* localmem";
+ }
+ }
+
passStr += ")\n{\n";
// Register Declarations
@@ -1872,6 +1931,13 @@ namespace StockhamGenerator
{
if( (!halfLds) || (halfLds && (position == 0)) )
{
+ //If precallback is set
+ if (fft_doPreCallback)
+ {
+ passStr += "\n\tfloat2 retPrecallback[";
+ passStr += (numB4 > 0) ? "4" : (numB2 > 0) ? "2" : "1";
+ passStr += "];";
+ }
passStr += "\n\tif(rw)\n\t{";
SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 2, numB2, numB1, passStr);
@@ -2325,6 +2391,13 @@ namespace StockhamGenerator
radices.push_back(rad);
passes.push_back(Pass<PR>(i, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple, realSpecial));
+
+ //Pass precallback information to Pass object if it's the first pass.
+ //This will be used in single kernel transforms
+ if (!r2c2r && i == 0 && params.fft_hasPreCallback)
+ {
+ passes[0].SetPrecallback(params.fft_hasPreCallback, params.fft_preCallback);
+ }
LS *= rad;
}
@@ -2364,6 +2437,13 @@ namespace StockhamGenerator
radices.push_back(rad);
passes.push_back(Pass<PR>(pid, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple, realSpecial));
+ //Pass precallback information to Pass object if it's the first pass.
+ //This will be used in single kernel transforms
+ if (!r2c2r && pid == 0 && params.fft_hasPreCallback)
+ {
+ passes[0].SetPrecallback(params.fft_hasPreCallback, params.fft_preCallback);
+ }
+
pid++;
LS *= rad;
@@ -2560,6 +2640,24 @@ namespace StockhamGenerator
uradices.sort();
uradices.unique();
+ //If pre-callback is set for the plan
+ std::string callbackstr;
+ if (params.fft_hasPreCallback)
+ {
+ //If a user-defined struct is supplied for the callback function, add it to the OpenCL source string
+ if (params.fft_preCallback.userdatastruct != NULL)
+ {
+ callbackstr += params.fft_preCallback.userdatastruct;
+ callbackstr += "\n";
+ }
+
+ //Insert callback function code at the beginning
+ callbackstr += params.fft_preCallback.funcstring;
+ callbackstr += "\n\n";
+
+ str += callbackstr;
+ }
+
typename std::vector< Pass<PR> >::const_iterator p;
if(length > 1)
{
@@ -2675,6 +2773,20 @@ namespace StockhamGenerator
delete [] nameVendor;
+ //If plan has pre-callback
+ callbackstr.clear();
+ if (params.fft_hasPreCallback)
+ {
+ if (params.fft_preCallback.localMemSize > 0)
+ {
+ callbackstr += ", __global void* userdata, __local void* localmem";
+ }
+ else
+ {
+ callbackstr += ", __global void* userdata";
+ }
+ }
+
// Function attributes
if(params.fft_placeness == CLFFT_INPLACE)
{
@@ -2697,12 +2809,28 @@ namespace StockhamGenerator
if(inInterleaved)
{
- str += "__global "; str += r2Type; str += " * restrict gb)\n";
+ str += "__global "; str += r2Type; str += " * restrict gb";
+
+ //If plan has pre-callback
+ if (params.fft_hasPreCallback)
+ {
+ str += callbackstr;
+ }
+
+ str += ")\n";
}
else
{
str += "__global "; str += rType; str += " * restrict gbRe, ";
- str += "__global "; str += rType; str += " * restrict gbIm)\n";
+ str += "__global "; str += rType; str += " * restrict gbIm";
+
+ //If plan has pre-callback
+ if (params.fft_hasPreCallback)
+ {
+ str += callbackstr;
+ }
+
+ str += ")\n";
}
}
}
@@ -2752,13 +2880,21 @@ namespace StockhamGenerator
if(outInterleaved)
{
- str += "__global "; str += r2Type; str += " * restrict gbOut)\n";
+ str += "__global "; str += r2Type; str += " * restrict gbOut";
}
else
{
str += "__global "; str += rType; str += " * restrict gbOutRe, ";
- str += "__global "; str += rType; str += " * restrict gbOutIm)\n";
+ str += "__global "; str += rType; str += " * restrict gbOutIm";
}
+
+ //If plan has pre-callback
+ if (params.fft_hasPreCallback)
+ {
+ str += callbackstr;
+ }
+
+ str += ")\n";
}
}
@@ -3148,8 +3284,8 @@ namespace StockhamGenerator
{
if(params.fft_placeness == CLFFT_INPLACE)
{
- if(inInterleaved) { inBuf = "lwb, "; outBuf = "lwb"; }
- else { inBuf = "lwbRe, lwbIm, "; outBuf = "lwbRe, lwbIm"; }
+ if(inInterleaved) { inBuf = "gb, "; outBuf = "lwb"; }
+ else { inBuf = "gbRe, gbIm, "; outBuf = "lwbRe, lwbIm"; }
}
else
{
@@ -3182,15 +3318,40 @@ namespace StockhamGenerator
str += "\n\tfor(uint t=0; t<2; t++)\n\t{\n\n";
}
+ std::string inOffset;
+ if (!r2c2r)
+ {
+ if (params.fft_placeness == CLFFT_INPLACE)
+ {
+ inOffset += "ioOffset";
+ }
+ else
+ {
+ inOffset += "iOffset";
+ }
+ }
+
// Call passes
if(numPasses == 1)
{
str += "\t";
str += PassName(0, fwd);
str += "("; str += rw; str += me;
- str += "0, 0, ";
+ str += (!r2c2r) ? inOffset : "0";
+ str += ", 0, ";
str += inBuf; str += outBuf;
str += IterRegs("&");
+
+ //if precallback is set
+ if (!r2c2r && params.fft_hasPreCallback)
+ {
+ str += ", userdata";
+
+ if (params.fft_preCallback.localMemSize > 0)
+ {
+ str += ", localmem";
+ }
+ }
str += ");\n";
}
else
@@ -3234,12 +3395,32 @@ namespace StockhamGenerator
str += me;
if(p == passes.begin()) // beginning pass
{
- str += blockCompute ? ldsOff : "0";
+ if (blockCompute)
+ {
+ str += ldsOff;
+ }
+ else
+ {
+ str += (!r2c2r) ? inOffset : "0";
+ }
str += ", ";
str += ldsOff;
str += ", ";
str += inBuf;
- str += ldsArgs; str += IterRegs("&"); str += ");\n";
+ str += ldsArgs; str += IterRegs("&");
+
+ //if precallback is set, append additional arguments
+ if (!r2c2r && params.fft_hasPreCallback)
+ {
+ str += ", userdata";
+
+ if (params.fft_preCallback.localMemSize > 0)
+ {
+ str += ", localmem";
+ }
+ }
+
+ str += ");\n";
if(!halfLds) { str += exTab; str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n"; }
}
else if((p+1) == passes.end()) // ending pass
@@ -3360,6 +3541,13 @@ clfftStatus FFTGeneratedStockhamAction::initParams ()
this->signature.fft_inputLayout = this->plan->inputLayout;
this->signature.fft_MaxWorkGroupSize = this->plan->envelope.limit_WorkGroupSize;
+ //Set callback if specified
+ if (this->plan->hasPreCallback)
+ {
+ this->signature.fft_hasPreCallback = true;
+ this->signature.fft_preCallback = this->plan->preCallback;
+ }
+
ARG_CHECK(this->plan->length.size() > 0);
ARG_CHECK(this->plan->inStride.size() > 0);
ARG_CHECK(this->plan->outStride.size() > 0);
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 8a2b4ec..ae06d07 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -1588,6 +1588,14 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
colTPlan->inStride.push_back(fftPlan->inStride[0]);
colTPlan->outStride.push_back(1);
+ //Set callback data if set on top level plan
+ if (fftPlan->hasPreCallback)
+ {
+ colTPlan->hasPreCallback = true;
+ colTPlan->preCallback = fftPlan->preCallback;
+ colTPlan->precallUserData = fftPlan->precallUserData;
+ }
+
// Enabling block column compute
if( (colTPlan->inStride[0] == length0) && IsPo2(fftPlan->length[0]) && (fftPlan->length[0] < 524288) )
{
diff --git a/src/library/plan.h b/src/library/plan.h
index be6231e..747137c 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -91,6 +91,18 @@ enum BlockComputeType
#define CLFFT_CB_SIZE 32
#define CLFFT_MAX_INTERNAL_DIM 16
+/*! @brief Data structure to store the callback function string and other metadata passed by client
+* @details Client sets the callback function and other required parameters through clFFTSetPlanCallback()
+* in order to register the callback function. The library populates these values into this data structure
+*/
+typedef struct clfftCallbackParam_
+{
+ int localMemSize; /*!< optional local memory size if needed by callback */
+ const char* funcname; /*!< callback function name */
+ const char* funcstring; /*!< callback function in string form */
+ const char* userdatastruct; /*!< optional custom data struct in string form */
+}clfftCallbackParam;
+
struct FFTKernelGenKeyParams {
/*
* This structure distills a subset of the fftPlan data,
@@ -135,6 +147,8 @@ struct FFTKernelGenKeyParams {
size_t blockSIMD;
size_t blockLDS;
+ bool fft_hasPreCallback;
+ clfftCallbackParam fft_preCallback;
// Default constructor
FFTKernelGenKeyParams()
@@ -170,6 +184,8 @@ struct FFTKernelGenKeyParams {
blockComputeType = BCT_C2C;
blockSIMD = 0;
blockLDS = 0;
+
+ fft_hasPreCallback = false;
}
};
@@ -429,6 +445,10 @@ public:
bool blockCompute;
BlockComputeType blockComputeType;
+ bool hasPreCallback;
+
+ clfftCallbackParam preCallback;
+ void *precallUserData;
clfftPlanHandle plHandle;
@@ -479,7 +499,9 @@ public:
, gen(Stockham)
, action(0)
, plHandle(0)
- {};
+ , hasPreCallback(false)
+ {
+ };
size_t ElementSize() const;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list