[clfft] 65/128: Precallback - Few review comment fixes
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:39 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 0967583457a94023ad317a2fc7969644115f6c35
Author: Pradeep <pradeep.rao at amd.com>
Date: Fri Sep 11 15:11:48 2015 +0530
Precallback - Few review comment fixes
clfftSetPlanCallback proper case for clfft prefix, cl_mem instead of
void* for userdata parameter, multiple user data buffers parameter
(support only 1 though for now)
---
src/callback-client/callback-client.cpp | 339 ++++++--------------------------
src/callback-client/client.h | 50 ++---
src/include/clFFT.h | 4 +-
src/library/accessors.cpp | 33 +++-
src/library/mainpage.h | 2 +-
src/library/plan.h | 4 +-
src/tests/cl_transform.h | 4 +-
7 files changed, 118 insertions(+), 318 deletions(-)
diff --git a/src/callback-client/callback-client.cpp b/src/callback-client/callback-client.cpp
index 9a7e72b..abae16d 100644
--- a/src/callback-client/callback-client.cpp
+++ b/src/callback-client/callback-client.cpp
@@ -57,9 +57,9 @@ int main(int argc, char **argv)
dim = CLFFT_3D;
}
- // Complex-Complex cases, SP
+ // Real-Complex cases, SP
- C2C_transform<float>(setupData, lengths, batchSize, dim, precision, profile_count);
+ R2C_transform<float>(setupData, lengths, batchSize, dim, precision, profile_count);
}
catch( std::exception& e )
@@ -71,7 +71,7 @@ int main(int argc, char **argv)
}
template < typename T >
-void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize,
+void R2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize,
clfftDim dim, clfftPrecision precision, cl_uint profile_count)
{
// OpenCL state
@@ -83,7 +83,10 @@ void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths,
cl_uint command_queue_flags = 0;
command_queue_flags |= CL_QUEUE_PROFILING_ENABLE;
- size_t vectorLength = inlengths[0] * inlengths[1] * inlengths[2];
+ // Test for in-place Hermitian Interleaved output
+ // Hence output size is N/2 + 1 complex. So allocate N + 2 real input
+ size_t Nt = inlengths[0] + 2;
+ size_t vectorLength = Nt * inlengths[1] * inlengths[2];
size_t fftLength = vectorLength * batchSize;
//OpenCL initializations
@@ -97,10 +100,12 @@ void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths,
if (precision == CLFFT_SINGLE)
{
//Run clFFT with seaparate Pre-process Kernel
- runC2CPreprocessKernelFFT<float>(setupData, context, commandQueue, device_id[0], inlengths, dim, precision, batchSize, vectorLength, fftLength, profile_count);
+ /*runR2CPreprocessKernelFFT<float>(setupData, context, commandQueue, device_id[0], inlengths, dim, precision,
+ batchSize, vectorLength, fftLength, profile_count);*/
//Run clFFT using pre-callback
- runC2CPrecallbackFFT<float>(setupData, context, commandQueue, inlengths, dim, precision, batchSize, vectorLength, fftLength, profile_count);
+ runR2CPrecallbackFFT<float>(setupData, context, commandQueue, inlengths, dim, precision,
+ batchSize, vectorLength, fftLength, profile_count);
}
OPENCL_V_THROW( clReleaseCommandQueue( commandQueue ), "Error: In clReleaseCommandQueue\n" );
@@ -108,55 +113,50 @@ void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths,
}
template < typename T >
-void runC2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
+void runR2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
size_t* inlengths, clfftDim dim, clfftPrecision precision,
size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
{
cl_int status = 0;
-
- size_t userdataLengths[ 3 ] = {USERDATA_LENGTH,1,1};
- size_t vectorLength_userdata = userdataLengths[0] * userdataLengths[1] * userdataLengths[2];
- size_t userdataLength = vectorLength_userdata * batchSize;
-
+
//input/output allocation sizes
- size_t size_of_buffers = fftLength * sizeof( std::complex< T > );
- size_t size_of_buffers_userdata = userdataLength * sizeof( std::complex< T > );
+ size_t in_size_of_buffers = fftLength * sizeof(char) * 3 ;
+ size_t out_size_of_buffers = fftLength * sizeof( T );
- //in-place transform. Same buffer for input and output
- cl_mem fftbuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE, size_of_buffers, NULL, &status);
- OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(buffer) )" );
+ char* in24bitData = (char*)malloc(in_size_of_buffers);
//Initialize Data
- std::vector< std::complex< T > > userdata( userdataLength );
-
- // impulse test case
- std::complex< T > impulsedata(1,0);
- for (size_t idx = 0; idx < userdataLength; ++idx)
+ for (size_t idx = 0; idx < fftLength; ++idx)
{
- userdata[idx] = impulsedata;
+ in24bitData[3 * idx + 2] = (char)(rand() % 256);
+ in24bitData[3 * idx + 1] = (char)(rand() % 256);
+ in24bitData[3 * idx] = (char)(rand() % 256);
}
- //user data buffer
- cl_mem userDatabuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size_of_buffers_userdata, &userdata[0], &status);
- OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(userDatabuffer) )" );
+ //input data buffer
+ cl_mem infftbuffer = ::clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, in_size_of_buffers, (void*)in24bitData, &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(infftbuffer) )" );
+
+ //out-place transform.
+ cl_mem outfftbuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE, out_size_of_buffers, NULL, &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(oufftbuffer) )" );
//clFFT initializations
// FFT state
- clfftResultLocation place = CLFFT_INPLACE;
- clfftLayout inLayout = CLFFT_COMPLEX_INTERLEAVED;
- clfftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED;
- clfftDirection dir = CLFFT_FORWARD;
+ clfftResultLocation place = CLFFT_OUTOFPLACE;
+ clfftLayout inLayout = CLFFT_REAL;
+ clfftLayout outLayout = CLFFT_HERMITIAN_INTERLEAVED;
clfftPlanHandle plan_handle;
OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, inlengths ), "clfftCreateDefaultPlan failed" );
//Precallback setup
- char* precallbackstr = STRINGIFY(ZERO_PAD_C2C);
+ char* precallbackstr = STRINGIFY(ConvertToFloat);
//Register the callback
- OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "zeroPad", precallbackstr, NULL, 0, PRECALLBACK, userDatabuffer), "clFFTSetPlanCallback failed");
+ OPENCL_V_THROW (clfftSetPlanCallback(plan_handle, "convert24To32bit", precallbackstr, NULL, 0, PRECALLBACK, NULL, 0), "clFFTSetPlanCallback failed");
// Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
@@ -181,206 +181,13 @@ void runC2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context
OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
}
- cl_mem * buffersOut = NULL; //NULL for in-place
-
- // for functional test
- OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
- &fftbuffer, buffersOut, clMedBuffer ),
- "clfftEnqueueTransform failed" );
-
- OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
-
- // Loop as many times as the user specifies to average out the timings
- if (profile_count > 1)
- {
- Timer tr;
- tr.Start();
-
- for( cl_uint i = 0; i < profile_count; ++i )
- {
- OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
- &fftbuffer, buffersOut, clMedBuffer ),
- "clfftEnqueueTransform failed" );
-
- OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
- }
- double wtimesample = tr.Sample();
- double wtime = wtimesample/((double)profile_count);
-
- tout << "\nExecution wall time (with clFFT Pre-callback): " << 1000.0*wtime << " ms" << std::endl;
- }
-
- if(clMedBuffer) clReleaseMemObject(clMedBuffer);
-
- if (profile_count == 1)
- {
- std::vector< std::complex< T > > output( fftLength );
-
- OPENCL_V_THROW( clEnqueueReadBuffer( commandQueue, fftbuffer, CL_TRUE, 0, size_of_buffers, &output[ 0 ],
- 0, NULL, NULL ), "Reading the result buffer failed" );
-
- //Reference fftw output
- fftwf_complex *refout;
-
- refout = get_C2C_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
-
- /*for( cl_uint i = 0; i < fftLength; i++)
- {
- std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
- }*/
- if (!compare<fftwf_complex, T>(refout, output, fftLength))
- {
- std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****FAIL*****" << std::endl;
- }
- else
- {
- std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****PASS*****" << std::endl;
- }
-
- fftwf_free(refout);
- }
-
- OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
- OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );
-
- //cleanup
- OPENCL_V_THROW( clReleaseMemObject( fftbuffer ), "Error: In clReleaseMemObject\n" );
- OPENCL_V_THROW( clReleaseMemObject( userDatabuffer ), "Error: In clReleaseMemObject\n" );
-}
-
-template < typename T >
-void runC2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context,
- cl_command_queue commandQueue, cl_device_id device_id,
- size_t* inlengths, clfftDim dim, clfftPrecision precision,
- size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
-{
- cl_int status = 0;
-
- size_t userdataLengths[ 3 ] = {USERDATA_LENGTH,1,1};
- size_t vectorLength_userdata = userdataLengths[0] * userdataLengths[1] * userdataLengths[2];
- size_t userdataLength = vectorLength_userdata * batchSize;
-
- //input/output allocation sizes
- size_t size_of_buffers = fftLength * sizeof( std::complex< T > );
- size_t size_of_buffers_userdata = userdataLength * sizeof( std::complex< T > );
-
- //in-place transform. Same buffer for input and output
- cl_mem fftbuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE, size_of_buffers, NULL, &status);
- OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(buffer) )" );
-
- //Initialize Data
- std::vector< std::complex< T > > userdata( userdataLength );
-
- // impulse test case
- std::complex< T > impulsedata(1,0);
- for (size_t idx = 0; idx < userdataLength; ++idx)
- {
- userdata[idx] = impulsedata;
- }
-
- //user data buffer
- cl_mem userdatabuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size_of_buffers_userdata, &userdata[0], &status);
- OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(userdatabuffer) )" );
-
- //clFFT initializations
-
- // FFT state
- clfftResultLocation place = CLFFT_INPLACE;
- clfftLayout inLayout = CLFFT_COMPLEX_INTERLEAVED;
- clfftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED;
- clfftDirection dir = CLFFT_FORWARD;
-
- clfftPlanHandle plan_handle;
- OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
- OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, inlengths ), "clfftCreateDefaultPlan failed" );
-
- // Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
- OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
- OPENCL_V_THROW( clfftSetLayout( plan_handle, inLayout, outLayout ), "clfftSetLayout failed" );
- OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batchSize ), "clfftSetPlanBatchSize failed" );
- OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
-
- //Bake Plan
- OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &commandQueue, NULL, NULL ), "clfftBakePlan failed" );
-
- //get the buffersize
- size_t buffersize=0;
- OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" );
-
- //allocate the intermediate buffer
- cl_mem clMedBuffer=NULL;
-
- if (buffersize)
- {
- cl_int medstatus;
- clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
- OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
- }
-
- cl_mem * buffersOut = NULL; //NULL for in-place
-
- //Pre-process kernel string
- const char* source = STRINGIFY(ZERO_PAD_C2C_KERNEL);
-
- cl_program program = clCreateProgramWithSource( context, 1, &source, NULL, &status );
- OPENCL_V_THROW( status, "clCreateProgramWithSource failed." );
-
- status = clBuildProgram( program, 1, &device_id, NULL, NULL, NULL);
- OPENCL_V_THROW( status, "clBuildProgram failed" );
-
-#if defined( _DEBUG )
- if( status != CL_SUCCESS )
- {
- if( status == CL_BUILD_PROGRAM_FAILURE )
- {
- size_t buildLogSize = 0;
- OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
- "clGetProgramBuildInfo failed" );
-
- std::vector< char > buildLog( buildLogSize );
- ::memset( &buildLog[ 0 ], 0x0, buildLogSize );
-
- OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
- "clGetProgramBuildInfo failed" );
-
- std::cerr << "\n\t\t\tBUILD LOG\n";
- std::cerr << "************************************************\n";
- std::cerr << &buildLog[ 0 ] << std::endl;
- std::cerr << "************************************************\n";
- }
-
- OPENCL_V_THROW( status, "clBuildProgram failed" );
- }
-#endif
-
- cl_kernel kernel = clCreateKernel( program, "zeroPad", &status );
- OPENCL_V_THROW( status, "clCreateKernel failed" );
-
//for functional test
- cl_uint uarg = 0;
-
- //Buffer to be zero-padded
- OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
-
- //originial data
- OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
-
- //Launch pre-process kernel
- size_t gSize = fftLength;
- size_t lSize = 64;
- status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
- NULL, &gSize, &lSize, 0, NULL, NULL );
- OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
-
- OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
-
- //Now invoke the clfft execute
- OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
- &fftbuffer, buffersOut, clMedBuffer ),
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, CLFFT_FORWARD, 1, &commandQueue, 0, NULL, NULL,
+ &infftbuffer, &outfftbuffer, clMedBuffer ),
"clfftEnqueueTransform failed" );
OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
-
+
if (profile_count > 1)
{
Timer tr;
@@ -389,75 +196,59 @@ void runC2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_con
// Loop as many times as the user specifies to average out the timings
for( cl_uint i = 0; i < profile_count; ++i )
{
- uarg = 0;
-
- //Buffer to be zero-padded
- OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
-
- //originial data
- OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
-
- //Launch pre-process kernel
- status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
- NULL, &gSize, &lSize, 0, NULL, NULL );
- OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
-
- OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
-
- //Now invoke the clfft execute
- OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
- &fftbuffer, buffersOut, clMedBuffer ),
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, CLFFT_FORWARD, 1, &commandQueue, 0, NULL, NULL,
+ &infftbuffer, &outfftbuffer, clMedBuffer ),
"clfftEnqueueTransform failed" );
OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
}
- double wtimesample = tr.Sample();
-
+ double wtimesample = tr.Sample();
double wtime = wtimesample/((double)profile_count);
- tout << "\nExecution wall time (Separate Pre-process Kernel): " << 1000.0*wtime << " ms" << std::endl;
+ tout << "\nExecution wall time (with clFFT Pre-callback): " << 1000.0*wtime << " ms" << std::endl;
}
- //cleanup preprocess kernel opencl objects
- OPENCL_V_THROW( clReleaseProgram( program ), "Error: In clReleaseProgram\n" );
- OPENCL_V_THROW( clReleaseKernel( kernel ), "Error: In clReleaseKernel\n" );
-
if(clMedBuffer) clReleaseMemObject(clMedBuffer);
-
+
if (profile_count == 1)
{
- std::vector< std::complex< T > > output( fftLength );
+ std::vector< std::complex< T > > output( fftLength/2 );
- OPENCL_V_THROW( clEnqueueReadBuffer( commandQueue, fftbuffer, CL_TRUE, 0, size_of_buffers, &output[ 0 ],
+ OPENCL_V_THROW( clEnqueueReadBuffer( commandQueue, outfftbuffer, CL_TRUE, 0, out_size_of_buffers, &output[ 0 ],
0, NULL, NULL ), "Reading the result buffer failed" );
- //Reference fftw output
- fftwf_complex *refout;
-
- refout = get_C2C_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
-
- /*for( cl_uint i = 0; i < fftLength; i++)
- {
- std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
- }*/
- if (!compare<fftwf_complex, T>(refout, output, fftLength))
- {
- std::cout << "\n\n\t\tInternal Client Test (Separate Pre-process Kernel) *****FAIL*****" << std::endl;
- }
- else
+ for( cl_uint i = 0; i < fftLength/2; i++)
{
- std::cout << "\n\n\t\tInternal Client Test (Separate Pre-process Kernel) *****PASS*****" << std::endl;
+ std::cout << "i " << i << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
}
- fftwf_free(refout);
+ ////Reference fftw output
+ //fftwf_complex *refout;
+
+ //refout = get_R2C_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim);
+
+ ///*for( cl_uint i = 0; i < fftLength; i++)
+ //{
+ // std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+ //}*/
+ //if (!compare<fftwf_complex, T>(refout, output, fftLength))
+ //{
+ // std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****FAIL*****" << std::endl;
+ //}
+ //else
+ //{
+ // std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****PASS*****" << std::endl;
+ //}
+
+ //fftwf_free(refout);
}
OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );
//cleanup
- OPENCL_V_THROW( clReleaseMemObject( fftbuffer ), "Error: In clReleaseMemObject\n" );
- OPENCL_V_THROW( clReleaseMemObject( userdatabuffer ), "Error: In clReleaseMemObject\n" );
+ OPENCL_V_THROW( clReleaseMemObject( infftbuffer ), "Error: In clReleaseMemObject\n" );
+ OPENCL_V_THROW( clReleaseMemObject( outfftbuffer ), "Error: In clReleaseMemObject\n" );
}
//Compare reference and opencl output
diff --git a/src/callback-client/client.h b/src/callback-client/client.h
index 502bed9..819735d 100644
--- a/src/callback-client/client.h
+++ b/src/callback-client/client.h
@@ -34,37 +34,31 @@
#define USERDATA_LENGTH 512
#define BATCH_LENGTH 1024 // Must be >= USERDATA_LENGTH
-#define ZERO_PAD_C2C __attribute__((always_inline)) \n float2 zeroPad (__global void *input, \n \
- uint inoffset, \n \
- __global void *userdata) \n \
- { \n \
- float2 scalar = 0.0f; \n \
- uint udoffset; \n \
- if ((inoffset % BATCH_LENGTH) < USERDATA_LENGTH) \n \
- { \n \
- udoffset = ((inoffset/BATCH_LENGTH) * USERDATA_LENGTH) + (inoffset % BATCH_LENGTH); \n \
- scalar = *((__global float2*)userdata + udoffset); \n \
- } \n \
- return scalar; \n \
- } \n
-
-#define ZERO_PAD_C2C_KERNEL __kernel void zeroPad (__global void *input, \n \
+#define ConvertToFloat float convert24To32bit(__global void* in, uint inoffset, __global void* userdata)\n \
+ { \n \
+ __global char* inData = (__global char*)in; \n \
+ float val = inData[3*inoffset+2] << 24 | inData[3*inoffset+1] << 16 | inData[3*inoffset] << 8 ; \n \
+ return val / (float)(INT_MAX - 256); \n \
+ }
+
+#define ConvertToFloat_KERNEL __kernel void convert24To32bit (__global void *input, \n \
__global void *userdata) \n \
{ \n \
uint inoffset = get_global_id(0); \n \
- float2 scalar = 0.0f; \n \
- uint udoffset; \n \
- if ((inoffset % BATCH_LENGTH) < USERDATA_LENGTH) \n \
- { \n \
- udoffset = ((inoffset/BATCH_LENGTH) * USERDATA_LENGTH) + (inoffset % BATCH_LENGTH); \n \
- scalar = *((__global float2*)userdata + udoffset); \n \
- } \n \
- *((__global float2*)input + inoffset) = scalar; \n \
+ __global char* inData = (__global char*)in; \n \
+ float val = inData[3*inoffset+2] << 24 | inData[3*inoffset+1] << 16 | inData[3*inoffset] << 8 ; \n \
+ *((__global float*)input + inoffset) = val / (float)(INT_MAX - 256); \n \
} \n
+
+template < typename T >
+void R2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize,
+ clfftDim dim, clfftPrecision precision, cl_uint profile_count);
+
template < typename T >
-void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize,
- clfftDim dim, clfftPrecision precision, cl_uint profile_count);
+void runR2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
+ size_t* inlengths, clfftDim dim, clfftPrecision precision,
+ size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
fftwf_complex* get_C2C_fftwf_output(size_t* lengths, size_t fftBatchSize, int batch_size, clfftLayout in_layout,
clfftDim dim, clfftDirection dir);
@@ -73,12 +67,6 @@ template < typename T1, typename T2>
bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
size_t length, const float epsilon = 1e-6f);
-template < typename T >
-void runC2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue, size_t* inlengths, clfftDim dim, clfftPrecision precision, size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
-
-template < typename T >
-void runC2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue, cl_device_id device_id, size_t* inlengths, clfftDim dim, clfftPrecision precision, size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
-
#ifdef WIN32
struct Timer
diff --git a/src/include/clFFT.h b/src/include/clFFT.h
index 5de09b1..9fb9186 100644
--- a/src/include/clFFT.h
+++ b/src/include/clFFT.h
@@ -561,8 +561,10 @@ extern "C" {
* @param[localMemSize] Optional - Local memory size if needed by callback. Pass 0 if local memory not needed by callback
* @param[callbackType] Type of callback - Pre-Callback or Post-Callback
* @param[userdata] cl_mem object passed as paarameter to callback function
+ * @param[numUserdataBuffers] Number of userdata buffers
*/
- CLFFTAPI clfftStatus clFFTSetPlanCallback(clfftPlanHandle plHandle, const char* funcName, const char* funcString, const char* userStructString, int localMemSize, clFFTCallbackType callbackType, void *userdata);
+ CLFFTAPI clfftStatus clfftSetPlanCallback(clfftPlanHandle plHandle, const char* funcName, const char* funcString, const char* userStructString,
+ int localMemSize, clFFTCallbackType callbackType, cl_mem *userdata, int numUserdataBuffers);
/*! @brief Enqueue an FFT transform operation, and return immediately (non-blocking)
diff --git a/src/library/accessors.cpp b/src/library/accessors.cpp
index 32de44d..c67373d 100644
--- a/src/library/accessors.cpp
+++ b/src/library/accessors.cpp
@@ -767,22 +767,30 @@ clfftStatus clfftLocalMemSize( const clfftPlanHandle plHandle, cl_ulong* local_m
return CLFFT_SUCCESS;
}
-clfftStatus clFFTSetPlanCallback(clfftPlanHandle plHandle, const char* funcName,
+clfftStatus clfftSetPlanCallback(clfftPlanHandle plHandle, const char* funcName,
const char* funcString, const char* userStructString,
int localMemSize, clFFTCallbackType callbackType,
- void *userdata)
+ cl_mem *userdata, int numUserdataBuffers)
{
FFTRepo& fftRepo = FFTRepo::getInstance( );
FFTPlan* fftPlan = NULL;
lockRAII* planLock = NULL;
OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
- scopedLock sLock( *planLock, _T( "clFFTSetPlanCallback" ) );
+ scopedLock sLock( *planLock, _T( "clfftSetPlanCallback" ) );
- if (callbackType == PRECALLBACK)
+ switch (callbackType)
{
- if (funcName != NULL && funcString != NULL)
+ case PRECALLBACK:
{
+ ARG_CHECK(funcName != NULL);
+ ARG_CHECK(funcString != NULL);
+ ARG_CHECK(numUserdataBuffers >= 0);
+
+ // We do not currently support multiple user data buffers
+ if( numUserdataBuffers > 1 )
+ return CLFFT_NOTIMPLEMENTED;
+
fftPlan->hasPreCallback = true;
fftPlan->preCallback.funcname = funcName;
@@ -790,8 +798,19 @@ clfftStatus clFFTSetPlanCallback(clfftPlanHandle plHandle, const char* funcName,
fftPlan->preCallback.userdatastruct = userStructString;
fftPlan->preCallback.localMemSize = (localMemSize > 0) ? localMemSize : 0;
- fftPlan->precallUserData = userdata;
- }
+ cl_mem userdataBuf = NULL;
+
+ if (userdata)
+ userdataBuf = userdata[0];
+
+ fftPlan->precallUserData = userdataBuf;
+ }
+
+ break;
+ case POSTCALLBACK:
+ return CLFFT_NOTIMPLEMENTED;
+ default:
+ ARG_CHECK (false);
}
return CLFFT_SUCCESS;
diff --git a/src/library/mainpage.h b/src/library/mainpage.h
index 268da3b..6f3f31d 100644
--- a/src/library/mainpage.h
+++ b/src/library/mainpage.h
@@ -635,7 +635,7 @@ cl_mem userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PT
//* Step 3 : Register the callback
//**************************************************************************
-status = clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdata);
+status = clfftSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, &userdata, 1);
//**************************************************************************
diff --git a/src/library/plan.h b/src/library/plan.h
index 747137c..53be126 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -92,7 +92,7 @@ enum BlockComputeType
#define CLFFT_MAX_INTERNAL_DIM 16
/*! @brief Data structure to store the callback function string and other metadata passed by client
-* @details Client sets the callback function and other required parameters through clFFTSetPlanCallback()
+* @details Client sets the callback function and other required parameters through clfftSetPlanCallback()
* in order to register the callback function. The library populates these values into this data structure
*/
typedef struct clfftCallbackParam_
@@ -448,7 +448,7 @@ public:
bool hasPreCallback;
clfftCallbackParam preCallback;
- void *precallUserData;
+ cl_mem precallUserData;
clfftPlanHandle plHandle;
diff --git a/src/tests/cl_transform.h b/src/tests/cl_transform.h
index 9c6d2b2..4720462 100644
--- a/src/tests/cl_transform.h
+++ b/src/tests/cl_transform.h
@@ -665,7 +665,7 @@ public:
OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
//Register the callback
- OPENCL_V_THROW (clFFTSetPlanCallback(*plan_handle, "mulval", precallbackstr, NULL, localMemSize, PRECALLBACK, userdataBuff), "clFFTSetPlanCallback failed");
+ OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval", precallbackstr, NULL, localMemSize, PRECALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed");
}
/*****************************************************/
@@ -704,7 +704,7 @@ public:
OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
//Register the callback
- OPENCL_V_THROW (clFFTSetPlanCallback(*plan_handle, "mulval", precallbackstr, STRINGIFY(STRUCT_USERDATA), 0, PRECALLBACK, userdataBuff), "clFFTSetPlanCallback failed");
+ OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval", precallbackstr, STRINGIFY(STRUCT_USERDATA), 0, PRECALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed");
}
/*****************************************************/
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list