[clfft] 11/128: Precallback - Complex-Real 1D single kernel SP
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:33 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 8f07253282196bb552aa153f266be68f8d3f60e8
Author: Pradeep <pradeep.rao at amd.com>
Date: Mon Aug 10 13:24:23 2015 +0530
Precallback - Complex-Real 1D single kernel SP
---
src/client-callback/callback-client.cpp | 1317 +++++++++++++++++++++----------
src/library/accessors.cpp | 2 +-
src/library/generator.stockham.cpp | 207 +++--
3 files changed, 1039 insertions(+), 487 deletions(-)
diff --git a/src/client-callback/callback-client.cpp b/src/client-callback/callback-client.cpp
index 5fb6c9e..1614922 100644
--- a/src/client-callback/callback-client.cpp
+++ b/src/client-callback/callback-client.cpp
@@ -13,8 +13,7 @@
namespace po = boost::program_options;
-#define SCALAR 100
-#define PRECALLBACKTYPE 1
+#define SCALAR 10
#define MULVAL float2 mulval(__global void* in, uint offset, __global void* userdata)\n \
{ \n \
@@ -23,6 +22,13 @@ namespace po = boost::program_options;
return ret; \n \
}
+#define MULVAL_C2R float2 mulval(__global void* in, uint offset, __global void* userdata)\n \
+ { \n \
+ int scalar = *((__global int*)userdata + offset); \n \
+ float2 ret = *((__global float2*)in + offset) * scalar; \n \
+ return ret; \n \
+ }
+
#define MULVAL_DP double2 mulval(__global void* in, uint offset, __global void* userdata)\n \
{ \n \
int scalar = *((__global int*)userdata + offset); \n \
@@ -57,27 +63,27 @@ namespace po = boost::program_options;
} USER_DATA;
STRUCT_USERDATA
-//Compare reference and opencl output
+//Compare reference and opencl output
template < typename T1, typename T2>
bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
- const int length, const float epsilon = 1e-6f)
+ size_t length, const float epsilon = 1e-6f)
{
float error = 0.0f;
- float ref = 0.0f;
- float diff = 0.0f;
+ T1 ref;
+ T1 diff;
float normRef = 0.0f;
float normError = 0.0f;
- for(int i = 0; i < length; ++i)
+ for(size_t i = 0; i < length; ++i)
{
- diff = refData[i][0] - data[i].real();
- error += diff * diff;
- ref += refData[i][0] * refData[i][0];
+ diff[0] = refData[i][0] - data[i].real();
+ error += (float)(diff[0] * diff[0]);
+ ref[0] += refData[i][0] * refData[i][0];
}
if (error != 0)
{
- normRef =::sqrtf((float) ref);
- if (::fabs((float) ref) < 1e-7f)
+ normRef =::sqrtf((float) ref[0]);
+ if (::fabs((float) ref[0]) < 1e-7f)
{
return false;
}
@@ -90,19 +96,19 @@ bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
//imag
error = 0.0f;
- ref = 0.0f;
- for(int i = 0; i < length; ++i)
+ ref[1] = 0.0;
+ for(size_t i = 0; i < length; ++i)
{
- diff = refData[i][1] - data[i].imag();
- error += diff * diff;
- ref += refData[i][1] * refData[i][1];
+ diff[1] = refData[i][1] - data[i].imag();
+ error += (float)(diff[1] * diff[1]);
+ ref[1] += refData[i][1] * refData[i][1];
}
if (error == 0)
return true;
- normRef =::sqrtf((float) ref);
- if (::fabs((float) ref) < 1e-7f)
+ normRef =::sqrtf((float) ref[1]);
+ if (::fabs((float) ref[1]) < 1e-7f)
{
return false;
}
@@ -118,25 +124,25 @@ bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
//Compare reference and opencl output
template < typename T1, typename T2 >
bool compare(T1 *refData, std::valarray< T2 > real, std::valarray< T2 > imag,
- const int length, const float epsilon = 1e-6f)
+ size_t length, const float epsilon = 1e-6f)
{
float error = 0.0f;
- float ref = 0.0f;
- float diff = 0.0f;
+ T1 ref;
+ T1 diff;
float normRef = 0.0f;
float normError = 0.0f;
//real compare
- for(int i = 0; i < length; ++i)
+ for(size_t i = 0; i < length; ++i)
{
- diff = refData[i][0] - real[i];
- error += diff * diff;
- ref += refData[i][0] * refData[i][0];
+ diff[0] = refData[i][0] - real[i];
+ error += (float)(diff[0] * diff[0]);
+ ref[0] += refData[i][0] * refData[i][0];
}
if (error != 0)
{
- normRef =::sqrtf((float) ref);
- if (::fabs((float) ref) < 1e-7f)
+ normRef =::sqrtf((float) ref[0]);
+ if (::fabs((float) ref[0]) < 1e-7f)
{
return false;
}
@@ -149,20 +155,20 @@ bool compare(T1 *refData, std::valarray< T2 > real, std::valarray< T2 > imag,
//imag compare
error = 0.0f;
- ref = 0.0f;
+ ref[1] = 0.0;
- for(int i = 0; i < length; ++i)
+ for(size_t i = 0; i < length; ++i)
{
- diff = refData[i][1] - imag[i];
- error += diff * diff;
- ref += refData[i][1] * refData[i][1];
+ diff[1] = refData[i][1] - imag[i];
+ error += (float)(diff[1] * diff[1]);
+ ref[1] += refData[i][1] * refData[i][1];
}
if (error == 0)
return true;
- normRef =::sqrtf((float) ref);
- if (::fabs((float) ref) < 1e-7f)
+ normRef =::sqrtf((float) ref[1]);
+ if (::fabs((float) ref[1]) < 1e-7f)
{
return false;
}
@@ -175,6 +181,46 @@ bool compare(T1 *refData, std::valarray< T2 > real, std::valarray< T2 > imag,
return true;
}
+//Compare reference and opencl output
+template < typename T1 , typename T2 >
+bool compare(T1 *refData, std::valarray< T2 > real,
+ size_t length, int batchsize, const float epsilon = 1e-6f)
+{
+ float error = 0.0f;
+ T1 ref = 0.0;
+ T1 diff;
+ float normRef = 0.0f;
+ float normError = 0.0f;
+ size_t scale = length;
+
+ //real compare
+ for (int b = 0; b < batchsize; b++)
+ {
+ int idx = b * (length + 2);
+ for(size_t i = idx; i < (idx + length); ++i)
+ {
+ diff = refData[i] - (real[i] * scale);
+ error += (float)(diff * diff);
+ ref += refData[i] * refData[i];
+ }
+ }
+ if (error != 0)
+ {
+ normRef =::sqrtf((float) ref);
+ if (::fabs((float) ref) < 1e-7f)
+ {
+ return false;
+ }
+ normError = ::sqrtf((float) error);
+ error = normError / normRef;
+
+ if (error > epsilon)
+ return false;
+ }
+
+ return true;
+}
+
// Compute reference output using fftw for float type
fftwf_complex* get_fftwf_output(size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
size_t fftBatchSize, size_t outfftBatchSize, size_t fftVectorSizePadded, clfftLayout in_layout,
@@ -188,28 +234,28 @@ fftwf_complex* get_fftwf_output(size_t* lengths, const size_t *inStrides, const
fftwf_complex *refin = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftBatchSize);
fftwf_complex *refout = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*outfftBatchSize);
- refPlan = fftwf_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size,
- refin, &fftwLengths[3 - dim], inStrides[0], fftVectorSizePadded,
- refout, &fftwLengths[3 - dim], outStrides[0], outfftVectorSizePadded,
+ refPlan = fftwf_plan_many_dft(dim, &fftwLengths[3 - dim], (int)batch_size,
+ refin, &fftwLengths[3 - dim], (int)inStrides[0], (int)fftVectorSizePadded,
+ refout, &fftwLengths[3 - dim], (int)outStrides[0], (int)outfftVectorSizePadded,
dir, FFTW_ESTIMATE);
int scalar;
- for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+ for( size_t i = 0; i < fftBatchSize; i = i + inStrides[0])
{
switch (in_layout)
{
case CLFFT_COMPLEX_INTERLEAVED:
- scalar = SCALAR + (i % fftVectorSize);
+ scalar = SCALAR + (int)(i % fftVectorSize);
break;
case CLFFT_COMPLEX_PLANAR:
- scalar = (SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1);
+ scalar = (int)((SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1));
break;
default:
break;
}
- refin[i][0] = 1 * scalar;
- refin[i][1] = 0 * scalar;
+ refin[i][0] = (float)(1 * scalar);
+ refin[i][1] = (float)(0 * scalar);
}
fftwf_execute(refPlan);
@@ -234,21 +280,21 @@ fftw_complex* get_fftw_output(size_t* lengths, const size_t *inStrides, const si
//In FFTW last dimension has the fastest changing index
int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
- refPlan = fftw_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size,
- refin, &fftwLengths[3 - dim], inStrides[0], fftVectorSizePadded,
- refout, &fftwLengths[3 - dim], outStrides[0], outfftVectorSizePadded,
+ refPlan = fftw_plan_many_dft(dim, &fftwLengths[3 - dim], (int)batch_size,
+ refin, &fftwLengths[3 - dim], (int)inStrides[0], (int)fftVectorSizePadded,
+ refout, &fftwLengths[3 - dim], (int)outStrides[0], (int)outfftVectorSizePadded,
dir, FFTW_ESTIMATE);
int scalar;
- for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+ for( size_t i = 0; i < fftBatchSize; i = i + inStrides[0])
{
switch (in_layout)
{
case CLFFT_COMPLEX_INTERLEAVED:
- scalar = SCALAR + (i % fftVectorSize);
+ scalar = SCALAR + (int)(i % fftVectorSize);
break;
case CLFFT_COMPLEX_PLANAR:
- scalar = (SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1);
+ scalar = (int)((SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1));
break;
default:
break;
@@ -267,6 +313,70 @@ fftw_complex* get_fftw_output(size_t* lengths, const size_t *inStrides, const si
return refout;
}
+// Compute C2R reference output using fftw for float type
+float* get_fftwf_output_c2r(size_t* lengths, size_t *strides, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
+ size_t fftBatchSize, size_t outfftBatchSize, size_t fftVectorSizePadded, clfftLayout in_layout,
+ size_t outfftVectorSizePadded, size_t outfftVectorSize, clfftDim dim, clfftDirection dir)
+{
+ //In FFTW last dimension has the fastest changing index
+ int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
+
+ fftwf_plan refPlan;
+
+ fftwf_complex *refin = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*fftBatchSize);
+ float *refout = (float*) malloc(sizeof(float)*outfftBatchSize);
+
+ refPlan = fftwf_plan_many_dft_c2r(dim, &fftwLengths[3 - dim], (int)batch_size,
+ refin, &fftwLengths[3 - dim], (int)inStrides[0], (int)fftVectorSizePadded,
+ refout, &fftwLengths[3 - dim], (int)outStrides[0], (int)outfftVectorSizePadded,
+ FFTW_ESTIMATE);
+
+ // set zero
+ for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0] )
+ {
+ refin[ i ][0] = 0; refin[ i ][1] = 0;
+ }
+
+ // impulse test case
+ for(size_t b = 0; b < batch_size; b++)
+ {
+ size_t p3 = b * strides[3];
+ refin[ p3 ][0] = static_cast<float>(outfftVectorSize);
+ }
+
+ int scalar;
+ for(size_t b = 0; b < batch_size; b++)
+ {
+ size_t p3 = b * strides[3];
+
+ for( size_t i = 0; i < fftVectorSizePadded; i = i + inStrides[0])
+ {
+ switch (in_layout)
+ {
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ scalar = SCALAR + i;
+ break;
+ case CLFFT_HERMITIAN_PLANAR:
+ scalar = (int)(SCALAR + i + (SCALAR + i + 1));
+ break;
+ default:
+ break;
+ }
+
+ refin[p3 + i][0] *= (float)(scalar);
+ refin[p3 + i][1] *= (float)(scalar);
+ }
+ }
+
+ fftwf_execute(refPlan);
+
+ fftw_free(refin);
+
+ fftwf_destroy_plan(refPlan);
+
+ return refout;
+}
+
// This is used with the program_options class so that the user can type an integer on the command line
// and we store into an enum varaible
template<class _Elem, class _Traits>
@@ -278,40 +388,107 @@ std::basic_istream<_Elem, _Traits> & operator>> (std::basic_istream<_Elem, _Trai
return stream;
}
-template < typename T >
-int transform( size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
- clfftLayout in_layout, clfftLayout out_layout,
- clfftResultLocation place, clfftPrecision precision, clfftDirection dir,
- cl_device_type deviceType, cl_int deviceId, cl_int platformId, bool printInfo,
- cl_uint command_queue_flags, cl_uint profile_count,
- std::auto_ptr< clfftSetupData > setupData,
- bool hasPrecallback)
+//Validate the input and output data layout
+void validateDataLayout(clfftLayout in_layout, clfftLayout out_layout, clfftResultLocation place)
{
- // Our command line does not specify what dimension FFT we wish to transform; we decode
- // this from the lengths that the user specifies for X, Y, Z. A length of one means that
- // The user does not want that dimension.
+ switch( in_layout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ case CLFFT_COMPLEX_PLANAR:
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ case CLFFT_REAL:
+ break;
+ default:
+ // Don't recognize input layout
+ {
+ throw std::runtime_error( "Un-recognized data layout" );
+ }
+ break;
+ }
- const size_t max_dimensions = 3;
- size_t strides[ 4 ];
- size_t o_strides[ 4 ];
- size_t fftVectorSize = 0;
- size_t fftVectorSizePadded = 0;
- size_t fftBatchSize = 0;
- size_t outfftVectorSize = 0;
- size_t outfftVectorSizePadded = 0;
- size_t outfftBatchSize = 0;
- size_t size_of_input_buffers_in_bytes = 0;
- size_t size_of_output_buffers_in_bytes = 0;
+ switch( out_layout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ case CLFFT_COMPLEX_PLANAR:
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ case CLFFT_REAL:
+ break;
+ default:
+ // Don't recognize output layout
+ {
+ throw std::runtime_error( "Un-recognized data layout" );
+ }
+ break;
+ }
+
+ if (( place == CLFFT_INPLACE ) && ( in_layout != out_layout ))
+ {
+ switch( in_layout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ {
+ if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
+ {
+ throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
+ }
+ break;
+ }
+ case CLFFT_COMPLEX_PLANAR:
+ {
+ if( (out_layout == CLFFT_COMPLEX_INTERLEAVED) || (out_layout == CLFFT_HERMITIAN_INTERLEAVED) )
+ {
+ throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
+ }
+ break;
+ }
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ {
+ if( out_layout != CLFFT_REAL )
+ {
+ throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
+ }
+ break;
+ }
+ case CLFFT_HERMITIAN_PLANAR:
+ {
+ throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
+ break;
+ }
+ case CLFFT_REAL:
+ {
+ if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
+ {
+ throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
+ }
+ break;
+ }
+ default:
+ {
+ throw std::runtime_error( "Input layout format not yet supported" );
+ }
+ break;
+ }
+ }
+}
+
+//FFT data initializations
+template < typename T >
+cl_int dataInitialize(size_t* lengths, clfftDim *dim, size_t batch_size,
+ const size_t *inStrides, size_t *strides, const size_t *outStrides, size_t *o_strides,
+ size_t *fftBatchSize, size_t *outfftBatchSize, size_t *fftVectorSizePadded, clfftLayout in_layout, clfftLayout out_layout,
+ size_t *outfftVectorSizePadded, size_t *fftVectorSize, size_t *outfftVectorSize,
+ clfftResultLocation place, size_t *size_of_output_buffers_in_bytes,
+ size_t *size_of_input_buffers_in_bytes, cl_mem *input_cl_mem_buffers, cl_mem *output_cl_mem_buffers,
+ cl_context *context, cl_command_queue *queue,
+ cl_device_type deviceType, cl_int deviceId, cl_int platformId, cl_uint command_queue_flags)
+{
+ cl_event outEvent = NULL;
cl_uint number_of_output_buffers = 0;
- clfftDim dim = CLFFT_1D;
- cl_mem input_cl_mem_buffers [2] = { NULL, NULL };
- cl_mem output_cl_mem_buffers[2] = { NULL, NULL };
+ const size_t max_dimensions = 3;
std::vector< cl_device_id > device_id;
- cl_context context;
- cl_command_queue queue;
- cl_event outEvent = NULL;
- clfftPlanHandle plan_handle;
-
+
for (unsigned u = 0; u < max_dimensions; ++u) {
if (0 != lengths[u])
continue;
@@ -320,11 +497,11 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
if( lengths[ 1 ] > 1 )
{
- dim = CLFFT_2D;
+ *dim = CLFFT_2D;
}
if( lengths[ 2 ] > 1 )
{
- dim = CLFFT_3D;
+ *dim = CLFFT_3D;
}
strides[ 0 ] = inStrides[0];
@@ -337,72 +514,80 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
o_strides[ 2 ] = outStrides[2];
o_strides[ 3 ] = outStrides[3];
- fftVectorSize = lengths[0] * lengths[1] * lengths[2];
- fftVectorSizePadded = strides[3];
- fftBatchSize = fftVectorSizePadded * batch_size;
-
- size_t Nt = 1 + lengths[0]/2;
+ *fftVectorSize = lengths[0] * lengths[1] * lengths[2];
+ *fftVectorSizePadded = strides[3];
+ *fftBatchSize = *fftVectorSizePadded * batch_size;
if(place == CLFFT_INPLACE)
{
- outfftVectorSize = fftVectorSize;
- outfftVectorSizePadded = fftVectorSizePadded;
- outfftBatchSize = fftBatchSize;
+ *outfftVectorSize = *fftVectorSize;
+ *outfftVectorSizePadded = *fftVectorSizePadded;
+ *outfftBatchSize = *fftBatchSize;
}
else
{
- outfftVectorSize = lengths[0] * lengths[1] * lengths[2];
- outfftVectorSizePadded = o_strides[3];
- outfftBatchSize = outfftVectorSizePadded * batch_size;
+ *outfftVectorSize = lengths[0] * lengths[1] * lengths[2];
+ *outfftVectorSizePadded = o_strides[3];
+ *outfftBatchSize = *outfftVectorSizePadded * batch_size;
}
// Real to complex case
if( (in_layout == CLFFT_REAL) || (out_layout == CLFFT_REAL) )
{
- terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
- return 1;
+ *fftVectorSizePadded = strides[3];
+ *fftBatchSize = *fftVectorSizePadded * batch_size;
+
+ *outfftVectorSizePadded = o_strides[3];
+ *outfftBatchSize = *outfftVectorSizePadded * batch_size;
+
+ *fftVectorSize = lengths[0] * lengths[1] * lengths[2];
+ *outfftVectorSize = *fftVectorSize;
}
switch( out_layout )
{
case CLFFT_COMPLEX_INTERLEAVED:
number_of_output_buffers = 1;
- size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > );
+ *size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof( std::complex< T > );
break;
case CLFFT_COMPLEX_PLANAR:
number_of_output_buffers = 2;
- size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T);
+ *size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof(T);
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ number_of_output_buffers = 1;
+ *size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof( std::complex< T > );
+ break;
+ case CLFFT_HERMITIAN_PLANAR:
+ number_of_output_buffers = 2;
+ *size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof(T);
+ break;
+ case CLFFT_REAL:
+ number_of_output_buffers = 1;
+ *size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof(T);
break;
- default:
- terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
- return 1;
}
- if (hasPrecallback && !(in_layout == CLFFT_COMPLEX_INTERLEAVED || in_layout == CLFFT_COMPLEX_PLANAR))
- {
- terr << _T("Pre-callback feature is currently supported only for Complex-Complex FFT " ) << std::endl;
- return 1;
- }
- // Fill the input buffers
+ // Fill the input buffers
switch( in_layout )
{
case CLFFT_COMPLEX_INTERLEAVED:
{
// This call creates our openCL context and sets up our devices; expected to throw on error
- size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > );
+ *size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( std::complex< T > );
- device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
- createOpenCLCommandQueue( context,
- command_queue_flags, queue,
+ device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
+ createOpenCLCommandQueue( *context,
+ command_queue_flags, *queue,
device_id,
- size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
- size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
+ *size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
+ *size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
- std::vector< std::complex< T > > input( fftBatchSize );
+ std::vector< std::complex< T > > input( *fftBatchSize );
// set zero
- for( cl_uint i = 0; i < fftBatchSize; ++i )
+ for( cl_uint i = 0; i < *fftBatchSize; ++i )
{
input[ i ] = 0;
}
@@ -426,8 +611,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
}
}
-
- OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ],
+ OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &input[ 0 ],
0, NULL, &outEvent ),
"clEnqueueWriteBuffer failed" );
@@ -436,20 +620,20 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
case CLFFT_COMPLEX_PLANAR:
{
// This call creates our openCL context and sets up our devices; expected to throw on error
- size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T );
+ *size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( T );
- device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
- createOpenCLCommandQueue( context,
- command_queue_flags, queue,
+ device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
+ createOpenCLCommandQueue( *context,
+ command_queue_flags, *queue,
device_id,
- size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
- size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
+ *size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
+ *size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
- std::vector< T > real( fftBatchSize );
- std::vector< T > imag( fftBatchSize );
+ std::vector< T > real( *fftBatchSize );
+ std::vector< T > imag( *fftBatchSize );
// set zero
- for( cl_uint i = 0; i < fftBatchSize; ++i )
+ for( cl_uint i = 0; i < *fftBatchSize; ++i )
{
real[ i ] = 0;
imag[ i ] = 0;
@@ -474,114 +658,577 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
}
}
-
- OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
+ OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &real[ 0 ],
0, NULL, &outEvent ),
"clEnqueueWriteBuffer failed" );
- OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
+ OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &imag[ 0 ],
0, NULL, &outEvent ),
"clEnqueueWriteBuffer failed" );
}
break;
- default:
- terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
- return 1;
- }
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ {
+ // This call creates our openCL context and sets up our devices; expected to throw on error
+ *size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( std::complex< T > );
- // Discover and load the timer module if present
- void* timerLibHandle = LoadSharedLibrary( "lib", "StatTimer", false );
- if( timerLibHandle == NULL )
- {
- terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl;
- }
+ device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
+ createOpenCLCommandQueue( *context,
+ command_queue_flags, *queue,
+ device_id,
+ *size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
+ *size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
+ std::vector< std::complex< T > > input( *fftBatchSize );
- // Timer module discovered and loaded successfully
- // Initialize function pointers to call into the shared module
- PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) );
+ // set zero
+ for( cl_uint i = 0; i < *fftBatchSize; ++i )
+ {
+ input[ i ] = 0;
+ }
- // Create and initialize our timer class, if the external timer shared library loaded
- baseStatTimer* timer = NULL;
- size_t clFFTID = 0;
- if( get_timer )
- {
- timer = get_timer( CLFFT_GPU );
- timer->Reserve( 1, profile_count );
- timer->setNormalize( true );
+ // impulse test case
+ for(size_t b = 0; b < batch_size; b++)
+ {
+ size_t p3 = b * strides[3];
+ input[p3] = static_cast<T>(*outfftVectorSize);
- clFFTID = timer->getUniqueID( "clFFT", 0 );
- }
+ }
- OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
- OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" );
+ OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &input[ 0 ],
+ 0, NULL, &outEvent ),
+ "clEnqueueWriteBuffer failed" );
+ }
+ break;
+ case CLFFT_HERMITIAN_PLANAR:
+ {
+ // This call creates our openCL context and sets up our devices; expected to throw on error
+ *size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( T );
- // Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
- OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
- OPENCL_V_THROW( clfftSetLayout( plan_handle, in_layout, out_layout ), "clfftSetLayout failed" );
- OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batch_size ), "clfftSetPlanBatchSize failed" );
- OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
+ device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
+ createOpenCLCommandQueue( *context,
+ command_queue_flags, *queue,
+ device_id,
+ *size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
+ *size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
- OPENCL_V_THROW (clfftSetPlanInStride ( plan_handle, dim, strides ), "clfftSetPlanInStride failed" );
- OPENCL_V_THROW (clfftSetPlanOutStride ( plan_handle, dim, o_strides ), "clfftSetPlanOutStride failed" );
- OPENCL_V_THROW (clfftSetPlanDistance ( plan_handle, strides[ 3 ], o_strides[ 3 ]), "clfftSetPlanDistance failed" );
+ std::vector< T > real( *fftBatchSize );
+ std::vector< T > imag( *fftBatchSize );
- // Set backward scale factor to 1.0 for non real FFTs to do correct output checks
- if(dir == CLFFT_BACKWARD && in_layout != CLFFT_REAL && out_layout != CLFFT_REAL)
- OPENCL_V_THROW (clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, (cl_float)1.0f ), "clfftSetPlanScale failed" );
+ // set zero
+ for( cl_uint i = 0; i < *fftBatchSize; ++i )
+ {
+ real[ i ] = 0;
+ imag[ i ] = 0;
+ }
- //Check for Precallback
- //Currently test includes only for 1D
- if (hasPrecallback)
- {
- int precallbakType = PRECALLBACKTYPE;
- cl_mem userdata;
+ // impulse test case
+ for(size_t b = 0; b < batch_size; b++)
+ {
+ size_t p3 = b * strides[3];
+ real[p3] = static_cast<T>(*outfftVectorSize);
+ }
- if (in_layout == CLFFT_COMPLEX_INTERLEAVED)
+ OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &real[ 0 ],
+ 0, NULL, &outEvent ),
+ "clEnqueueWriteBuffer failed" );
+ OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &imag[ 0 ],
+ 0, NULL, &outEvent ),
+ "clEnqueueWriteBuffer failed" );
+ }
+ break;
+ case CLFFT_REAL:
{
- switch (precallbakType)
+ // This call creates our openCL context and sets up our devices; expected to throw on error
+ *size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( T );
+
+ device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
+ createOpenCLCommandQueue( *context,
+ command_queue_flags, *queue,
+ device_id,
+ *size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
+ *size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
+
+ std::vector< T > real( *fftBatchSize );
+
+ // set zero
+ for( cl_uint i = 0; i < *fftBatchSize; ++i )
{
- case 1: //C2C 1D Interleaved
+ real[ i ] = 0;
+ }
+
+ // impulse test case
+ for(size_t b = 0; b < batch_size; b++)
+ {
+ size_t p3 = b * strides[3];
+ for(size_t k = 0; k < lengths[2]; k++)
{
- char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL) : STRINGIFY(MULVAL_DP);
+ size_t p2 = p3 + k * strides[2];
+ for(size_t j = 0; j < lengths[1]; j++)
+ {
+ size_t p1 = p2 + j * strides[1];
+ for(size_t i = 0; i < lengths[0]; i++)
+ {
+ size_t p0 = p1 + i * strides[0];
+ real[p0] = 1;
+ }
+ }
+ }
+ }
- int *h_userdata = (int*)malloc(sizeof(int)*fftBatchSize);
- for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+ OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &real[ 0 ],
+ 0, NULL, &outEvent ),
+ "clEnqueueWriteBuffer failed" );
+ }
+ break;
+ default:
+ {
+ throw std::runtime_error( "Input layout format not yet supported" );
+ }
+ break;
+ }
+
+ return 0;
+}
+
+//Compare output with reference C/FFTW code
+template < typename T >
+void compareWithReference(clfftLayout in_layout, clfftLayout out_layout, size_t outfftBatchSize, clfftResultLocation place, clfftPrecision precision,
+ cl_command_queue queue, cl_mem *input_cl_mem_buffers, size_t size_of_input_buffers_in_bytes, size_t size_of_output_buffers_in_bytes,
+ cl_mem *BuffersOut, size_t* lengths, size_t * strides, const size_t *inStrides, const size_t *outStrides, size_t *o_strides,
+ size_t batch_size, size_t fftBatchSize, size_t fftVectorSizePadded, size_t outfftVectorSize,
+ size_t outfftVectorSizePadded, size_t fftVectorSize, clfftDim dim, clfftDirection dir, bool hasPrecallback)
+{
+ bool checkflag= false;
+
+ switch( out_layout )
+ {
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_COMPLEX_INTERLEAVED:
+ {
+ std::vector< std::complex< T > > output( outfftBatchSize );
+
+ if( place == CLFFT_INPLACE )
+ {
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &output[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ }
+ else
+ {
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &output[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ }
+
+ //check output data
+ if (hasPrecallback)
+ {
+ switch(in_layout)
+ {
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_COMPLEX_INTERLEAVED:
+ {
+ if (precision == CLFFT_SINGLE)
+ {
+ fftwf_complex *refout;
+
+ refout = get_fftwf_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+ in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
+
+ if (!compare<fftwf_complex, T>(refout, output, outfftBatchSize))
+ checkflag = true;
+
+ //for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+ //{
+ // std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+ //}
+
+ /*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+ {
+ std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+ }*/
+
+ fftwf_free(refout);
+ }
+ else if (precision == CLFFT_DOUBLE)
+ {
+ fftw_complex *refout;
+
+ refout = get_fftw_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+ in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
+
+ if (!compare<fftw_complex, T>(refout, output, outfftBatchSize))
+ checkflag = true;
+
+ /*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+ {
+ std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+ }*/
+
+ fftw_free(refout);
+ }
+ }
+ break;
+ }
+ }
+ else
+ {
+ for( cl_uint i = 0; i < outfftBatchSize; ++i )
+ {
+ if (0 == (i % outfftVectorSizePadded))
{
- h_userdata[ i ] = SCALAR + (i % fftVectorSize);
+ if (output[i].real() != outfftVectorSize)
+ {
+ checkflag = true;
+ break;
+ }
+
+ }
+ else
+ {
+ if (output[ i ].real() != 0)
+ {
+ checkflag = true;
+ break;
+ }
}
- userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * fftBatchSize, (void*)h_userdata, NULL);
- //Register the callback
- OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
+ if (output[ i ].imag() != 0)
+ {
+ checkflag = true;
+ break;
+ }
}
- break;
- default:
- break;
}
}
+ break;
+ case CLFFT_HERMITIAN_PLANAR:
+ case CLFFT_COMPLEX_PLANAR:
+ {
+ std::valarray< T > real( outfftBatchSize );
+ std::valarray< T > imag( outfftBatchSize );
- if (in_layout == CLFFT_COMPLEX_PLANAR)
+ if( place == CLFFT_INPLACE )
+ {
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ }
+ else
+ {
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 1 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &imag[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ }
+
+ // Check output data
+ if (hasPrecallback)
+ {
+ switch(in_layout)
+ {
+ case CLFFT_COMPLEX_PLANAR:
+ {
+ if (precision == CLFFT_SINGLE)
+ {
+ fftwf_complex *refout;
+
+ refout = get_fftwf_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+ in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
+
+ if (!compare<fftwf_complex, T>(refout, real, imag, outfftBatchSize))
+ checkflag = true;
+
+ /*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+ {
+ std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
+ }*/
+
+ fftwf_free(refout);
+ }
+ else if (precision == CLFFT_DOUBLE)
+ {
+ fftw_complex *refout;
+
+ refout = get_fftw_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+ in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
+
+ if (!compare<fftw_complex, T>(refout, real, imag, outfftBatchSize))
+ checkflag = true;
+
+ /*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+ {
+ std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
+ }*/
+
+ fftw_free(refout);
+ }
+ }
+ break;
+ }
+ }
+ else
+ {
+ for( cl_uint i = 0; i < outfftBatchSize; ++i )
+ {
+ if (0 == (i % outfftVectorSizePadded))
+ {
+ if (real[i] != outfftVectorSize)
+ {
+ checkflag = true;
+ break;
+ }
+ }
+ else
+ {
+ if (real[i] != 0)
+ {
+ checkflag = true;
+ break;
+ }
+ }
+
+ if (imag[i] != 0)
+ {
+ checkflag = true;
+ break;
+ }
+ }
+ }
+ }
+ break;
+ case CLFFT_REAL:
{
- switch (precallbakType)
+ std::valarray< T > real( outfftBatchSize );
+
+ if( place == CLFFT_INPLACE )
+ {
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ }
+ else
+ {
+ OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
+ 0, NULL, NULL ),
+ "Reading the result buffer failed" );
+ }
+
+ // Check output data
+ if (hasPrecallback)
{
- case 1: //C2C 1D PLANAR
+ if (precision == CLFFT_SINGLE)
{
- char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_PLANAR) : STRINGIFY(MULVAL_PLANAR_DP);
- USER_DATA *h_userdata = (USER_DATA*)malloc(sizeof(USER_DATA) * fftBatchSize);
- for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+ float *refout;
+
+ refout = get_fftwf_output_c2r(lengths, strides, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+ in_layout, outfftVectorSizePadded, outfftVectorSize, dim, dir);
+
+ if (!compare<float, T>(refout, real, outfftVectorSize, batch_size))
+ checkflag = true;
+
+ /*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
{
- h_userdata[i].scalar1 = SCALAR + (i % fftVectorSize);
- h_userdata[i].scalar2 = SCALAR + (i % fftVectorSize) + 1;
+ std::cout << "i " << i << " refreal " << refout[i] << " clreal " << (real[i] * outfftVectorSize) << std::endl;
+ }*/
+
+ if (refout)
+ free(refout);
+ }
+ }
+ else
+ {
+ for(size_t b = 0; b < batch_size; b++)
+ {
+ size_t p3 = b * o_strides[3];
+ for(size_t k = 0; k < lengths[2]; k++)
+ {
+ size_t p2 = p3 + k * o_strides[2];
+ for(size_t j = 0; j < lengths[1]; j++)
+ {
+ size_t p1 = p2 + j * o_strides[1];
+ for(size_t i = 0; i < lengths[0]; i++)
+ {
+ size_t p0 = p1 + i * o_strides[0];
+
+ if (real[p0] != 1)
+ {
+ checkflag = true;
+ break;
+ }
+
+ }
+ }
}
- userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA) * fftBatchSize, (void*)h_userdata, NULL);
+ }
+ }
+ }
+ break;
+ default:
+ {
+ throw std::runtime_error( "Input layout format not yet supported" );
+ }
+ break;
+ }
+
+ if (checkflag)
+ {
+ std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
+ }
+ else
+ {
+ std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
+ }
+}
+
+template < typename T >
+int transform( size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
+ clfftLayout in_layout, clfftLayout out_layout,
+ clfftResultLocation place, clfftPrecision precision, clfftDirection dir,
+ cl_device_type deviceType, cl_int deviceId, cl_int platformId, bool printInfo,
+ cl_uint command_queue_flags, cl_uint profile_count,
+ std::auto_ptr< clfftSetupData > setupData,
+ bool hasPrecallback)
+{
+ // Our command line does not specify what dimension FFT we wish to transform; we decode
+ // this from the lengths that the user specifies for X, Y, Z. A length of one means that
+ // the user does not want that dimension.
- //Register the callback
- OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, STRINGIFY(STRUCT_USERDATA), 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
+ size_t strides[ 4 ];
+ size_t o_strides[ 4 ];
+ size_t fftVectorSize = 0;
+ size_t fftVectorSizePadded = 0;
+ size_t fftBatchSize = 0;
+ size_t outfftVectorSize = 0;
+ size_t outfftVectorSizePadded = 0;
+ size_t outfftBatchSize = 0;
+ size_t size_of_input_buffers_in_bytes = 0;
+ size_t size_of_output_buffers_in_bytes = 0;
+
+ clfftDim dim = CLFFT_1D;
+ cl_mem input_cl_mem_buffers [2] = { NULL, NULL };
+ cl_mem output_cl_mem_buffers[2] = { NULL, NULL };
+ cl_context context;
+ cl_command_queue queue;
+ cl_event outEvent = NULL;
+ clfftPlanHandle plan_handle;
+
+ //Validate input and output data layout
+ validateDataLayout(in_layout, out_layout, place);
+
+ if (hasPrecallback && !(in_layout == CLFFT_COMPLEX_INTERLEAVED || in_layout == CLFFT_COMPLEX_PLANAR || in_layout == CLFFT_HERMITIAN_INTERLEAVED))
+ {
+ terr << _T("Pre-callback feature is currently supported only for Complex-Complex and Complex-Real Interleaved FFT " ) << std::endl;
+ return 1;
+ }
+
+ //Initializations
+ OPENCL_V_THROW( dataInitialize<T>(lengths, &dim, batch_size, inStrides, strides, outStrides, o_strides, &fftBatchSize, &outfftBatchSize,
+ &fftVectorSizePadded, in_layout, out_layout, &outfftVectorSizePadded, &fftVectorSize, &outfftVectorSize, place,
+ &size_of_output_buffers_in_bytes, &size_of_input_buffers_in_bytes, input_cl_mem_buffers, output_cl_mem_buffers, &context, &queue,
+ deviceType, deviceId, platformId, command_queue_flags), "Data Initialization failed");
+
+ // Discover and load the timer module if present
+ void* timerLibHandle = LoadSharedLibrary( "lib", "StatTimer", false );
+ if( timerLibHandle == NULL )
+ {
+ terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl;
+ }
+
+
+ // Timer module discovered and loaded successfully
+ // Initialize function pointers to call into the shared module
+ PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) );
+
+ // Create and initialize our timer class, if the external timer shared library loaded
+ baseStatTimer* timer = NULL;
+ size_t clFFTID = 0;
+ if( get_timer )
+ {
+ timer = get_timer( CLFFT_GPU );
+ timer->Reserve( 1, profile_count );
+ timer->setNormalize( true );
+
+ clFFTID = timer->getUniqueID( "clFFT", 0 );
+ }
+
+ OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
+ OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" );
+
+ // Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
+ OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
+ OPENCL_V_THROW( clfftSetLayout( plan_handle, in_layout, out_layout ), "clfftSetLayout failed" );
+ OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batch_size ), "clfftSetPlanBatchSize failed" );
+ OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
+
+ OPENCL_V_THROW (clfftSetPlanInStride ( plan_handle, dim, strides ), "clfftSetPlanInStride failed" );
+ OPENCL_V_THROW (clfftSetPlanOutStride ( plan_handle, dim, o_strides ), "clfftSetPlanOutStride failed" );
+ OPENCL_V_THROW (clfftSetPlanDistance ( plan_handle, strides[ 3 ], o_strides[ 3 ]), "clfftSetPlanDistance failed" );
+
+ // Set backward scale factor to 1.0 for non real FFTs to do correct output checks
+ if(dir == CLFFT_BACKWARD && in_layout != CLFFT_REAL && out_layout != CLFFT_REAL)
+ OPENCL_V_THROW (clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, (cl_float)1.0f ), "clfftSetPlanScale failed" );
+
+ //Check for Precallback
+ //Currently the test covers only the 1D case
+ if (hasPrecallback)
+ {
+ cl_mem userdata;
+
+ //C2C 1D Interleaved
+ if (in_layout == CLFFT_COMPLEX_INTERLEAVED )
+ {
+ char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL) : STRINGIFY(MULVAL_DP);
+
+ int *h_userdata = (int*)malloc(sizeof(int)*fftBatchSize);
+ for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+ {
+ h_userdata[ i ] = SCALAR + (i % fftVectorSize);
+ }
+ userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * fftBatchSize, (void*)h_userdata, NULL);
+
+ //Register the callback
+ OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
+ }
+ else if (in_layout == CLFFT_HERMITIAN_INTERLEAVED)
+ {
+ char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_C2R) : STRINGIFY(MULVAL_DP);
+
+ int *h_userdata = (int*)malloc(sizeof(int)*fftBatchSize);
+ for(size_t b = 0; b < batch_size; b++)
+ {
+ size_t p3 = b * strides[3];
+
+ for( size_t i = 0; i < fftVectorSizePadded; i = i + inStrides[0])
+ {
+ h_userdata[ p3 + i ] = SCALAR + i;
}
- break;
- default:
- break;
}
+ userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * fftBatchSize, (void*)h_userdata, NULL);
+
+ //Register the callback
+ OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
+ }
+
+ //C2C PLANAR
+ if (in_layout == CLFFT_COMPLEX_PLANAR)
+ {
+ char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_PLANAR) : STRINGIFY(MULVAL_PLANAR_DP);
+ USER_DATA *h_userdata = (USER_DATA*)malloc(sizeof(USER_DATA) * fftBatchSize);
+ for( size_t i = 0; i < fftBatchSize; i = i + inStrides[0])
+ {
+ h_userdata[i].scalar1 = SCALAR + (int)(i % fftVectorSize);
+ h_userdata[i].scalar2 = SCALAR + (int)(i % fftVectorSize) + 1;
+ }
+ userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA) * fftBatchSize, (void*)h_userdata, NULL);
+
+ //Register the callback
+ OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, STRINGIFY(STRUCT_USERDATA), 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
}
}
@@ -601,32 +1248,6 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
}
- if (( place == CLFFT_INPLACE )
- && ( in_layout != out_layout ))
- {
- switch( in_layout )
- {
- case CLFFT_COMPLEX_INTERLEAVED:
- {
- if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
- {
- throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
- }
- break;
- }
- case CLFFT_COMPLEX_PLANAR:
- {
- if( (out_layout == CLFFT_COMPLEX_INTERLEAVED) || (out_layout == CLFFT_HERMITIAN_INTERLEAVED) )
- {
- throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
- }
- break;
- }
- default:
- terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
- return 1;
- }
- }
cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &output_cl_mem_buffers[ 0 ];
@@ -675,217 +1296,9 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
if (( place == CLFFT_OUTOFPLACE )
|| ( profile_count == 1))
{
- bool checkflag= false;
- switch( out_layout )
- {
- case CLFFT_HERMITIAN_INTERLEAVED:
- case CLFFT_COMPLEX_INTERLEAVED:
- {
- std::vector< std::complex< T > > output( outfftBatchSize );
-
- if( place == CLFFT_INPLACE )
- {
- OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &output[ 0 ],
- 0, NULL, NULL ),
- "Reading the result buffer failed" );
- }
- else
- {
- OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &output[ 0 ],
- 0, NULL, NULL ),
- "Reading the result buffer failed" );
- }
-
- //check output data
- if (hasPrecallback)
- {
- switch(in_layout)
- {
- case CLFFT_COMPLEX_INTERLEAVED:
- {
- if (precision == CLFFT_SINGLE)
- {
- fftwf_complex *refout;
-
- refout = get_fftwf_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
- in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
-
- if (!compare(refout, output, outfftBatchSize))
- checkflag = true;
-
- //for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
- //{
- // std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
- //}
-
- fftwf_free(refout);
- }
- else if (precision == CLFFT_DOUBLE)
- {
- fftw_complex *refout;
-
- refout = get_fftw_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
- in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
-
- if (!compare(refout, output, outfftBatchSize))
- checkflag = true;
-
- /*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
- {
- std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
- }*/
-
- fftw_free(refout);
- }
- }
- break;
- }
- }
- else
- {
- for( cl_uint i = 0; i < outfftBatchSize; ++i )
- {
- if (0 == (i % outfftVectorSizePadded))
- {
- if (output[i].real() != outfftVectorSize)
- {
- checkflag = true;
- break;
- }
-
- }
- else
- {
- if (output[ i ].real() != 0)
- {
- checkflag = true;
- break;
- }
- }
-
- if (output[ i ].imag() != 0)
- {
- checkflag = true;
- break;
- }
- }
- }
- }
- break;
- case CLFFT_HERMITIAN_PLANAR:
- case CLFFT_COMPLEX_PLANAR:
- {
- std::valarray< T > real( outfftBatchSize );
- std::valarray< T > imag( outfftBatchSize );
-
- if( place == CLFFT_INPLACE )
- {
- OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
- 0, NULL, NULL ),
- "Reading the result buffer failed" );
- OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
- 0, NULL, NULL ),
- "Reading the result buffer failed" );
- }
- else
- {
- OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
- 0, NULL, NULL ),
- "Reading the result buffer failed" );
- OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 1 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &imag[ 0 ],
- 0, NULL, NULL ),
- "Reading the result buffer failed" );
- }
-
- // Check output data
- if (hasPrecallback)
- {
- switch(in_layout)
- {
- case CLFFT_COMPLEX_PLANAR:
- {
- if (precision == CLFFT_SINGLE)
- {
- fftwf_complex *refout;
-
- refout = get_fftwf_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
- in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
-
- if (!compare(refout, real, imag, outfftBatchSize))
- checkflag = true;
-
- /*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
- {
- std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
- }*/
-
- fftwf_free(refout);
- }
- else if (precision == CLFFT_DOUBLE)
- {
- fftw_complex *refout;
-
- refout = get_fftw_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
- in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
-
- if (!compare(refout, real, imag, outfftBatchSize))
- checkflag = true;
-
- /*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
- {
- std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
- }*/
-
- fftw_free(refout);
- }
- }
- break;
- }
- }
- else
- {
- for( cl_uint i = 0; i < outfftBatchSize; ++i )
- {
- if (0 == (i % outfftVectorSizePadded))
- {
- if (real[i] != outfftVectorSize)
- {
- checkflag = true;
- break;
- }
- }
- else
- {
- if (real[i] != 0)
- {
- checkflag = true;
- break;
- }
- }
-
- if (imag[i] != 0)
- {
- checkflag = true;
- break;
- }
- }
- }
- }
- break;
- default:
- terr << _T("Complex-Real callback cases not yet implemented" ) << std::endl;
- throw std::runtime_error( "Input layout format not yet supported" );
- break;
- }
-
- if (checkflag)
- {
- std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
- }
- else
- {
- std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
- }
+ compareWithReference<T>(in_layout, out_layout, outfftBatchSize, place, precision, queue, input_cl_mem_buffers, size_of_input_buffers_in_bytes, size_of_output_buffers_in_bytes,
+ BuffersOut, lengths, strides, inStrides, outStrides, o_strides, batch_size, fftBatchSize, fftVectorSizePadded, outfftVectorSize, outfftVectorSizePadded, fftVectorSize,
+ dim, dir, hasPrecallback);
}
OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
@@ -1059,10 +1472,42 @@ int main(int argc, char **argv)
oStrides[3] = oStrides[3] ? oStrides[3] : lengths[2] * oStrides[2];
}
}
- else
+ else // Real-Complex and Complex-Real cases
{
- terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
- return 1;
+ size_t *rst, *cst;
+ size_t N = lengths[0];
+ size_t Nt = 1 + lengths[0]/2;
+ bool iflag = false;
+ bool rcFull = (inL == 1) || (inL == 2) || (otL == 1) || (otL == 2);
+
+ if(inLayout == CLFFT_REAL) { iflag = true; rst = iStrides; }
+ else { rst = oStrides; } // either in or out should be REAL
+
+ // Set either in or out strides whichever is real
+ if(place == CLFFT_INPLACE)
+ {
+ if(rcFull) { rst[1] = rst[1] ? rst[1] : N * 2 * rst[0]; }
+ else { rst[1] = rst[1] ? rst[1] : Nt * 2 * rst[0]; }
+
+ rst[2] = rst[2] ? rst[2] : lengths[1] * rst[1];
+ rst[3] = rst[3] ? rst[3] : lengths[2] * rst[2];
+ }
+ else
+ {
+ rst[1] = rst[1] ? rst[1] : lengths[0] * rst[0];
+ rst[2] = rst[2] ? rst[2] : lengths[1] * rst[1];
+ rst[3] = rst[3] ? rst[3] : lengths[2] * rst[2];
+ }
+
+ // Set the remaining strides (in or out), i.e. whichever side is not real
+ if(iflag) { cst = oStrides; }
+ else { cst = iStrides; }
+
+ if(rcFull) { cst[1] = cst[1] ? cst[1] : N * cst[0]; }
+ else { cst[1] = cst[1] ? cst[1] : Nt * cst[0]; }
+
+ cst[2] = cst[2] ? cst[2] : lengths[1] * cst[1];
+ cst[3] = cst[3] ? cst[3] : lengths[2] * cst[2];
}
if( precision == CLFFT_SINGLE )
diff --git a/src/library/accessors.cpp b/src/library/accessors.cpp
index cbd06fc..493c05d 100644
--- a/src/library/accessors.cpp
+++ b/src/library/accessors.cpp
@@ -781,7 +781,7 @@ clfftStatus clFFTSetPlanCallback(clfftPlanHandle plHandle, const char* funcName,
if (callbackType == PRECALLBACK)
{
- if (fftPlan->inputLayout == CLFFT_COMPLEX_INTERLEAVED || fftPlan->inputLayout == CLFFT_COMPLEX_PLANAR)
+ if (fftPlan->inputLayout == CLFFT_COMPLEX_INTERLEAVED || fftPlan->inputLayout == CLFFT_COMPLEX_PLANAR || fftPlan->inputLayout == CLFFT_HERMITIAN_INTERLEAVED)
{
if (funcName != NULL && funcString != NULL)
{
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 854ffcc..df35141 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -825,6 +825,7 @@ namespace StockhamGenerator
return;
}
+ int hid;
for(size_t i=0; i<numB; i++)
{
std::string regBaseCount = regBase;
@@ -841,6 +842,7 @@ namespace StockhamGenerator
{
std::string tail;
std::string regIndex;
+ std::string regIndexC;
regIndex = linearRegs ? "(*R" : regBaseCount;
std::string buffer;
@@ -856,7 +858,17 @@ namespace StockhamGenerator
{
if(c == 0)
{
- if(linearRegs) { RegBaseAndCountAndPos("", i*radix + r, regIndex); regIndex += ").x"; }
+ if(linearRegs)
+ {
+ RegBaseAndCountAndPos("", i*radix + r, regIndex);
+
+ hid = r / (numB * radix / 2);
+ if (fft_doPreCallback && component == SR_COMP_REAL && hid != 0)
+ {
+ regIndexC = regIndex; regIndexC += ").y";
+ }
+ regIndex += ").x";
+ }
else { RegBaseAndCountAndPos("R", r, regIndex); }
buffer = bufferRe;
tail = interleaved ? ".x;" : ";";
@@ -889,7 +901,7 @@ namespace StockhamGenerator
//If precallback is set invoke callback function
//Invoke callback only once in Planar data layout (i.e.c==0)
- if (fft_doPreCallback && c == 0)
+ if (fft_doPreCallback && c == 0 && component == SR_COMP_BOTH)
{
passStr += "\n\t";
passStr += "retPrecallback["; passStr += SztToStr(v); passStr += "] = "; passStr += fft_preCallback.funcname; passStr += "(";
@@ -909,12 +921,18 @@ namespace StockhamGenerator
passStr += ");";
}
+ if (fft_doPreCallback && component == SR_COMP_REAL && hid != 0)
+ {
+ passStr += "\n\t";
+ passStr += regIndexC; passStr += " = "; passStr += regIndexSub; passStr += ";";
+ }
+
passStr += "\n\t";
passStr += regIndexSub;
passStr += " = ";
//Use the return value from precallback if set
- if (fft_doPreCallback)
+ if (fft_doPreCallback && component == SR_COMP_BOTH)
{
passStr += "retPrecallback["; passStr += SztToStr(v); passStr += "]";
passStr += interleaved ? tail : (c == 0) ? ".x;" : ".y;";
@@ -1210,22 +1228,30 @@ namespace StockhamGenerator
std::string regIndex = "(*R";
std::string buffer;
- if(c == 0)
+ RegBaseAndCountAndPos("", r, regIndex);
+ if (fft_doPreCallback && interleaved)
{
- RegBaseAndCountAndPos("", r, regIndex); regIndex += ").x";
- buffer = bufferRe;
- tail = interleaved ? ".x;" : ";";
- tail2 = interleaved ? ".y;" : ";";
+ regIndex += ")";
+ buffer = (c == 0) ? bufferRe : bufferIm;
}
else
{
- RegBaseAndCountAndPos("", r, regIndex); regIndex += ").y";
- buffer = bufferIm;
- tail = interleaved ? ".y;" : ";";
- tail2 = interleaved ? ".x;" : ";";
+ if(c == 0)
+ {
+ regIndex += ").x";
+ buffer = bufferRe;
+ tail = interleaved ? ".x;" : ";";
+ tail2 = interleaved ? ".y;" : ";";
+ }
+ else
+ {
+ regIndex += ").y";
+ buffer = bufferIm;
+ tail = interleaved ? ".y;" : ";";
+ tail2 = interleaved ? ".x;" : ";";
+ }
}
-
size_t bid = numCR/2;
bid = bid ? bid : 1;
size_t cid, lid;
@@ -1263,8 +1289,17 @@ namespace StockhamGenerator
{
if(act)
{
- passStr += buffer;
- passStr += "["; passStr += offset; passStr += " + ( ";
+ if (fft_doPreCallback)
+ {
+ passStr += fft_preCallback.funcname; passStr += "(";
+ passStr += buffer; passStr += ", ";
+ }
+ else
+ {
+ passStr += buffer;
+ passStr += "[";
+ }
+ passStr += offset; passStr += " + ( ";
}
if(fwd)
@@ -1280,10 +1315,20 @@ namespace StockhamGenerator
if(act)
{
- passStr += " )*"; passStr += SztToStr(stride); passStr += "]";
+ passStr += " )*"; passStr += SztToStr(stride);
+
+ if (fft_doPreCallback)
+ {
+ passStr += ", userdata";
+ passStr += (fft_preCallback.localMemSize > 0) ? ", localmem);" : ");";
+ }
+ else
+ {
+ passStr += "]";
- if(fwd) { passStr += tail; }
- else { if(!batch2) passStr += tail; else passStr += tail2; }
+ if(fwd) { passStr += tail; }
+ else { if(!batch2) passStr += tail; else passStr += tail2; }
+ }
}
}
}
@@ -1439,7 +1484,8 @@ namespace StockhamGenerator
passStr += "( ";
if(c == 0)
{
- regIndex += ".x"; regIndexPair += ".x";
+ regIndex += ".x";
+ regIndexPair += fft_doPreCallback ? ".y" : ".x";
if(!batch2) { passStr += regIndex; passStr += " - "; passStr += regIndexPair; }
else { passStr += regIndex; passStr += " + "; passStr += regIndexPair; }
@@ -1737,8 +1783,13 @@ namespace StockhamGenerator
}
//Include callback parameters if callback is set
- if (fft_doPreCallback)
+ if (fft_doPreCallback && !r2c)
{
+ if (c2r)
+ {
+ passStr += ", uint inOffset2";
+ }
+
passStr += ", __global void* userdata";
if (fft_preCallback.localMemSize > 0)
@@ -1830,7 +1881,17 @@ namespace StockhamGenerator
passStr += "\n\tif(rw && !me)\n\t{\n\t";
passStr += processBufRe; passStr += "["; passStr += processBufOffset; passStr += "] = ";
- passStr += bufferInRe; passStr+= "[inOffset]";
+
+ if (fft_doPreCallback)
+ {
+ passStr += fft_preCallback.funcname; passStr += "("; passStr += bufferInRe; passStr += ", inOffset, userdata";
+ passStr += fft_preCallback.localMemSize > 0 ? ", localmem)" : ")";
+ }
+ else
+ {
+ passStr += bufferInRe; passStr+= "[inOffset]";
+ }
+
if(inInterleaved) passStr += ".x;\n\t}"; else passStr += ";\n\t}";
if(length > 1)
@@ -1840,7 +1901,15 @@ namespace StockhamGenerator
passStr += "\n\t}\n";
passStr += "\n\tif(rw > 1)\n\t{";
- SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, true, false, bufferInIm2, bufferInIm2, "inOffset", passStr);
+ if (fft_doPreCallback)
+ {
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, true, false, bufferInIm2, bufferInIm2, "inOffset2", passStr);
+ }
+ else
+ {
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, true, false, bufferInIm2, bufferInIm2, "inOffset", passStr);
+ }
+
passStr += "\n\t}\n\telse\n\t{";
SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, true, true, false, bufferInIm2, bufferInIm2, "inOffset", passStr);
passStr += "\n\t}\n";
@@ -1879,22 +1948,34 @@ namespace StockhamGenerator
passStr += "\n\tif((rw > 1) && !me)\n\t{\n\t";
passStr += processBufIm; passStr += "["; passStr += processBufOffset; passStr += "] = ";
- passStr += bufferInRe2; passStr+= "[inOffset]";
+
+ if (fft_doPreCallback)
+ {
+ passStr += fft_preCallback.funcname; passStr += "("; passStr += bufferInRe2; passStr += ", inOffset, userdata";
+ passStr += fft_preCallback.localMemSize > 0 ? ", localmem)" : ")";
+ }
+ else
+ {
+ passStr += bufferInRe2; passStr+= "[inOffset]";
+ }
if(inInterleaved) passStr += ".x;\n\t}"; else passStr += ";\n\t}";
passStr += "\n\tif((rw == 1) && !me)\n\t{\n\t"; passStr += processBufIm; passStr += "["; passStr += processBufOffset; passStr += "] = 0;\n\t}";
if(length > 1)
{
- passStr += "\n\n\tif(rw)\n\t{";
- SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, false, false, bufferInIm, bufferInIm, "inOffset", passStr);
- passStr += "\n\t}\n";
+ if (!fft_doPreCallback)
+ {
+ passStr += "\n\n\tif(rw)\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, false, false, bufferInIm, bufferInIm, "inOffset", passStr);
+ passStr += "\n\t}\n";
- passStr += "\n\tif(rw > 1)\n\t{";
- SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, true, false, bufferInRe2, bufferInRe2, "inOffset", passStr);
- passStr += "\n\t}\n\telse\n\t{";
- SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, true, true, false, bufferInRe2, bufferInRe2, "inOffset", passStr);
- passStr += "\n\t}\n";
+ passStr += "\n\tif(rw > 1)\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, true, false, bufferInRe2, bufferInRe2, "inOffset", passStr);
+ passStr += "\n\t}\n\telse\n\t{";
+ SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, true, true, false, bufferInRe2, bufferInRe2, "inOffset", passStr);
+ passStr += "\n\t}\n";
+ }
if(oddp)
{
@@ -2394,7 +2475,7 @@ namespace StockhamGenerator
//Pass precallback information to Pass object if it's the first pass.
//This will be used in single kernel transforms
- if (!r2c2r && i == 0 && !params.blockCompute && params.fft_hasPreCallback)
+ if (params.fft_hasPreCallback && !r2c && i == 0 && !params.blockCompute)
{
passes[0].SetPrecallback(params.fft_hasPreCallback, params.fft_preCallback);
}
@@ -2794,12 +2875,20 @@ namespace StockhamGenerator
{
if(outInterleaved)
{
- str += "__global "; str += r2Type; str += " * restrict gb)\n";
+ str += "__global "; str += r2Type; str += " * restrict gb";
}
else
{
- str += "__global "; str += rType; str += " * restrict gb)\n";
+ str += "__global "; str += rType; str += " * restrict gb";
}
+
+ //If plan has pre-callback
+ if (params.fft_hasPreCallback)
+ {
+ str += callbackstr;
+ }
+
+ str += ")\n";
}
else
{
@@ -2941,8 +3030,8 @@ namespace StockhamGenerator
if(inInterleaved)
{
- if(!rcSimple) { str += "__global "; str += r2Type; str += " *lwbIn2;\n\t"; }
- str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
+ if(!rcSimple && !params.fft_hasPreCallback) { str += "__global "; str += r2Type; str += " *lwbIn2;\n\t"; }
+ if(!params.fft_hasPreCallback) { str += "__global "; str += r2Type; str += " *lwbIn;\n\t"; }
}
else if(inReal)
{
@@ -2952,10 +3041,13 @@ namespace StockhamGenerator
}
else
{
- if(!rcSimple) { str += "__global "; str += rType; str += " *lwbInRe2;\n\t"; }
- if(!rcSimple) { str += "__global "; str += rType; str += " *lwbInIm2;\n\t"; }
- str += "__global "; str += rType; str += " *lwbInRe;\n\t";
- str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+ if(!rcSimple && !params.fft_hasPreCallback) { str += "__global "; str += rType; str += " *lwbInRe2;\n\t"; }
+ if(!rcSimple && !params.fft_hasPreCallback) { str += "__global "; str += rType; str += " *lwbInIm2;\n\t"; }
+ if (!params.fft_hasPreCallback)
+ {
+ str += "__global "; str += rType; str += " *lwbInRe;\n\t";
+ str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+ }
}
if(outInterleaved)
@@ -3108,8 +3200,8 @@ namespace StockhamGenerator
{
if(inInterleaved)
{
- if(!rcSimple) { str += "lwbIn2 = (__global "; str += r2Type; str += " *)gb + iOffset2;\n\t"; }
- str += "lwbIn = (__global "; str += r2Type; str += " *)gb + iOffset;\n\t";
+ if(!rcSimple && !params.fft_hasPreCallback) { str += "lwbIn2 = (__global "; str += r2Type; str += " *)gb + iOffset2;\n\t"; }
+ if(!params.fft_hasPreCallback) { str += "lwbIn = (__global "; str += r2Type; str += " *)gb + iOffset;\n\t"; }
}
else
{
@@ -3219,9 +3311,9 @@ namespace StockhamGenerator
}
std::string inOffset;
- if (!r2c2r)
+ if (!r2c)
{
- if (params.fft_placeness == CLFFT_INPLACE)
+ if (params.fft_placeness == CLFFT_INPLACE && !c2r)
{
inOffset += "ioOffset";
}
@@ -3340,8 +3432,9 @@ namespace StockhamGenerator
}
else
{
- if(inInterleaved || inReal) inBuf = "lwbIn, lwbIn2, ";
- else inBuf = "lwbInRe, lwbInRe2, lwbInIm, lwbInIm2, ";
+ if(inInterleaved || inReal) inBuf = (inInterleaved && params.fft_hasPreCallback) ? "gb, gb, " : "lwbIn, lwbIn2, ";
+ else inBuf = (params.fft_hasPreCallback) ? "gbInRe, gbInRe, gbInIm, gbInIm, " : "lwbInRe, lwbInRe2, lwbInIm, lwbInIm2, ";
+
if(outInterleaved || outReal) outBuf = "lwbOut, lwbOut2";
else outBuf = "lwbOutRe, lwbOutRe2, lwbOutIm, lwbOutIm2";
}
@@ -3390,7 +3483,14 @@ namespace StockhamGenerator
str += "\t";
str += PassName(0, fwd);
str += "("; str += rw; str += me;
- str += (!r2c2r) ? inOffset : "0";
+ if (r2c)
+ {
+ str += "0";
+ }
+ else
+ {
+ str += (params.fft_hasPreCallback || !c2r) ? inOffset : "0";
+ }
str += ", 0, ";
str += inBuf; str += outBuf;
str += IterRegs("&");
@@ -3454,7 +3554,14 @@ namespace StockhamGenerator
}
else
{
- str += (!r2c2r) ? inOffset : "0";
+ if (r2c)
+ {
+ str += "0";
+ }
+ else
+ {
+ str += (params.fft_hasPreCallback || !c2r) ? inOffset : "0";
+ }
}
str += ", ";
str += ldsOff;
@@ -3463,9 +3570,9 @@ namespace StockhamGenerator
str += ldsArgs; str += IterRegs("&");
//if precallback set, append additional arguments
- if (!r2c2r && !blockCompute && params.fft_hasPreCallback)
+ if (!r2c && !blockCompute && params.fft_hasPreCallback)
{
- str += ", userdata";
+ str += c2r ? ", iOffset2, userdata" : ", userdata";
if (params.fft_preCallback.localMemSize > 0)
{
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list