[clfft] 11/128: Precallback - Complex-Real 1D single kernel SP

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:33 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit 8f07253282196bb552aa153f266be68f8d3f60e8
Author: Pradeep <pradeep.rao at amd.com>
Date:   Mon Aug 10 13:24:23 2015 +0530

    Precallback - Complex-Real 1D single kernel SP
---
 src/client-callback/callback-client.cpp | 1317 +++++++++++++++++++++----------
 src/library/accessors.cpp               |    2 +-
 src/library/generator.stockham.cpp      |  207 +++--
 3 files changed, 1039 insertions(+), 487 deletions(-)

diff --git a/src/client-callback/callback-client.cpp b/src/client-callback/callback-client.cpp
index 5fb6c9e..1614922 100644
--- a/src/client-callback/callback-client.cpp
+++ b/src/client-callback/callback-client.cpp
@@ -13,8 +13,7 @@
 
 namespace po = boost::program_options;
 
-#define SCALAR 100
-#define PRECALLBACKTYPE 1
+#define SCALAR 10
 
 #define MULVAL float2 mulval(__global void* in, uint offset, __global void* userdata)\n \
 				{ \n \
@@ -23,6 +22,13 @@ namespace po = boost::program_options;
 				return ret; \n \
 				}
 
+#define MULVAL_C2R float2 mulval(__global void* in, uint offset, __global void* userdata)\n \
+				{ \n \
+				int scalar = *((__global int*)userdata + offset); \n \
+				float2 ret = *((__global float2*)in + offset) * scalar; \n \
+				return ret; \n \
+				}
+
 #define MULVAL_DP double2 mulval(__global void* in, uint offset, __global void* userdata)\n \
 				{ \n \
 				int scalar = *((__global int*)userdata + offset); \n \
@@ -57,27 +63,27 @@ namespace po = boost::program_options;
 						} USER_DATA; 
 STRUCT_USERDATA
 
-//Compare reference and opencl output
+//Compare reference and opencl output 
 template < typename T1, typename T2>
 bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
-             const int length, const float epsilon = 1e-6f)
+             size_t length, const float epsilon = 1e-6f)
 {
     float error = 0.0f;
-    float ref = 0.0f;
-	float diff = 0.0f;
+    T1 ref;
+	T1 diff;
 	float normRef = 0.0f;
 	float normError = 0.0f;
 
-    for(int i = 0; i < length; ++i)
+    for(size_t i = 0; i < length; ++i)
     {
-        diff = refData[i][0] - data[i].real();
-        error += diff * diff;
-        ref += refData[i][0] * refData[i][0];
+        diff[0] = refData[i][0] - data[i].real();
+        error += (float)(diff[0] * diff[0]);
+        ref[0] += refData[i][0] * refData[i][0];
     }
 	if (error != 0)
 	{
-		normRef =::sqrtf((float) ref);
-		if (::fabs((float) ref) < 1e-7f)
+		normRef =::sqrtf((float) ref[0]);
+		if (::fabs((float) ref[0]) < 1e-7f)
 		{
 			return false;
 		}
@@ -90,19 +96,19 @@ bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
 
 	//imag
 	error = 0.0f;
-	ref = 0.0f;
-	for(int i = 0; i < length; ++i)
+	ref[1] = 0.0;
+	for(size_t i = 0; i < length; ++i)
     {
-        diff = refData[i][1] - data[i].imag();
-        error += diff * diff;
-        ref += refData[i][1] * refData[i][1];
+        diff[1] = refData[i][1] - data[i].imag();
+        error += (float)(diff[1] * diff[1]);
+        ref[1] += refData[i][1] * refData[i][1];
     }
 	
 	if (error == 0)
 		return true;
 
-	normRef =::sqrtf((float) ref);
-    if (::fabs((float) ref) < 1e-7f)
+	normRef =::sqrtf((float) ref[1]);
+    if (::fabs((float) ref[1]) < 1e-7f)
     {
         return false;
     }
@@ -118,25 +124,25 @@ bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
 //Compare reference and opencl output
 template < typename T1, typename T2 >
 bool compare(T1 *refData, std::valarray< T2 > real, std::valarray< T2 > imag,
-             const int length, const float epsilon = 1e-6f)
+             size_t length, const float epsilon = 1e-6f)
 {
     float error = 0.0f;
-    float ref = 0.0f;
-	float diff = 0.0f;
+    T1 ref;
+	T1 diff;
 	float normRef = 0.0f;
 	float normError = 0.0f;
 
 	//real compare
-    for(int i = 0; i < length; ++i)
+    for(size_t i = 0; i < length; ++i)
     {
-        diff = refData[i][0] - real[i];
-        error += diff * diff;
-        ref += refData[i][0] * refData[i][0];
+        diff[0] = refData[i][0] - real[i];
+        error += (float)(diff[0] * diff[0]);
+        ref[0] += refData[i][0] * refData[i][0];
     }
 	if (error != 0)
 	{
-		normRef =::sqrtf((float) ref);
-		if (::fabs((float) ref) < 1e-7f)
+		normRef =::sqrtf((float) ref[0]);
+		if (::fabs((float) ref[0]) < 1e-7f)
 		{
 			return false;
 		}
@@ -149,20 +155,20 @@ bool compare(T1 *refData, std::valarray< T2 > real, std::valarray< T2 > imag,
 
 	//imag compare
 	error = 0.0f;
-    ref = 0.0f;
+    ref[1] = 0.0;
 
-	for(int i = 0; i < length; ++i)
+	for(size_t i = 0; i < length; ++i)
     {
-        diff = refData[i][1] - imag[i];
-        error += diff * diff;
-        ref += refData[i][1] * refData[i][1];
+        diff[1] = refData[i][1] - imag[i];
+        error += (float)(diff[1] * diff[1]);
+        ref[1] += refData[i][1] * refData[i][1];
     }
 	
 	if (error == 0)
 		return true;
 
-    normRef =::sqrtf((float) ref);
-    if (::fabs((float) ref) < 1e-7f)
+    normRef =::sqrtf((float) ref[1]);
+    if (::fabs((float) ref[1]) < 1e-7f)
     {
         return false;
     }
@@ -175,6 +181,46 @@ bool compare(T1 *refData, std::valarray< T2 > real, std::valarray< T2 > imag,
 	return true;
 }
 
+//Compare reference and opencl output
+template < typename T1 , typename T2 >
+bool compare(T1 *refData, std::valarray< T2 > real, 
+             size_t length, int batchsize, const float epsilon = 1e-6f)
+{
+    float error = 0.0f;
+    T1 ref = 0.0;
+	T1 diff;
+	float normRef = 0.0f;
+	float normError = 0.0f;
+	size_t scale = length;
+
+	//real compare
+	for (int b = 0; b < batchsize; b++)
+	{	
+		int idx = b * (length + 2);
+		for(size_t i = idx; i < (idx + length); ++i)
+		{
+			diff = refData[i] - (real[i] * scale);
+			error += (float)(diff * diff);
+			ref += refData[i] * refData[i];
+		}
+	}
+	if (error != 0)
+	{
+		normRef =::sqrtf((float) ref);
+		if (::fabs((float) ref) < 1e-7f)
+		{
+			return false;
+		}
+		normError = ::sqrtf((float) error);
+		error = normError / normRef;
+    
+		if (error > epsilon)
+			return false;
+	}
+
+	return true;
+}
+
 // Compute reference output using fftw for float type
 fftwf_complex* get_fftwf_output(size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
 								size_t fftBatchSize, size_t outfftBatchSize, size_t fftVectorSizePadded, clfftLayout in_layout,
@@ -188,28 +234,28 @@ fftwf_complex* get_fftwf_output(size_t* lengths, const size_t *inStrides, const
 	fftwf_complex *refin = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftBatchSize);
 	fftwf_complex *refout = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*outfftBatchSize);
 
-	refPlan = fftwf_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size, 
-									refin, &fftwLengths[3 - dim], inStrides[0], fftVectorSizePadded, 
-									refout, &fftwLengths[3 - dim], outStrides[0], outfftVectorSizePadded, 
+	refPlan = fftwf_plan_many_dft(dim, &fftwLengths[3 - dim], (int)batch_size, 
+									refin, &fftwLengths[3 - dim], (int)inStrides[0], (int)fftVectorSizePadded, 
+									refout, &fftwLengths[3 - dim], (int)outStrides[0], (int)outfftVectorSizePadded, 
 									dir, FFTW_ESTIMATE);
 
 	int scalar;
-	for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+	for( size_t i = 0; i < fftBatchSize; i = i + inStrides[0])
 	{
 		switch (in_layout)
 		{
 		case CLFFT_COMPLEX_INTERLEAVED:
-			scalar = SCALAR + (i % fftVectorSize);
+			scalar = SCALAR + (int)(i % fftVectorSize);
 			break;
 		case CLFFT_COMPLEX_PLANAR:
-			scalar = (SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1);
+			scalar = (int)((SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1));
 			break;
 		default:
 			break;
 		}
 
-		refin[i][0] = 1 * scalar;
-		refin[i][1] = 0 * scalar;
+		refin[i][0] = (float)(1 * scalar);
+		refin[i][1] = (float)(0 * scalar);
 	}
 
 	fftwf_execute(refPlan);
@@ -234,21 +280,21 @@ fftw_complex* get_fftw_output(size_t* lengths, const size_t *inStrides, const si
 	//In FFTW last dimension has the fastest changing index
 	int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
 
-	refPlan = fftw_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size, 
-									refin, &fftwLengths[3 - dim], inStrides[0], fftVectorSizePadded, 
-									refout, &fftwLengths[3 - dim], outStrides[0], outfftVectorSizePadded, 
+	refPlan = fftw_plan_many_dft(dim, &fftwLengths[3 - dim], (int)batch_size, 
+									refin, &fftwLengths[3 - dim], (int)inStrides[0], (int)fftVectorSizePadded, 
+									refout, &fftwLengths[3 - dim], (int)outStrides[0], (int)outfftVectorSizePadded, 
 									dir, FFTW_ESTIMATE);
 							
 	int scalar;
-	for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+	for( size_t i = 0; i < fftBatchSize; i = i + inStrides[0])
 	{
 		switch (in_layout)
 		{
 		case CLFFT_COMPLEX_INTERLEAVED:
-			scalar = SCALAR + (i % fftVectorSize);
+			scalar = SCALAR + (int)(i % fftVectorSize);
 			break;
 		case CLFFT_COMPLEX_PLANAR:
-			scalar = (SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1);
+			scalar = (int)((SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1));
 			break;
 		default:
 			break;
@@ -267,6 +313,70 @@ fftw_complex* get_fftw_output(size_t* lengths, const size_t *inStrides, const si
 	return refout;
 }
 
+// Compute C2R reference output using fftw for float type
+float* get_fftwf_output_c2r(size_t* lengths, size_t *strides, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
+								size_t fftBatchSize, size_t outfftBatchSize, size_t fftVectorSizePadded, clfftLayout in_layout,
+								size_t outfftVectorSizePadded, size_t outfftVectorSize, clfftDim dim, clfftDirection dir)
+{
+	//In FFTW last dimension has the fastest changing index
+	int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
+
+	fftwf_plan refPlan;
+
+	fftwf_complex *refin = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*fftBatchSize);
+	float *refout = (float*) malloc(sizeof(float)*outfftBatchSize);
+
+	refPlan = fftwf_plan_many_dft_c2r(dim, &fftwLengths[3 - dim], (int)batch_size, 
+									refin, &fftwLengths[3 - dim], (int)inStrides[0], (int)fftVectorSizePadded, 
+									refout, &fftwLengths[3 - dim], (int)outStrides[0], (int)outfftVectorSizePadded,
+									FFTW_ESTIMATE);
+
+	// set zero
+	for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0] )
+	{
+		refin[ i ][0] = 0; refin[ i ][1] = 0;
+	}
+
+	// impulse test case
+	for(size_t b = 0; b < batch_size; b++)
+	{
+		size_t p3 = b * strides[3];
+		refin[ p3 ][0] = static_cast<float>(outfftVectorSize);
+	}
+
+	int scalar;
+	for(size_t b = 0; b < batch_size; b++)
+	{
+		size_t p3 = b * strides[3];
+	
+		for( size_t i = 0; i < fftVectorSizePadded; i = i + inStrides[0])
+		{
+			switch (in_layout)
+			{
+			case CLFFT_HERMITIAN_INTERLEAVED:
+				scalar = SCALAR + i;
+				break;
+			case CLFFT_HERMITIAN_PLANAR:
+				scalar = (int)(SCALAR + i + (SCALAR + i + 1));
+				break;
+			default:
+				break;
+			}
+
+			refin[p3 + i][0] *= (float)(scalar);
+			refin[p3 + i][1] *= (float)(scalar);
+		}
+	}
+
+	fftwf_execute(refPlan);
+
+	fftw_free(refin);
+
+	fftwf_destroy_plan(refPlan);
+
+	return refout;
+}
+
 //	This is used with the program_options class so that the user can type an integer on the command line
 //	and we store into an enum varaible
 template<class _Elem, class _Traits>
@@ -278,40 +388,107 @@ std::basic_istream<_Elem, _Traits> & operator>> (std::basic_istream<_Elem, _Trai
 	return stream;
 }
 
-template < typename T >
-int transform( size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
-				clfftLayout in_layout, clfftLayout out_layout,
-				clfftResultLocation place, clfftPrecision precision, clfftDirection dir,
-				cl_device_type deviceType, cl_int deviceId, cl_int platformId, bool printInfo,
-				cl_uint command_queue_flags, cl_uint profile_count,
-				std::auto_ptr< clfftSetupData > setupData,
-				bool hasPrecallback)
+//Validate the input and output data layout
+void validateDataLayout(clfftLayout in_layout, clfftLayout out_layout, clfftResultLocation place)
 {
-	//	Our command line does not specify what dimension FFT we wish to transform; we decode
-	//	this from the lengths that the user specifies for X, Y, Z.  A length of one means that
-	//	The user does not want that dimension.
+	switch( in_layout )
+	{
+	case CLFFT_COMPLEX_INTERLEAVED:
+	case CLFFT_COMPLEX_PLANAR:
+	case CLFFT_HERMITIAN_INTERLEAVED:
+	case CLFFT_HERMITIAN_PLANAR:
+	case CLFFT_REAL:
+		break;
+	default:
+		//	Don't recognize input layout
+		{
+			throw std::runtime_error( "Un-recognized data layout" );
+		}
+		break;
+	}
 
-	const size_t max_dimensions = 3;
-	size_t strides[ 4 ];
-	size_t o_strides[ 4 ];
-	size_t fftVectorSize = 0;
-	size_t fftVectorSizePadded = 0;
-	size_t fftBatchSize = 0;
-	size_t outfftVectorSize = 0;
-	size_t outfftVectorSizePadded = 0;
-	size_t outfftBatchSize = 0;
-	size_t size_of_input_buffers_in_bytes = 0;
-	size_t size_of_output_buffers_in_bytes = 0;
+	switch( out_layout )
+	{
+	case CLFFT_COMPLEX_INTERLEAVED:
+	case CLFFT_COMPLEX_PLANAR:
+	case CLFFT_HERMITIAN_INTERLEAVED:
+	case CLFFT_HERMITIAN_PLANAR:
+	case CLFFT_REAL:
+		break;
+	default:
+		//	Don't recognize output layout
+		{
+			throw std::runtime_error( "Un-recognized data layout" );
+		}
+		break;
+	}
+
+	if (( place == CLFFT_INPLACE ) &&  ( in_layout != out_layout )) 
+	{
+		switch( in_layout )
+		{
+		case CLFFT_COMPLEX_INTERLEAVED:
+			{
+				if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
+				{
+					throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
+				}
+				break;
+			}
+		case CLFFT_COMPLEX_PLANAR:
+			{
+				if( (out_layout == CLFFT_COMPLEX_INTERLEAVED) || (out_layout == CLFFT_HERMITIAN_INTERLEAVED) )
+				{
+					throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
+				}
+				break;
+			}
+		case CLFFT_HERMITIAN_INTERLEAVED:
+			{
+				if( out_layout != CLFFT_REAL )
+				{
+					throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
+				}
+				break;
+			}
+		case CLFFT_HERMITIAN_PLANAR:
+			{
+				throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
+				break;
+			}
+		case CLFFT_REAL:
+			{
+				if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
+				{
+					throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
+				}
+				break;
+			}
+		default:
+			{
+				throw std::runtime_error( "Input layout format not yet supported" );
+			}
+			break;
+		}
+	}
+}
+
+//FFT data initializations
+template < typename T >
+cl_int dataInitialize(size_t* lengths, clfftDim *dim, size_t batch_size,
+				  const size_t *inStrides, size_t *strides, const size_t *outStrides, size_t *o_strides,
+				  size_t *fftBatchSize, size_t *outfftBatchSize, size_t *fftVectorSizePadded, clfftLayout in_layout, clfftLayout out_layout,
+				  size_t *outfftVectorSizePadded, size_t *fftVectorSize, size_t *outfftVectorSize,
+				  clfftResultLocation place, size_t *size_of_output_buffers_in_bytes,
+				  size_t *size_of_input_buffers_in_bytes, cl_mem *input_cl_mem_buffers, cl_mem *output_cl_mem_buffers,
+				  cl_context *context, cl_command_queue *queue, 
+				  cl_device_type deviceType, cl_int deviceId, cl_int platformId, cl_uint command_queue_flags)
+{
+	cl_event outEvent = NULL;
 	cl_uint number_of_output_buffers = 0;
-	clfftDim	dim = CLFFT_1D;
-	cl_mem input_cl_mem_buffers [2] = { NULL, NULL };
-	cl_mem output_cl_mem_buffers[2] = { NULL, NULL };
+	const size_t max_dimensions = 3;
 	std::vector< cl_device_id > device_id;
-	cl_context context;
-	cl_command_queue queue;
-	cl_event outEvent = NULL;
-	clfftPlanHandle plan_handle;
-
+	
 	for (unsigned u = 0; u < max_dimensions; ++u) {
 		if (0 != lengths[u])
 			continue;
@@ -320,11 +497,11 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 
 	if( lengths[ 1 ] > 1 )
 	{
-		dim	= CLFFT_2D;
+		*dim	= CLFFT_2D;
 	}
 	if( lengths[ 2 ] > 1 )
 	{
-		dim	= CLFFT_3D;
+		*dim	= CLFFT_3D;
 	}
 
 	strides[ 0 ] = inStrides[0];
@@ -337,72 +514,80 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 	o_strides[ 2 ] = outStrides[2];
 	o_strides[ 3 ] = outStrides[3];
 
-	fftVectorSize = lengths[0] * lengths[1] * lengths[2];
-	fftVectorSizePadded = strides[3];
-	fftBatchSize = fftVectorSizePadded * batch_size;
-
-	size_t Nt = 1 + lengths[0]/2;
+	*fftVectorSize = lengths[0] * lengths[1] * lengths[2];
+	*fftVectorSizePadded = strides[3];
+	*fftBatchSize = *fftVectorSizePadded * batch_size;
 
 	if(place == CLFFT_INPLACE)
 	{
-		outfftVectorSize = fftVectorSize;
-		outfftVectorSizePadded = fftVectorSizePadded;
-		outfftBatchSize = fftBatchSize;
+		*outfftVectorSize = *fftVectorSize;
+		*outfftVectorSizePadded = *fftVectorSizePadded;
+		*outfftBatchSize = *fftBatchSize;
 	}
 	else
 	{
-		outfftVectorSize = lengths[0] * lengths[1] * lengths[2];
-		outfftVectorSizePadded = o_strides[3];
-		outfftBatchSize = outfftVectorSizePadded * batch_size;
+		*outfftVectorSize = lengths[0] * lengths[1] * lengths[2];
+		*outfftVectorSizePadded = o_strides[3];
+		*outfftBatchSize = *outfftVectorSizePadded * batch_size;
 	}
 
 	// Real to complex case
 	if( (in_layout == CLFFT_REAL) || (out_layout == CLFFT_REAL) )
 	{
-		terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
-		return 1;
+		*fftVectorSizePadded = strides[3];
+		*fftBatchSize = *fftVectorSizePadded * batch_size;
+
+		*outfftVectorSizePadded = o_strides[3];
+		*outfftBatchSize = *outfftVectorSizePadded * batch_size;
+
+		*fftVectorSize = lengths[0] * lengths[1] * lengths[2];
+		*outfftVectorSize = *fftVectorSize;
 	}
 
 	switch( out_layout )
 	{
 	case CLFFT_COMPLEX_INTERLEAVED:
 		number_of_output_buffers = 1;
-		size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > );
+		*size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof( std::complex< T > );
 		break;
 	case CLFFT_COMPLEX_PLANAR:
 		number_of_output_buffers = 2;
-		size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T);
+		*size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof(T);
+		break;
+	case CLFFT_HERMITIAN_INTERLEAVED:
+		number_of_output_buffers = 1;
+		*size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof( std::complex< T > );
+		break;
+	case CLFFT_HERMITIAN_PLANAR:
+		number_of_output_buffers = 2;
+		*size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof(T);
+		break;
+	case CLFFT_REAL:
+		number_of_output_buffers = 1;
+		*size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof(T);
 		break;
-	default:
-		terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
-		return 1;
 	}
 
-	if (hasPrecallback && !(in_layout == CLFFT_COMPLEX_INTERLEAVED || in_layout == CLFFT_COMPLEX_PLANAR))
-	{
-		terr << _T("Pre-callback feature is currently supported only for Complex-Complex FFT " ) << std::endl;
-		return 1;
-	}
 
-		// Fill the input buffers
+	// Fill the input buffers
 	switch( in_layout )
 	{
 	case CLFFT_COMPLEX_INTERLEAVED:
 		{
 			//	This call creates our openCL context and sets up our devices; expected to throw on error
-			size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > );
+			*size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( std::complex< T > );
 
-			device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
-			createOpenCLCommandQueue( context,
-				command_queue_flags, queue,
+			device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
+			createOpenCLCommandQueue( *context,
+				command_queue_flags, *queue,
 				device_id,
-				size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
-				size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
+				*size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
+				*size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
 
-			std::vector< std::complex< T > > input( fftBatchSize );
+			std::vector< std::complex< T > > input( *fftBatchSize );
 
 			// set zero
-			for( cl_uint i = 0; i < fftBatchSize; ++i )
+			for( cl_uint i = 0; i < *fftBatchSize; ++i )
 			{
 				input[ i ] = 0;
 			}
@@ -426,8 +611,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 				}
 			}
 
-
-			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ],
+			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &input[ 0 ],
 				0, NULL, &outEvent ),
 				"clEnqueueWriteBuffer failed" );
 
@@ -436,20 +620,20 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 	case CLFFT_COMPLEX_PLANAR:
 		{
 			//	This call creates our openCL context and sets up our devices; expected to throw on error
-			size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T );
+			*size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( T );
 
-			device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
-			createOpenCLCommandQueue( context,
-				command_queue_flags, queue,
+			device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
+			createOpenCLCommandQueue( *context,
+				command_queue_flags, *queue,
 				device_id,
-				size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
-				size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
+				*size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
+				*size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
 
-			std::vector< T > real( fftBatchSize );
-			std::vector< T > imag( fftBatchSize );
+			std::vector< T > real( *fftBatchSize );
+			std::vector< T > imag( *fftBatchSize );
 
 			// set zero
-			for( cl_uint i = 0; i < fftBatchSize; ++i )
+			for( cl_uint i = 0; i < *fftBatchSize; ++i )
 			{
 				real[ i ] = 0;
 				imag[ i ] = 0;
@@ -474,114 +658,577 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 				}
 			}
 
-
-			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
+			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &real[ 0 ],
 				0, NULL, &outEvent ),
 				"clEnqueueWriteBuffer failed" );
-			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
+			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &imag[ 0 ],
 				0, NULL, &outEvent ),
 				"clEnqueueWriteBuffer failed" );
 		}
 		break;
-	default:
-		terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
-		return 1;
-	}
+	case CLFFT_HERMITIAN_INTERLEAVED:
+		{
+			//	This call creates our openCL context and sets up our devices; expected to throw on error
+			*size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( std::complex< T > );
 
-		//	Discover and load the timer module if present
-	void* timerLibHandle = LoadSharedLibrary( "lib", "StatTimer", false );
-	if( timerLibHandle == NULL )
-	{
-		terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl;
-	}
+			device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
+			createOpenCLCommandQueue( *context,
+				command_queue_flags, *queue,
+				device_id,
+				*size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
+				*size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
 
+			std::vector< std::complex< T > > input( *fftBatchSize );
 
-	//	Timer module discovered and loaded successfully
-	//	Initialize function pointers to call into the shared module
-	PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) );
+			// set zero
+			for( cl_uint i = 0; i < *fftBatchSize; ++i )
+			{
+				input[ i ] = 0;
+			}
 
-	//	Create and initialize our timer class, if the external timer shared library loaded
-	baseStatTimer* timer = NULL;
-	size_t	clFFTID = 0;
-	if( get_timer )
-	{
-		timer = get_timer( CLFFT_GPU );
-		timer->Reserve( 1, profile_count );
-		timer->setNormalize( true );
+			// impulse test case
+			for(size_t b = 0; b < batch_size; b++)
+			{
+				size_t p3 = b * strides[3];
+				input[p3] = static_cast<T>(*outfftVectorSize);
 
-		clFFTID	= timer->getUniqueID( "clFFT", 0 );
-	}
+			}
 
-	OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
-	OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" );
+			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &input[ 0 ],
+				0, NULL, &outEvent ),
+				"clEnqueueWriteBuffer failed" );
+		}
+		break;
+	case CLFFT_HERMITIAN_PLANAR:
+		{
+			//	This call creates our openCL context and sets up our devices; expected to throw on error
+			*size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( T );
 
-	//	Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
-	OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
-	OPENCL_V_THROW( clfftSetLayout( plan_handle, in_layout, out_layout ), "clfftSetLayout failed" );
-	OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batch_size ), "clfftSetPlanBatchSize failed" );
-	OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
+			device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
+			createOpenCLCommandQueue( *context,
+				command_queue_flags, *queue,
+				device_id,
+				*size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
+				*size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
 
-	OPENCL_V_THROW (clfftSetPlanInStride  ( plan_handle, dim, strides ), "clfftSetPlanInStride failed" );
-	OPENCL_V_THROW (clfftSetPlanOutStride ( plan_handle, dim, o_strides ), "clfftSetPlanOutStride failed" );
-	OPENCL_V_THROW (clfftSetPlanDistance  ( plan_handle, strides[ 3 ], o_strides[ 3 ]), "clfftSetPlanDistance failed" );
+			std::vector< T > real( *fftBatchSize );
+			std::vector< T > imag( *fftBatchSize );
 
-	// Set backward scale factor to 1.0 for non real FFTs to do correct output checks
-	if(dir == CLFFT_BACKWARD && in_layout != CLFFT_REAL && out_layout != CLFFT_REAL)
-		OPENCL_V_THROW (clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, (cl_float)1.0f ), "clfftSetPlanScale failed" );
+			// set zero
+			for( cl_uint i = 0; i < *fftBatchSize; ++i )
+			{
+				real[ i ] = 0;
+				imag[ i ] = 0;
+			}
 
-	//Check for Precallback
-	//Currently test includes only for 1D
-	if (hasPrecallback)
-	{
-		int precallbakType = PRECALLBACKTYPE;
-		cl_mem userdata;
+			// impulse test case
+			for(size_t b = 0; b < batch_size; b++)
+			{
+				size_t p3 = b * strides[3];
+				real[p3] = static_cast<T>(*outfftVectorSize);
+			}
 
-		if (in_layout == CLFFT_COMPLEX_INTERLEAVED)
+			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &real[ 0 ],
+				0, NULL, &outEvent ),
+				"clEnqueueWriteBuffer failed" );
+			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &imag[ 0 ],
+				0, NULL, &outEvent ),
+				"clEnqueueWriteBuffer failed" );
+		}
+		break;
+	case CLFFT_REAL:
 		{
-			switch (precallbakType)
+			//	This call creates our openCL context and sets up our devices; expected to throw on error
+			*size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( T );
+
+			device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
+			createOpenCLCommandQueue( *context,
+				command_queue_flags, *queue,
+				device_id,
+				*size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
+				*size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
+
+			std::vector< T > real( *fftBatchSize );
+
+			// set zero
+			for( cl_uint i = 0; i < *fftBatchSize; ++i )
 			{
-			case 1: //C2C 1D Interleaved 
+				real[ i ] = 0;
+			}
+
+			// impulse test case
+			for(size_t b = 0; b < batch_size; b++)
+			{
+				size_t p3 = b * strides[3];
+				for(size_t k = 0; k < lengths[2]; k++)
 				{
-					char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL) : STRINGIFY(MULVAL_DP);
+					size_t p2 = p3 + k * strides[2];
+					for(size_t j = 0; j < lengths[1]; j++)
+					{
+						size_t p1 = p2 + j * strides[1];
+						for(size_t i = 0; i < lengths[0]; i++)
+						{
+							size_t p0 = p1 + i * strides[0];
+							real[p0] = 1;
+						}
+					}
+				}
+			}
 
-					int *h_userdata = (int*)malloc(sizeof(int)*fftBatchSize);
-					for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &real[ 0 ],
+				0, NULL, &outEvent ),
+				"clEnqueueWriteBuffer failed" );
+		}
+		break;
+	default:
+		{
+			throw std::runtime_error( "Input layout format not yet supported" );
+		}
+		break;
+	}
+
+	return 0;
+}
+
+//Compare output with reference C/FFTW code
+template < typename T >
+void compareWithReference(clfftLayout in_layout, clfftLayout out_layout, size_t outfftBatchSize, clfftResultLocation place, clfftPrecision precision,
+						  cl_command_queue queue, cl_mem *input_cl_mem_buffers, size_t size_of_input_buffers_in_bytes, size_t size_of_output_buffers_in_bytes,
+						  cl_mem *BuffersOut, size_t* lengths, size_t * strides, const size_t *inStrides, const size_t *outStrides, size_t *o_strides,
+						  size_t batch_size, size_t fftBatchSize, size_t fftVectorSizePadded, size_t outfftVectorSize,
+						  size_t outfftVectorSizePadded, size_t fftVectorSize, clfftDim dim, clfftDirection dir, bool hasPrecallback)
+{
+	bool checkflag= false;
+
+	switch( out_layout )
+	{
+	case CLFFT_HERMITIAN_INTERLEAVED:
+	case CLFFT_COMPLEX_INTERLEAVED:
+		{
+			std::vector< std::complex< T > > output( outfftBatchSize );
+
+			if( place == CLFFT_INPLACE )
+			{
+				OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &output[ 0 ],
+					0, NULL, NULL ),
+					"Reading the result buffer failed" );
+			}
+			else
+			{
+				OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &output[ 0 ],
+					0, NULL, NULL ),
+					"Reading the result buffer failed" );
+			}
+
+			//check output data
+			if (hasPrecallback)
+			{
+				switch(in_layout)
+				{
+				case CLFFT_HERMITIAN_INTERLEAVED:
+				case CLFFT_COMPLEX_INTERLEAVED:
+					{
+						if (precision == CLFFT_SINGLE)
+						{
+							fftwf_complex *refout;
+
+							refout = get_fftwf_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+														in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
+
+							if (!compare<fftwf_complex, T>(refout, output, outfftBatchSize))
+								checkflag = true;
+
+							//for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+							//{
+							//	std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+							//}
+							
+							/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+							{
+								std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+							}*/
+
+							fftwf_free(refout);
+						}
+						else if (precision == CLFFT_DOUBLE)
+						{
+							fftw_complex *refout;
+							
+							refout = get_fftw_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+														in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
+
+							if (!compare<fftw_complex, T>(refout, output, outfftBatchSize))
+								checkflag = true;
+
+							/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+							{
+								std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+							}*/
+							
+							fftw_free(refout);
+						}
+					}
+					break;
+				}
+			}
+			else
+			{
+				for( cl_uint i = 0; i < outfftBatchSize; ++i )
+				{
+					if (0 == (i % outfftVectorSizePadded))
 					{
-						h_userdata[ i ] = SCALAR + (i % fftVectorSize);
+						if (output[i].real() != outfftVectorSize)
+						{
+							checkflag = true;
+							break;
+						}
+							
+					}
+					else
+					{
+						if (output[ i ].real() != 0)
+						{
+							checkflag = true;
+							break;
+						}
 					}
-					userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * fftBatchSize, (void*)h_userdata, NULL);
 
-					//Register the callback
-					OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
+					if (output[ i ].imag() != 0)
+					{
+						checkflag = true;
+						break;
+					}
 				}
-				break;
-			default:
-				break;
 			}
 		}
+		break;
+	case CLFFT_HERMITIAN_PLANAR:
+	case CLFFT_COMPLEX_PLANAR:
+		{
+			std::valarray< T > real( outfftBatchSize );
+			std::valarray< T > imag( outfftBatchSize );
 
-		if (in_layout == CLFFT_COMPLEX_PLANAR)
+			if( place == CLFFT_INPLACE )
+			{
+				OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
+					0, NULL, NULL ),
+					"Reading the result buffer failed" );
+				OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
+					0, NULL, NULL ),
+					"Reading the result buffer failed" );
+			}
+			else
+			{
+				OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
+					0, NULL, NULL ),
+					"Reading the result buffer failed" );
+				OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 1 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &imag[ 0 ],
+					0, NULL, NULL ),
+					"Reading the result buffer failed" );
+			}
+
+			//  Check output data
+			if (hasPrecallback)
+			{
+				switch(in_layout)
+				{
+				case CLFFT_COMPLEX_PLANAR:
+					{
+						if (precision == CLFFT_SINGLE)
+						{
+							fftwf_complex *refout;
+
+							refout = get_fftwf_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+														in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
+
+							if (!compare<fftwf_complex, T>(refout, real, imag, outfftBatchSize))
+								checkflag = true;
+
+							/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+							{
+								std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
+							}*/
+							
+							fftwf_free(refout);
+						}
+						else if (precision == CLFFT_DOUBLE)
+						{
+							fftw_complex *refout;
+
+							refout = get_fftw_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+														in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
+
+							if (!compare<fftw_complex, T>(refout, real, imag, outfftBatchSize))
+								checkflag = true;
+
+							/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+							{
+								std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
+							}*/
+							
+							fftw_free(refout);
+						}
+					}
+					break;
+				}
+			}
+			else
+			{
+				for( cl_uint i = 0; i < outfftBatchSize; ++i )
+				{
+					if (0 == (i % outfftVectorSizePadded))
+					{
+						if (real[i] != outfftVectorSize)
+						{
+							checkflag = true;
+							break;
+						}
+					}
+					else
+					{
+						if (real[i] != 0)
+						{
+							checkflag = true;
+							break;
+						}
+					}
+
+					if (imag[i] != 0)
+					{
+						checkflag = true;
+						break;
+					}
+				}
+			}
+		}
+		break;
+	case CLFFT_REAL:
 		{
-			switch (precallbakType)
+			std::valarray< T > real( outfftBatchSize );
+
+			if( place == CLFFT_INPLACE )
+			{
+				OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
+					0, NULL, NULL ),
+					"Reading the result buffer failed" );
+			}
+			else
+			{
+				OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
+					0, NULL, NULL ),
+					"Reading the result buffer failed" );
+			}
+
+			//  Check output data
+			if (hasPrecallback)
 			{
-			case 1: //C2C 1D PLANAR 
+				if (precision == CLFFT_SINGLE)
 				{
-					char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_PLANAR) : STRINGIFY(MULVAL_PLANAR_DP);
-					USER_DATA *h_userdata = (USER_DATA*)malloc(sizeof(USER_DATA) * fftBatchSize);
-					for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+					float *refout;
+
+					refout = get_fftwf_output_c2r(lengths, strides,  inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+												in_layout, outfftVectorSizePadded, outfftVectorSize, dim, dir);
+
+					if (!compare<float, T>(refout, real, outfftVectorSize, batch_size))
+						checkflag = true;
+
+					/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
 					{
-						h_userdata[i].scalar1 = SCALAR + (i % fftVectorSize);
-						h_userdata[i].scalar2 = SCALAR + (i % fftVectorSize) + 1;
+						std::cout << "i " << i << " refreal " << refout[i] << " clreal " << (real[i] * outfftVectorSize) << std::endl;
+					}*/
+					
+					if (refout)
+						free(refout);
+				}
+			}
+			else
+			{
+				for(size_t b = 0; b < batch_size; b++)
+				{
+					size_t p3 = b * o_strides[3];
+					for(size_t k = 0; k < lengths[2]; k++)
+					{
+						size_t p2 = p3 + k * o_strides[2];
+						for(size_t j = 0; j < lengths[1]; j++)
+						{
+							size_t p1 = p2 + j * o_strides[1];
+							for(size_t i = 0; i < lengths[0]; i++)
+							{
+								size_t p0 = p1 + i * o_strides[0];
+
+								if (real[p0] != 1)
+								{
+									checkflag = true;
+									break;
+								}
+
+							}
+						}
 					}
-					userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA) * fftBatchSize, (void*)h_userdata, NULL);
+				}
+			}
+		}
+		break;
+	default:
+		{
+			throw std::runtime_error( "Input layout format not yet supported" );
+		}
+		break;
+	}
+
+	if (checkflag)
+	{
+		std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
+	}
+	else
+	{
+		std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
+	}
+}
+
+template < typename T >
+int transform( size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
+				clfftLayout in_layout, clfftLayout out_layout,
+				clfftResultLocation place, clfftPrecision precision, clfftDirection dir,
+				cl_device_type deviceType, cl_int deviceId, cl_int platformId, bool printInfo,
+				cl_uint command_queue_flags, cl_uint profile_count,
+				std::auto_ptr< clfftSetupData > setupData,
+				bool hasPrecallback)
+{
+	//	Our command line does not specify what dimension FFT we wish to transform; we decode
+	//	this from the lengths that the user specifies for X, Y, Z.  A length of one means that
+	//	The user does not want that dimension.
 
-					//Register the callback
-					OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, STRINGIFY(STRUCT_USERDATA), 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
+	size_t strides[ 4 ];
+	size_t o_strides[ 4 ];
+	size_t fftVectorSize = 0;
+	size_t fftVectorSizePadded = 0;
+	size_t fftBatchSize = 0;
+	size_t outfftVectorSize = 0;
+	size_t outfftVectorSizePadded = 0;
+	size_t outfftBatchSize = 0;
+	size_t size_of_input_buffers_in_bytes = 0;
+	size_t size_of_output_buffers_in_bytes = 0;
+	
+	clfftDim	dim = CLFFT_1D;
+	cl_mem input_cl_mem_buffers [2] = { NULL, NULL };
+	cl_mem output_cl_mem_buffers[2] = { NULL, NULL };
+	cl_context context;
+	cl_command_queue queue;
+	cl_event outEvent = NULL;
+	clfftPlanHandle plan_handle;
+
+	//Valudate input and output data layout
+	validateDataLayout(in_layout, out_layout, place);
+	
+	if (hasPrecallback && !(in_layout == CLFFT_COMPLEX_INTERLEAVED || in_layout == CLFFT_COMPLEX_PLANAR || in_layout == CLFFT_HERMITIAN_INTERLEAVED))
+	{
+		terr << _T("Pre-callback feature is currently supported only for Complex-Complex and Complex-Real Interleaved FFT " ) << std::endl;
+		return 1;
+	}
+
+	//Initializations
+	OPENCL_V_THROW( dataInitialize<T>(lengths, &dim, batch_size, inStrides, strides, outStrides, o_strides, &fftBatchSize, &outfftBatchSize, 
+						&fftVectorSizePadded, in_layout, out_layout, &outfftVectorSizePadded, &fftVectorSize, &outfftVectorSize, place, 
+						&size_of_output_buffers_in_bytes, &size_of_input_buffers_in_bytes, input_cl_mem_buffers, output_cl_mem_buffers, &context, &queue,
+						deviceType, deviceId, platformId, command_queue_flags), "Data Initialization failed");
+
+	//	Discover and load the timer module if present
+	void* timerLibHandle = LoadSharedLibrary( "lib", "StatTimer", false );
+	if( timerLibHandle == NULL )
+	{
+		terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl;
+	}
+
+
+	//	Timer module discovered and loaded successfully
+	//	Initialize function pointers to call into the shared module
+	PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) );
+
+	//	Create and initialize our timer class, if the external timer shared library loaded
+	baseStatTimer* timer = NULL;
+	size_t	clFFTID = 0;
+	if( get_timer )
+	{
+		timer = get_timer( CLFFT_GPU );
+		timer->Reserve( 1, profile_count );
+		timer->setNormalize( true );
+
+		clFFTID	= timer->getUniqueID( "clFFT", 0 );
+	}
+
+	OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
+	OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" );
+
+	//	Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
+	OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
+	OPENCL_V_THROW( clfftSetLayout( plan_handle, in_layout, out_layout ), "clfftSetLayout failed" );
+	OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batch_size ), "clfftSetPlanBatchSize failed" );
+	OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
+
+	OPENCL_V_THROW (clfftSetPlanInStride  ( plan_handle, dim, strides ), "clfftSetPlanInStride failed" );
+	OPENCL_V_THROW (clfftSetPlanOutStride ( plan_handle, dim, o_strides ), "clfftSetPlanOutStride failed" );
+	OPENCL_V_THROW (clfftSetPlanDistance  ( plan_handle, strides[ 3 ], o_strides[ 3 ]), "clfftSetPlanDistance failed" );
+
+	// Set backward scale factor to 1.0 for non real FFTs to do correct output checks
+	if(dir == CLFFT_BACKWARD && in_layout != CLFFT_REAL && out_layout != CLFFT_REAL)
+		OPENCL_V_THROW (clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, (cl_float)1.0f ), "clfftSetPlanScale failed" );
+
+	//Check for Precallback
+	//Currently test includes only for 1D
+	if (hasPrecallback)
+	{
+		cl_mem userdata;
+
+		//C2C 1D Interleaved 
+		if (in_layout == CLFFT_COMPLEX_INTERLEAVED )
+		{
+			char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL) : STRINGIFY(MULVAL_DP);
+
+			int *h_userdata = (int*)malloc(sizeof(int)*fftBatchSize);
+			for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+			{
+				h_userdata[ i ] = SCALAR + (i % fftVectorSize);
+			}
+			userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * fftBatchSize, (void*)h_userdata, NULL);
+
+			//Register the callback
+			OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
+		}
+		else if (in_layout == CLFFT_HERMITIAN_INTERLEAVED)
+		{	
+			char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_C2R) : STRINGIFY(MULVAL_DP);
+			
+			int *h_userdata = (int*)malloc(sizeof(int)*fftBatchSize);
+			for(size_t b = 0; b < batch_size; b++)
+			{
+				size_t p3 = b * strides[3];
+	
+				for( size_t i = 0; i < fftVectorSizePadded; i = i + inStrides[0])
+				{
+					h_userdata[ p3 + i ] = SCALAR + i;
 				}
-				break;
-			default:
-				break;
 			}
+			userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * fftBatchSize, (void*)h_userdata, NULL);
+
+			//Register the callback
+			OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
+		}
+
+		//C2C PLANAR 
+		if (in_layout == CLFFT_COMPLEX_PLANAR)
+		{	
+			char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_PLANAR) : STRINGIFY(MULVAL_PLANAR_DP);
+			USER_DATA *h_userdata = (USER_DATA*)malloc(sizeof(USER_DATA) * fftBatchSize);
+			for( size_t i = 0; i < fftBatchSize; i = i + inStrides[0])
+			{
+				h_userdata[i].scalar1 = SCALAR + (int)(i % fftVectorSize);
+				h_userdata[i].scalar2 = SCALAR + (int)(i % fftVectorSize) + 1;
+			}
+			userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA) * fftBatchSize, (void*)h_userdata, NULL);
+
+			//Register the callback
+			OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, STRINGIFY(STRUCT_USERDATA), 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
 		}
 	}
 
@@ -601,32 +1248,6 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 		OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
 	}
 
-	if (( place == CLFFT_INPLACE )
-	&&  ( in_layout != out_layout )) 
-	{
-		switch( in_layout )
-		{
-		case CLFFT_COMPLEX_INTERLEAVED:
-			{
-				if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
-				{
-					throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
-				}
-				break;
-			}
-		case CLFFT_COMPLEX_PLANAR:
-			{
-				if( (out_layout == CLFFT_COMPLEX_INTERLEAVED) || (out_layout == CLFFT_HERMITIAN_INTERLEAVED) )
-				{
-					throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
-				}
-				break;
-			}
-		default:
-			terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
-			return 1;
-		}
-	}
 
 	cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &output_cl_mem_buffers[ 0 ];
 
@@ -675,217 +1296,9 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 	if (( place == CLFFT_OUTOFPLACE )
 	||  ( profile_count == 1))
 	{
-		bool checkflag= false;
-		switch( out_layout )
-		{
-		case CLFFT_HERMITIAN_INTERLEAVED:
-		case CLFFT_COMPLEX_INTERLEAVED:
-			{
-				std::vector< std::complex< T > > output( outfftBatchSize );
-
-				if( place == CLFFT_INPLACE )
-				{
-					OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &output[ 0 ],
-						0, NULL, NULL ),
-						"Reading the result buffer failed" );
-				}
-				else
-				{
-					OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &output[ 0 ],
-						0, NULL, NULL ),
-						"Reading the result buffer failed" );
-				}
-
-				//check output data
-				if (hasPrecallback)
-				{
-					switch(in_layout)
-					{
-					case CLFFT_COMPLEX_INTERLEAVED:
-						{
-							if (precision == CLFFT_SINGLE)
-							{
-								fftwf_complex *refout;
-
-								refout = get_fftwf_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
-															in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
-
-								if (!compare(refout, output, outfftBatchSize))
-									checkflag = true;
-
-								//for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
-								//{
-								//	std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
-								//}
-							
-								fftwf_free(refout);
-							}
-							else if (precision == CLFFT_DOUBLE)
-							{
-								fftw_complex *refout;
-							
-								refout = get_fftw_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
-															in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
-
-								if (!compare(refout, output, outfftBatchSize))
-									checkflag = true;
-
-								/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
-								{
-									std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
-								}*/
-							
-								fftw_free(refout);
-							}
-						}
-						break;
-					}
-				}
-				else
-				{
-					for( cl_uint i = 0; i < outfftBatchSize; ++i )
-					{
-						if (0 == (i % outfftVectorSizePadded))
-						{
-							if (output[i].real() != outfftVectorSize)
-							{
-								checkflag = true;
-								break;
-							}
-							
-						}
-						else
-						{
-							if (output[ i ].real() != 0)
-							{
-								checkflag = true;
-								break;
-							}
-						}
-
-						if (output[ i ].imag() != 0)
-						{
-							checkflag = true;
-							break;
-						}
-					}
-				}
-			}
-			break;
-		case CLFFT_HERMITIAN_PLANAR:
-		case CLFFT_COMPLEX_PLANAR:
-			{
-				std::valarray< T > real( outfftBatchSize );
-				std::valarray< T > imag( outfftBatchSize );
-
-				if( place == CLFFT_INPLACE )
-				{
-					OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
-						0, NULL, NULL ),
-						"Reading the result buffer failed" );
-					OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
-						0, NULL, NULL ),
-						"Reading the result buffer failed" );
-				}
-				else
-				{
-					OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
-						0, NULL, NULL ),
-						"Reading the result buffer failed" );
-					OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 1 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &imag[ 0 ],
-						0, NULL, NULL ),
-						"Reading the result buffer failed" );
-				}
-
-				//  Check output data
-				if (hasPrecallback)
-				{
-					switch(in_layout)
-					{
-					case CLFFT_COMPLEX_PLANAR:
-						{
-							if (precision == CLFFT_SINGLE)
-							{
-								fftwf_complex *refout;
-
-								refout = get_fftwf_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
-															in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
-
-								if (!compare(refout, real, imag, outfftBatchSize))
-									checkflag = true;
-
-								/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
-								{
-									std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
-								}*/
-							
-								fftwf_free(refout);
-							}
-							else if (precision == CLFFT_DOUBLE)
-							{
-								fftw_complex *refout;
-
-								refout = get_fftw_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
-															in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
-
-								if (!compare(refout, real, imag, outfftBatchSize))
-									checkflag = true;
-
-								/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
-								{
-									std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
-								}*/
-							
-								fftw_free(refout);
-							}
-						}
-						break;
-					}
-				}
-				else
-				{
-					for( cl_uint i = 0; i < outfftBatchSize; ++i )
-					{
-						if (0 == (i % outfftVectorSizePadded))
-						{
-							if (real[i] != outfftVectorSize)
-							{
-								checkflag = true;
-								break;
-							}
-						}
-						else
-						{
-							if (real[i] != 0)
-							{
-								checkflag = true;
-								break;
-							}
-						}
-
-						if (imag[i] != 0)
-						{
-							checkflag = true;
-							break;
-						}
-					}
-				}
-			}
-			break;
-		default:
-			terr << _T("Complex-Real callback cases not yet implemented" ) << std::endl;
-			throw std::runtime_error( "Input layout format not yet supported" );
-			break;
-		}
-
-		if (checkflag)
-		{
-			std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
-		}
-		else
-		{
-			std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
-		}
+		compareWithReference<T>(in_layout, out_layout, outfftBatchSize, place, precision, queue, input_cl_mem_buffers, size_of_input_buffers_in_bytes, size_of_output_buffers_in_bytes,
+								BuffersOut, lengths, strides, inStrides, outStrides, o_strides, batch_size, fftBatchSize, fftVectorSizePadded, outfftVectorSize, outfftVectorSizePadded, fftVectorSize,
+								dim, dir, hasPrecallback);
 	}
 
 	OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
@@ -1059,10 +1472,42 @@ int main(int argc, char **argv)
 				oStrides[3] = oStrides[3] ? oStrides[3] : lengths[2] * oStrides[2];
 			}
 		}
-		else
+		else // Real-Complex and Complex-Real cases
 		{
-			terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
-			return 1;
+			size_t *rst, *cst;
+			size_t N = lengths[0];
+			size_t Nt = 1 + lengths[0]/2;
+			bool iflag = false;
+			bool rcFull = (inL == 1) || (inL == 2) || (otL == 1) || (otL == 2);
+
+			if(inLayout == CLFFT_REAL) { iflag = true; rst = iStrides; }
+			else { rst = oStrides; } // either in or out should be REAL
+
+			// Set either in or out strides whichever is real
+			if(place == CLFFT_INPLACE)
+			{
+				if(rcFull)	{ rst[1] = rst[1] ? rst[1] :  N * 2 * rst[0]; }
+				else		{ rst[1] = rst[1] ? rst[1] : Nt * 2 * rst[0]; }
+
+				rst[2] = rst[2] ? rst[2] : lengths[1] * rst[1];
+				rst[3] = rst[3] ? rst[3] : lengths[2] * rst[2];
+			}
+			else
+			{
+				rst[1] = rst[1] ? rst[1] : lengths[0] * rst[0];
+				rst[2] = rst[2] ? rst[2] : lengths[1] * rst[1];
+				rst[3] = rst[3] ? rst[3] : lengths[2] * rst[2];
+			}
+
+			// Set the remaining of in or out strides that is not real
+			if(iflag) { cst = oStrides; }
+			else	  { cst = iStrides; }
+
+			if(rcFull)	{ cst[1] = cst[1] ? cst[1] :  N * cst[0]; }
+			else		{ cst[1] = cst[1] ? cst[1] : Nt * cst[0]; }
+
+			cst[2] = cst[2] ? cst[2] : lengths[1] * cst[1];
+			cst[3] = cst[3] ? cst[3] : lengths[2] * cst[2];
 		}
 
 		if( precision == CLFFT_SINGLE )
diff --git a/src/library/accessors.cpp b/src/library/accessors.cpp
index cbd06fc..493c05d 100644
--- a/src/library/accessors.cpp
+++ b/src/library/accessors.cpp
@@ -781,7 +781,7 @@ clfftStatus clFFTSetPlanCallback(clfftPlanHandle plHandle, const char* funcName,
 
 	if (callbackType == PRECALLBACK)
 	{
-		if (fftPlan->inputLayout == CLFFT_COMPLEX_INTERLEAVED || fftPlan->inputLayout == CLFFT_COMPLEX_PLANAR)
+		if (fftPlan->inputLayout == CLFFT_COMPLEX_INTERLEAVED || fftPlan->inputLayout == CLFFT_COMPLEX_PLANAR || fftPlan->inputLayout == CLFFT_HERMITIAN_INTERLEAVED)
 		{
 			if (funcName != NULL && funcString != NULL)
 			{
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 854ffcc..df35141 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -825,6 +825,7 @@ namespace StockhamGenerator
 				return;
 			}
 
+			int hid;
 			for(size_t i=0; i<numB; i++)
 			{
 				std::string regBaseCount = regBase;
@@ -841,6 +842,7 @@ namespace StockhamGenerator
 						{
 							std::string tail;
 							std::string regIndex;
+							std::string regIndexC;
 							regIndex = linearRegs ? "(*R" : regBaseCount;
 							std::string buffer;
 
@@ -856,7 +858,17 @@ namespace StockhamGenerator
 							{
 								if(c == 0)
 								{
-									if(linearRegs) { RegBaseAndCountAndPos("", i*radix + r, regIndex); regIndex += ").x"; }
+									if(linearRegs) 
+									{ 
+										RegBaseAndCountAndPos("", i*radix + r, regIndex); 
+									
+										hid = r / (numB * radix / 2);
+										if (fft_doPreCallback && component == SR_COMP_REAL && hid != 0)
+										{
+											regIndexC = regIndex; regIndexC += ").y";
+										}
+										regIndex += ").x"; 
+									}
 									else		   { RegBaseAndCountAndPos("R", r, regIndex); }
 									buffer = bufferRe;
 									tail = interleaved ? ".x;" : ";";
@@ -889,7 +901,7 @@ namespace StockhamGenerator
 
 								//If precallback is set invoke callback function
 								//Invoke callback only once in Planar data layout (i.e.c==0)
-								if (fft_doPreCallback && c == 0)
+								if (fft_doPreCallback && c == 0 && component == SR_COMP_BOTH)
 								{
 									passStr += "\n\t";
 									passStr += "retPrecallback["; passStr += SztToStr(v); passStr += "] = "; passStr += fft_preCallback.funcname; passStr += "("; 
@@ -909,12 +921,18 @@ namespace StockhamGenerator
 									passStr += ");";
 								}
 
+								if (fft_doPreCallback && component == SR_COMP_REAL && hid != 0)
+								{
+									passStr += "\n\t";
+									passStr += regIndexC; passStr += " = "; passStr += regIndexSub; passStr += ";";
+								}
+
 								passStr += "\n\t";
 								passStr += regIndexSub;
 								passStr += " = "; 
 
 								//Use the return value from precallback if set
-								if (fft_doPreCallback)
+								if (fft_doPreCallback && component == SR_COMP_BOTH)
 								{
 									passStr += "retPrecallback["; passStr += SztToStr(v); passStr += "]"; 
 									passStr += interleaved ? tail : (c == 0) ? ".x;" : ".y;";
@@ -1210,22 +1228,30 @@ namespace StockhamGenerator
 						std::string regIndex = "(*R";
 						std::string buffer;
 
-						if(c == 0)
+						RegBaseAndCountAndPos("", r, regIndex); 
+						if (fft_doPreCallback && interleaved)
 						{
-							RegBaseAndCountAndPos("", r, regIndex); regIndex += ").x";
-							buffer = bufferRe;
-							tail  = interleaved ? ".x;" : ";";
-							tail2 = interleaved ? ".y;" : ";";
+							 regIndex += ")";
+							 buffer = (c == 0) ? bufferRe : bufferIm;
 						}
 						else
 						{
-							RegBaseAndCountAndPos("", r, regIndex); regIndex += ").y";
-							buffer = bufferIm;
-							tail  = interleaved ? ".y;" : ";";
-							tail2 = interleaved ? ".x;" : ";";
+							if(c == 0)
+							{
+								regIndex += ").x";
+								buffer = bufferRe;
+								tail  = interleaved ? ".x;" : ";";
+								tail2 = interleaved ? ".y;" : ";";
+							}
+							else
+							{
+								regIndex += ").y";
+								buffer = bufferIm;
+								tail  = interleaved ? ".y;" : ";";
+								tail2 = interleaved ? ".x;" : ";";
+							}
 						}
 
-
 						size_t bid = numCR/2;
 						bid = bid ? bid : 1;
 						size_t cid, lid;
@@ -1263,8 +1289,17 @@ namespace StockhamGenerator
 						{
 							if(act)
 							{
-								passStr += buffer;
-								passStr += "["; passStr += offset; passStr += " + ( ";
+								if (fft_doPreCallback)
+								{
+									passStr += fft_preCallback.funcname; passStr += "(";
+									passStr += buffer; passStr += ", ";  
+								}
+								else
+								{
+									passStr += buffer;
+									passStr += "["; 
+								}
+								passStr += offset; passStr += " + ( ";
 							}
 
 							if(fwd)
@@ -1280,10 +1315,20 @@ namespace StockhamGenerator
 
 							if(act)
 							{
-								passStr += " )*"; passStr += SztToStr(stride); passStr += "]";
+								passStr += " )*"; passStr += SztToStr(stride); 
+								
+								if (fft_doPreCallback)
+								{
+									passStr += ", userdata";
+									passStr += (fft_preCallback.localMemSize > 0) ? ", localmem);" : ");";
+								}
+								else
+								{
+									passStr += "]";
 
-								if(fwd) { passStr += tail; }
-								else	{ if(!batch2) passStr += tail; else passStr += tail2; }
+									if(fwd) { passStr += tail; }
+									else	{ if(!batch2) passStr += tail; else passStr += tail2; }
+								}
 							}
 						}
 					}
@@ -1439,7 +1484,8 @@ namespace StockhamGenerator
 								passStr += "( ";
 								if(c == 0)
 								{
-									regIndex += ".x"; regIndexPair += ".x";
+									regIndex += ".x"; 
+									regIndexPair += fft_doPreCallback ? ".y" : ".x";
 
 									if(!batch2)	{ passStr += regIndex; passStr += " - "; passStr += regIndexPair; }
 									else		{ passStr += regIndex; passStr += " + "; passStr += regIndexPair; }
@@ -1737,8 +1783,13 @@ namespace StockhamGenerator
 			}
 
 			//Include callback parameters if callback is set
-			if (fft_doPreCallback)
+			if (fft_doPreCallback && !r2c)
 			{
+				if (c2r)
+				{
+					passStr += ", uint inOffset2";
+				}
+
 				passStr += ", __global void* userdata";
 
 				if (fft_preCallback.localMemSize > 0)
@@ -1830,7 +1881,17 @@ namespace StockhamGenerator
 
 					passStr += "\n\tif(rw && !me)\n\t{\n\t";
 					passStr += processBufRe; passStr += "["; passStr += processBufOffset; passStr += "] = ";
-					passStr += bufferInRe; passStr+= "[inOffset]";
+					
+					if (fft_doPreCallback)
+					{
+						passStr += fft_preCallback.funcname; passStr += "("; passStr += bufferInRe; passStr += ", inOffset, userdata";
+						passStr += fft_preCallback.localMemSize > 0 ? ", localmem)" : ")";
+					}
+					else
+					{
+						passStr += bufferInRe; passStr+= "[inOffset]";
+					}
+
 					if(inInterleaved) passStr += ".x;\n\t}"; else passStr += ";\n\t}";
 
 					if(length > 1)
@@ -1840,7 +1901,15 @@ namespace StockhamGenerator
 						passStr += "\n\t}\n";
 
 						passStr += "\n\tif(rw > 1)\n\t{";
-						SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, true, false, bufferInIm2, bufferInIm2, "inOffset", passStr);
+						if (fft_doPreCallback)
+						{
+							SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, true, false, bufferInIm2, bufferInIm2, "inOffset2", passStr);
+						}
+						else
+						{
+							SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, true, false, bufferInIm2, bufferInIm2, "inOffset", passStr);
+						}
+
 						passStr += "\n\t}\n\telse\n\t{";
 						SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, true, true, false, bufferInIm2, bufferInIm2, "inOffset", passStr);
 						passStr += "\n\t}\n";
@@ -1879,22 +1948,34 @@ namespace StockhamGenerator
 
 					passStr += "\n\tif((rw > 1) && !me)\n\t{\n\t";
 					passStr += processBufIm; passStr += "["; passStr += processBufOffset; passStr += "] = ";
-					passStr += bufferInRe2; passStr+= "[inOffset]";
+					
+					if (fft_doPreCallback)
+					{
+						passStr += fft_preCallback.funcname; passStr += "("; passStr += bufferInRe2; passStr += ", inOffset, userdata";
+						passStr += fft_preCallback.localMemSize > 0 ? ", localmem)" : ")";
+					}
+					else
+					{
+						passStr += bufferInRe2; passStr+= "[inOffset]";
+					}
 					if(inInterleaved) passStr += ".x;\n\t}"; else passStr += ";\n\t}";
 					passStr += "\n\tif((rw == 1) && !me)\n\t{\n\t"; passStr += processBufIm; passStr += "["; passStr += processBufOffset; passStr += "] = 0;\n\t}";
 
 
 					if(length > 1)
 					{
-						passStr += "\n\n\tif(rw)\n\t{";
-						SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, false, false, bufferInIm, bufferInIm, "inOffset", passStr);
-						passStr += "\n\t}\n";
+						if (!fft_doPreCallback)
+						{
+							passStr += "\n\n\tif(rw)\n\t{";
+							SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, false, false, bufferInIm, bufferInIm, "inOffset", passStr);
+							passStr += "\n\t}\n";
 
-						passStr += "\n\tif(rw > 1)\n\t{";
-						SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, true, false, bufferInRe2, bufferInRe2, "inOffset", passStr);
-						passStr += "\n\t}\n\telse\n\t{";
-						SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, true, true, false, bufferInRe2, bufferInRe2, "inOffset", passStr);
-						passStr += "\n\t}\n";
+							passStr += "\n\tif(rw > 1)\n\t{";
+							SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, true, false, bufferInRe2, bufferInRe2, "inOffset", passStr);
+							passStr += "\n\t}\n\telse\n\t{";
+							SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, true, true, false, bufferInRe2, bufferInRe2, "inOffset", passStr);
+							passStr += "\n\t}\n";
+						}
 
 						if(oddp)
 						{
@@ -2394,7 +2475,7 @@ namespace StockhamGenerator
 					
 					//Pass precallback information to Pass object if its the first pass. 
 					//This will be used in single kernel transforms
-					if (!r2c2r && i == 0 && !params.blockCompute && params.fft_hasPreCallback)
+					if (params.fft_hasPreCallback && !r2c && i == 0 && !params.blockCompute)
 					{
 						passes[0].SetPrecallback(params.fft_hasPreCallback, params.fft_preCallback);
 					}
@@ -2794,12 +2875,20 @@ namespace StockhamGenerator
 					{
 						if(outInterleaved)
 						{
-							str += "__global "; str += r2Type; str += " * restrict gb)\n";
+							str += "__global "; str += r2Type; str += " * restrict gb";
 						}
 						else
 						{
-							str += "__global "; str += rType; str += " * restrict gb)\n";
+							str += "__global "; str += rType; str += " * restrict gb";
 						}
+
+						//If plan has pre-callback
+						if (params.fft_hasPreCallback)
+						{
+							str += callbackstr;
+						}
+
+						str += ")\n";
 					}
 					else
 					{
@@ -2941,8 +3030,8 @@ namespace StockhamGenerator
 
 					if(inInterleaved)
 					{
-						if(!rcSimple)	{	str += "__global "; str += r2Type; str += " *lwbIn2;\n\t"; }
-											str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
+						if(!rcSimple && !params.fft_hasPreCallback)	{	str += "__global "; str += r2Type; str += " *lwbIn2;\n\t"; }
+						if(!params.fft_hasPreCallback)	{   str += "__global "; str += r2Type; str += " *lwbIn;\n\t";  }
 					}
 					else if(inReal)
 					{
@@ -2952,10 +3041,13 @@ namespace StockhamGenerator
 					}
 					else
 					{
-						if(!rcSimple)	{	str += "__global "; str += rType; str += " *lwbInRe2;\n\t"; }
-						if(!rcSimple)	{	str += "__global "; str += rType; str += " *lwbInIm2;\n\t"; }
-											str += "__global "; str += rType; str += " *lwbInRe;\n\t";
-											str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+						if(!rcSimple && !params.fft_hasPreCallback)	{	str += "__global "; str += rType; str += " *lwbInRe2;\n\t"; }
+						if(!rcSimple && !params.fft_hasPreCallback)	{	str += "__global "; str += rType; str += " *lwbInIm2;\n\t"; }
+						if (!params.fft_hasPreCallback)	
+						{  
+							str += "__global "; str += rType; str += " *lwbInRe;\n\t"; 
+							str += "__global "; str += rType; str += " *lwbInIm;\n\t"; 
+						}
 					}
 
 					if(outInterleaved)
@@ -3108,8 +3200,8 @@ namespace StockhamGenerator
 					{
 						if(inInterleaved)
 						{
-							if(!rcSimple) {	str += "lwbIn2 = (__global "; str += r2Type; str += " *)gb + iOffset2;\n\t"; }
-											str += "lwbIn  = (__global "; str += r2Type; str += " *)gb + iOffset;\n\t";
+							if(!rcSimple && !params.fft_hasPreCallback) {	str += "lwbIn2 = (__global "; str += r2Type; str += " *)gb + iOffset2;\n\t"; }
+							if(!params.fft_hasPreCallback) {	str += "lwbIn  = (__global "; str += r2Type; str += " *)gb + iOffset;\n\t"; }
 						}
 						else
 						{
@@ -3219,9 +3311,9 @@ namespace StockhamGenerator
 				}
 
 				std::string inOffset;
-				if (!r2c2r)
+				if (!r2c)
 				{
-					if (params.fft_placeness == CLFFT_INPLACE)
+					if (params.fft_placeness == CLFFT_INPLACE && !c2r)
 					{
 						inOffset += "ioOffset";
 					}
@@ -3340,8 +3432,9 @@ namespace StockhamGenerator
 					}
 					else
 					{
-						if(inInterleaved || inReal)		inBuf  = "lwbIn, lwbIn2, ";
-						else							inBuf  = "lwbInRe, lwbInRe2, lwbInIm, lwbInIm2, ";
+						if(inInterleaved || inReal)		inBuf  = (inInterleaved && params.fft_hasPreCallback) ? "gb, gb, " : "lwbIn, lwbIn2, ";
+						else							inBuf  = (params.fft_hasPreCallback) ? "gbInRe, gbInRe, gbInIm, gbInIm, " : "lwbInRe, lwbInRe2, lwbInIm, lwbInIm2, ";
+
 						if(outInterleaved || outReal)	outBuf = "lwbOut, lwbOut2";
 						else							outBuf = "lwbOutRe, lwbOutRe2, lwbOutIm, lwbOutIm2";
 					}
@@ -3390,7 +3483,14 @@ namespace StockhamGenerator
 					str += "\t";
 					str += PassName(0, fwd);
 					str += "("; str += rw; str += me;
-					str += (!r2c2r) ? inOffset : "0";
+					if (r2c)
+					{
+						str += "0";
+					}
+					else
+					{
+						str += (params.fft_hasPreCallback || !c2r) ? inOffset : "0";
+					}
 					str += ", 0, ";
 					str += inBuf; str += outBuf;
 					str += IterRegs("&");
@@ -3454,7 +3554,14 @@ namespace StockhamGenerator
 							}
 							else
 							{
-								str += (!r2c2r) ? inOffset : "0";
+								if (r2c)
+								{
+									str += "0";
+								}
+								else
+								{
+									str += (params.fft_hasPreCallback || !c2r) ? inOffset : "0";
+								}
 							}
 							str += ", ";
 							str += ldsOff;
@@ -3463,9 +3570,9 @@ namespace StockhamGenerator
 							str += ldsArgs; str += IterRegs("&"); 
 							
 							//if precalback set, append additional arguments
-							if (!r2c2r && !blockCompute && params.fft_hasPreCallback)
+							if (!r2c && !blockCompute && params.fft_hasPreCallback)
 							{
-								str += ", userdata";
+								str += c2r ?  ", iOffset2, userdata" : ", userdata";
 
 								if (params.fft_preCallback.localMemSize > 0)
 								{

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list