[clfft] 45/128: Precallback - Merge from upstream develop branch and fix broken C2C/C2R precallback

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:37 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit 3a1625cdab0cc2eeac7bccd62e84db97d22823b6
Merge: 16b6cf2 714e25d
Author: Pradeep <pradeep.rao at amd.com>
Date:   Wed Aug 26 16:10:40 2015 +0530

    Precallback - Merge from upstream develop branch and fix broken C2C/C2R precallback

 ReleaseNotes.txt                                   |   54 +
 {doc => docs}/clFFT.doxy                           |    0
 .../clFFT_2.6.0/FirePro_W9100/C2C_1D_double.csv    |   25 +
 .../clFFT_2.6.0/FirePro_W9100/C2C_1D_single.csv    |   25 +
 .../clFFT_2.6.0/FirePro_W9100/C2C_2D_double.csv    |   13 +
 .../clFFT_2.6.0/FirePro_W9100/C2C_2D_single.csv    |   13 +
 .../clFFT_2.6.0/FirePro_W9100/C2C_3D_double.csv    |    9 +
 .../clFFT_2.6.0/FirePro_W9100/C2C_3D_single.csv    |    9 +
 .../clFFT_2.6.0/FirePro_W9100/R2C_1D_double.csv    |   23 +
 .../clFFT_2.6.0/FirePro_W9100/R2C_1D_single.csv    |   25 +
 .../clFFT_2.6.0/FirePro_W9100/R2C_2D_double.csv    |   13 +
 .../clFFT_2.6.0/FirePro_W9100/R2C_2D_single.csv    |   13 +
 .../clFFT_2.6.0/FirePro_W9100/R2C_3D_double.csv    |    9 +
 .../clFFT_2.6.0/FirePro_W9100/R2C_3D_single.csv    |    9 +
 .../cuFFT_7.0/Tesla_K40/C2C_1D_double.csv          |   25 +
 .../cuFFT_7.0/Tesla_K40/C2C_1D_single.csv          |   25 +
 .../cuFFT_7.0/Tesla_K40/C2C_2D_double.csv          |   13 +
 .../cuFFT_7.0/Tesla_K40/C2C_2D_single.csv          |   13 +
 .../cuFFT_7.0/Tesla_K40/C2C_3D_double.csv          |    9 +
 .../cuFFT_7.0/Tesla_K40/C2C_3D_single.csv          |    9 +
 .../cuFFT_7.0/Tesla_K40/R2C_1D_double.csv          |   25 +
 .../cuFFT_7.0/Tesla_K40/R2C_1D_single.csv          |   25 +
 .../cuFFT_7.0/Tesla_K40/R2C_2D_double.csv          |   13 +
 .../cuFFT_7.0/Tesla_K40/R2C_2D_single.csv          |   13 +
 .../cuFFT_7.0/Tesla_K40/R2C_3D_double.csv          |    9 +
 .../cuFFT_7.0/Tesla_K40/R2C_3D_single.csv          |    9 +
 {doc => docs}/realfft_1dlen.jpg                    |  Bin
 {doc => docs}/realfft_ex_n7.jpg                    |  Bin
 {doc => docs}/realfft_ex_n8.jpg                    |  Bin
 {doc => docs}/realfft_expl_01.jpg                  |  Bin
 {doc => docs}/realfft_expl_02.jpg                  |  Bin
 {doc => docs}/realfft_expl_03.jpg                  |  Bin
 {doc => docs}/realfft_expl_04.jpg                  |  Bin
 {doc => docs}/realfft_expl_05.jpg                  |  Bin
 {doc => docs}/realfft_expl_06.jpg                  |  Bin
 {doc => docs}/realfft_expl_07.jpg                  |  Bin
 {doc => docs}/realfft_expl_08.jpg                  |  Bin
 {doc => docs}/realfft_fwdinv.jpg                   |  Bin
 src/CMakeLists.txt                                 |    4 +-
 src/client-callback/callback-client.cpp            |   21 -
 src/client/client.cpp                              |  105 +-
 src/client/client.h                                |   23 +
 src/client/openCL.misc.cpp                         |    7 +-
 src/cuFFT-client/CMakeLists.txt                    |   41 +
 src/cuFFT-client/cuFFT-client.cpp                  | 2748 ++++++++++++++++++++
 src/library/CMakeLists.txt                         |    2 +-
 src/library/fft_binary_lookup.cpp                  |    6 +-
 src/library/generator.copy.cpp                     |    5 +-
 src/library/generator.stockham.cpp                 |  272 +-
 src/library/plan.cpp                               |    4 +-
 src/scripts/perf/fftPerformanceTesting.py          |   30 +-
 src/scripts/perf/manual.pdf                        |  Bin 0 -> 180095 bytes
 src/scripts/perf/measurePerformance.py             |  325 ++-
 src/scripts/perf/plotPerformance.py                |   40 +-
 src/statTimer/statisticalTimer.GPU.cpp             |   46 +-
 src/statTimer/statisticalTimer.GPU.h               |    6 +
 src/tests/accuracy_test_pow3.cpp                   |    2 +-
 src/tests/accuracy_test_pow5.cpp                   |    2 +-
 src/tests/accuracy_test_precallback.cpp            |   66 +
 59 files changed, 3953 insertions(+), 230 deletions(-)

diff --cc src/client-callback/callback-client.cpp
index 2b1f810,0000000..3c4e67f
mode 100644,000000..100644
--- a/src/client-callback/callback-client.cpp
+++ b/src/client-callback/callback-client.cpp
@@@ -1,1545 -1,0 +1,1524 @@@
 +#include "stdafx.h"
 +#include <functional>
 +#include <cmath>
 +
 +#include "client.h"
 +#include "../library/private.h"
 +#include "openCL.misc.h"
 +#include "../statTimer/statisticalTimer.extern.h"
 +#include "../include/sharedLibrary.h"
 +#include "../include/unicode.compatibility.h"
 +
 +#include <fftw3.h>
 +
 +namespace po = boost::program_options;
 +// Fixtures for the precallback tests. SCALAR is the base multiplier; the host-side FFTW reference builders in this file add a per-element index to it.
 +#define SCALAR 10
 +// Each MULVAL* macro holds an OpenCL C function named mulval that multiplies one input element by an integer read from userdata at the same offset. NOTE(review): presumably stringified into kernel source elsewhere - the use site is outside this view.
 +#define MULVAL float2 mulval(__global void* in, uint offset, __global void* userdata)\n \
 +				{ \n \
 +				int scalar = *((__global int*)userdata + offset); \n \
 +				float2 ret = *((__global float2*)in + offset) * scalar; \n \
 +				return ret; \n \
 +				}
 +// NOTE(review): MULVAL_C2R has exactly the same body as MULVAL.
 +#define MULVAL_C2R float2 mulval(__global void* in, uint offset, __global void* userdata)\n \
 +				{ \n \
 +				int scalar = *((__global int*)userdata + offset); \n \
 +				float2 ret = *((__global float2*)in + offset) * scalar; \n \
 +				return ret; \n \
 +				}
 +// Double-precision counterpart of MULVAL (double2 instead of float2).
 +#define MULVAL_DP double2 mulval(__global void* in, uint offset, __global void* userdata)\n \
 +				{ \n \
 +				int scalar = *((__global int*)userdata + offset); \n \
 +				double2 ret = *((__global double2*)in + offset) * scalar; \n \
 +				return ret; \n \
 +				}
 +// Planar variant: real and imaginary parts come from separate buffers and the multiplier is USER_DATA.scalar1 + USER_DATA.scalar2 of the element at offset.
 +#define MULVAL_PLANAR float2 mulval(__global void* inRe, __global void* inIm, uint offset, __global void* userdata)\n \
 +				{ \n \
 +				__global USER_DATA *data = ((__global USER_DATA *)userdata + offset); \n \
 +				int scalar = (int)data->scalar1 + (int)data->scalar2; \n \
 +				float2 ret; \n \
 +				ret.x = *((__global float*)inRe + offset) * scalar; \n \
 +				ret.y = *((__global float*)inIm + offset) * scalar; \n \
 +				return ret; \n \
 +				}
 +// Double-precision counterpart of MULVAL_PLANAR.
 +#define MULVAL_PLANAR_DP double2 mulval(__global void* inRe, __global void* inIm, uint offset, __global void* userdata)\n \
 +				{ \n \
 +				__global USER_DATA *data = ((__global USER_DATA *)userdata + offset); \n \
 +				int scalar = (int)data->scalar1 + (int)data->scalar2; \n \
 +				double2 ret; \n \
 +				ret.x = *((__global double*)inRe + offset) * scalar; \n \
 +				ret.y = *((__global double*)inIm + offset) * scalar; \n \
 +				return ret; \n \
 +				}
 +// USER_DATA holds the two integer scalars read by the planar callbacks; STRUCT_USERDATA is expanded right after its definition so the host code gets the typedef as well.
 +#define STRUCT_USERDATA typedef struct USER_DATA  \
 +					   {  \
 +						int scalar1;  \
 +						int scalar2;  \
 +						} USER_DATA; 
 +STRUCT_USERDATA
 +// Relative-error comparison of an interleaved complex OpenCL result against a reference whose elements are indexed [0] = real, [1] = imaginary.
 +// For the real and then the imaginary parts it returns false when sqrt(sum of squared differences) / sqrt(sum of squared reference values) exceeds epsilon.
 +template < typename T1, typename T2>
 +bool compare(T1 *refData, std::vector< std::complex< T2 > > data, // NOTE(review): data is passed by value, so the whole vector is copied on every call
 +             size_t length, const float epsilon = 1e-6f)
 +{
 +    float error = 0.0f;
 +    T1 ref; // NOTE(review): ref[0] is accumulated below without ever being zeroed (only ref[1] is reset before the imaginary pass)
 +	T1 diff;
 +	float normRef = 0.0f;
 +	float normError = 0.0f;
 +// real parts: accumulate squared error and squared reference magnitude
 +    for(size_t i = 0; i < length; ++i)
 +    {
 +        diff[0] = refData[i][0] - data[i].real();
 +        error += (float)(diff[0] * diff[0]);
 +        ref[0] += refData[i][0] * refData[i][0];
 +    }
 +	if (error != 0)
 +	{
 +		normRef =::sqrtf((float) ref[0]);
 +		if (::fabs((float) ref[0]) < 1e-7f) // reference energy too small to normalise against: a non-zero error counts as a mismatch
 +		{
 +			return false;
 +		}
 +		normError = ::sqrtf((float) error);
 +		error = normError / normRef;
 +    
 +		if (error > epsilon)
 +			return false;
 +	}
 +
 +	//imaginary parts: same procedure, with the accumulators reset first
 +	error = 0.0f;
 +	ref[1] = 0.0;
 +	for(size_t i = 0; i < length; ++i)
 +    {
 +        diff[1] = refData[i][1] - data[i].imag();
 +        error += (float)(diff[1] * diff[1]);
 +        ref[1] += refData[i][1] * refData[i][1];
 +    }
 +	
 +	if (error == 0) // imaginary parts match exactly, nothing left to normalise
 +		return true;
 +
 +	normRef =::sqrtf((float) ref[1]);
 +    if (::fabs((float) ref[1]) < 1e-7f) // reference energy too small to normalise against: a non-zero error counts as a mismatch
 +    {
 +        return false;
 +    }
 +	normError = ::sqrtf((float) error);
 +    error = normError / normRef;
 +    
 +	if (error > epsilon)
 +		return false;
 +
 +	return true;
 +}
 +// Planar overload: compares separate real and imag arrays against a reference whose elements are indexed [0] = real, [1] = imaginary.
 +// For each part it returns false when sqrt(sum of squared differences) / sqrt(sum of squared reference values) exceeds epsilon; only the first length elements are read.
 +template < typename T1, typename T2 >
 +bool compare(T1 *refData, std::valarray< T2 > real, std::valarray< T2 > imag, // NOTE(review): both valarrays are passed by value, so they are copied on every call
 +             size_t length, const float epsilon = 1e-6f)
 +{
 +    float error = 0.0f;
 +    T1 ref; // NOTE(review): ref[0] is accumulated below without ever being zeroed (only ref[1] is reset before the imaginary pass)
 +	T1 diff;
 +	float normRef = 0.0f;
 +	float normError = 0.0f;
 +
 +	//real compare: accumulate squared error and squared reference magnitude
 +    for(size_t i = 0; i < length; ++i)
 +    {
 +        diff[0] = refData[i][0] - real[i];
 +        error += (float)(diff[0] * diff[0]);
 +        ref[0] += refData[i][0] * refData[i][0];
 +    }
 +	if (error != 0)
 +	{
 +		normRef =::sqrtf((float) ref[0]);
 +		if (::fabs((float) ref[0]) < 1e-7f) // reference energy too small to normalise against: a non-zero error counts as a mismatch
 +		{
 +			return false;
 +		}
 +		normError = ::sqrtf((float) error);
 +		error = normError / normRef;
 +    
 +		if (error > epsilon)
 +			return false;
 +	}
 +
 +	//imag compare: same procedure, with the accumulators reset first
 +	error = 0.0f;
 +    ref[1] = 0.0;
 +
 +	for(size_t i = 0; i < length; ++i)
 +    {
 +        diff[1] = refData[i][1] - imag[i];
 +        error += (float)(diff[1] * diff[1]);
 +        ref[1] += refData[i][1] * refData[i][1];
 +    }
 +	
 +	if (error == 0) // imaginary parts match exactly, nothing left to normalise
 +		return true;
 +
 +    normRef =::sqrtf((float) ref[1]);
 +    if (::fabs((float) ref[1]) < 1e-7f) // reference energy too small to normalise against: a non-zero error counts as a mismatch
 +    {
 +        return false;
 +    }
 +    normError = ::sqrtf((float) error);
 +    error = normError / normRef;
 +    
 +	if (error > epsilon)
 +		return false;
 +
 +	return true;
 +}
 +// Real-output overload: walks batch_size x lengths[2] x lengths[1] x lengths[0] elements through o_strides (o_strides[3] is the distance between batches) and compares them
 +// with the reference. Returns false when sqrt(sum of squared differences) / sqrt(sum of squared reference values) exceeds epsilon. lengths needs 3 entries, o_strides 4.
 +template < typename T1 , typename T2 >
 +bool compare(T1 *refData, std::valarray< T2 > real, // NOTE(review): real is passed by value, so the valarray is copied on every call
 +             size_t batch_size, size_t *o_strides, size_t *lengths, const float epsilon = 1e-6f)
 +{
 +    float error = 0.0f;
 +    T1 ref = 0.0;
 +	T1 diff;
 +	float normRef = 0.0f;
 +	float normError = 0.0f;
 +	
 +	//real compare
 +	for(size_t b = 0; b < batch_size; b++)
 +	{
 +		size_t p3 = b * o_strides[3];
 +		for(size_t k = 0; k < lengths[2]; k++)
 +		{
 +			size_t p2 = p3 + k * o_strides[2];
 +			for(size_t j = 0; j < lengths[1]; j++)
 +			{
 +				size_t p1 = p2 + j * o_strides[1];
 +				for(size_t i = 0; i < lengths[0]; i++)
 +				{
 +					size_t p0 = p1 + i * o_strides[0];
 +					// the OpenCL value is multiplied by the total point count before comparing; NOTE(review): presumably undoes a 1/N scale applied by the inverse transform - confirm
 +					diff = refData[p0] - (real[p0] * lengths[0] * lengths[1] * lengths[2]);
 +					error += (float)(diff * diff);
 +					ref += refData[p0] * refData[p0];
 +				}
 +			}
 +		}
 +	}
 +	
 +	if (error != 0)
 +	{
 +		normRef =::sqrtf((float) ref);
 +		if (::fabs((float) ref) < 1e-7f) // reference energy too small to normalise against: a non-zero error counts as a mismatch
 +		{
 +			return false;
 +		}
 +		normError = ::sqrtf((float) error);
 +		error = normError / normRef;
 +    
 +		if (error > epsilon)
 +			return false;
 +	}
 +
 +	return true;
 +}
 +// Builds the single-precision complex-to-complex reference for a precallback test: every inStrides[0]-th input element is set to (scalar, 0), a batched
 +// FFTW plan is executed, and the output buffer is returned. The input buffer is freed here; the returned buffer is owned by the caller.
 +fftwf_complex* get_fftwf_output(size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
 +								size_t fftBatchSize, size_t outfftBatchSize, size_t fftVectorSizePadded, clfftLayout in_layout,
 +								size_t outfftVectorSizePadded, size_t fftVectorSize, clfftDim dim, clfftDirection dir)
 +{
 +	//In FFTW last dimension has the fastest changing index, so the lengths are stored reversed
 +	int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
 +
 +	fftwf_plan refPlan;
 +	// NOTE(review): both buffers come from fftw_malloc (double-precision API) although this is the fftwf_ path; refin is released with fftw_free below
 +	fftwf_complex *refin = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftBatchSize);
 +	fftwf_complex *refout = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*outfftBatchSize);
 +	// &fftwLengths[3 - dim] selects the last dim entries of the reversed array; this relies on dim's numeric value being the rank
 +	refPlan = fftwf_plan_many_dft(dim, &fftwLengths[3 - dim], (int)batch_size, 
 +									refin, &fftwLengths[3 - dim], (int)inStrides[0], (int)fftVectorSizePadded, 
 +									refout, &fftwLengths[3 - dim], (int)outStrides[0], (int)outfftVectorSizePadded, 
 +									dir, FFTW_ESTIMATE);
 +
 +	int scalar; // NOTE(review): stays unassigned when in_layout is neither of the two cases below (the default branch only breaks)
 +	for( size_t i = 0; i < fftBatchSize; i = i + inStrides[0])
 +	{
 +		switch (in_layout)
 +		{
 +		case CLFFT_COMPLEX_INTERLEAVED:
 +			scalar = SCALAR + (int)(i % fftVectorSize);
 +			break;
 +		case CLFFT_COMPLEX_PLANAR:
 +			scalar = (int)((SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1)); // sum of two consecutive values; NOTE(review): presumably mirrors scalar1 + scalar2 of the planar callback - confirm
 +			break;
 +		default:
 +			break;
 +		}
 +
 +		refin[i][0] = (float)(1 * scalar);
 +		refin[i][1] = (float)(0 * scalar);
 +	}
 +
 +	fftwf_execute(refPlan);
 +
 +	fftw_free(refin);
 +
 +	fftwf_destroy_plan(refPlan);
 +
 +	return refout;
 +}
 +// Double-precision counterpart of get_fftwf_output: every inStrides[0]-th input element is set to (scalar, 0), a batched FFTW plan is executed,
 +// and the output buffer is returned. The input buffer is freed here; the returned buffer is owned by the caller.
 +fftw_complex* get_fftw_output(size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
 +								size_t fftBatchSize, size_t outfftBatchSize, size_t fftVectorSizePadded, clfftLayout in_layout,
 +								size_t outfftVectorSizePadded, size_t fftVectorSize, clfftDim dim, clfftDirection dir)
 +{
 +	fftw_plan refPlan;
 +
 +	fftw_complex *refin = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*fftBatchSize);
 +	fftw_complex *refout = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*outfftBatchSize);
 +	
 +	//In FFTW last dimension has the fastest changing index, so the lengths are stored reversed
 +	int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
 +	// &fftwLengths[3 - dim] selects the last dim entries of the reversed array; this relies on dim's numeric value being the rank
 +	refPlan = fftw_plan_many_dft(dim, &fftwLengths[3 - dim], (int)batch_size, 
 +									refin, &fftwLengths[3 - dim], (int)inStrides[0], (int)fftVectorSizePadded, 
 +									refout, &fftwLengths[3 - dim], (int)outStrides[0], (int)outfftVectorSizePadded, 
 +									dir, FFTW_ESTIMATE);
 +							
 +	int scalar; // NOTE(review): stays unassigned when in_layout is neither of the two cases below (the default branch only breaks)
 +	for( size_t i = 0; i < fftBatchSize; i = i + inStrides[0])
 +	{
 +		switch (in_layout)
 +		{
 +		case CLFFT_COMPLEX_INTERLEAVED:
 +			scalar = SCALAR + (int)(i % fftVectorSize);
 +			break;
 +		case CLFFT_COMPLEX_PLANAR:
 +			scalar = (int)((SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1)); // sum of two consecutive values; NOTE(review): presumably mirrors scalar1 + scalar2 of the planar callback - confirm
 +			break;
 +		default:
 +			break;
 +		}
 +
 +		refin[i][0] = 1 * scalar;
 +		refin[i][1] = 0 * scalar;
 +	}
 +
 +	fftw_execute(refPlan);
 +
 +	fftw_free(refin);
 +
 +	fftw_destroy_plan(refPlan);
 +
 +	return refout;
 +}
 +// Builds the single-precision complex-to-real reference: the input is zeroed, the first element of every batch is set to outfftVectorSize, every element is
 +// multiplied by SCALAR + its offset within the batch, and the batched c2r plan is executed. Returns the real output buffer. dir and in_layout are not read here.
 +float* get_fftwf_output_c2r(size_t* lengths, size_t *strides, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
 +								size_t fftBatchSize, size_t outfftBatchSize, size_t fftVectorSizePadded, clfftLayout in_layout,
 +								size_t outfftVectorSizePadded, size_t outfftVectorSize, clfftDim dim, clfftDirection dir, clfftResultLocation place)
 +{
 +	//In FFTW last dimension has the fastest changing index, so the lengths are stored reversed
 +	int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
 +	int inembed[3] = {(int)lengths[2], (int)lengths[1], (int)(lengths[0]/2 + 1)}; // complex input keeps lengths[0]/2 + 1 points along the fastest dimension
 +	int lsd = (place == CLFFT_INPLACE) ? (int)(lengths[0]/2 + 1)*2 : (int)(lengths[0]); // fastest-dimension row length of the real output; in place it is padded to 2 * (lengths[0]/2 + 1)
 +	int outembed[3] = {(int)lengths[2], (int)lengths[1], lsd};
 +	
 +	fftwf_plan refPlan;
 +
 +	fftwf_complex *refin = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*fftBatchSize); // NOTE(review): allocated with fftwf_malloc but released with fftw_free below
 +	float *refout = (float*) malloc(sizeof(float)*outfftBatchSize); // NOTE(review): plain malloc, so the caller presumably has to release it with free(); the release site is outside this view
 +
 +	refPlan = fftwf_plan_many_dft_c2r(dim, &fftwLengths[3 - dim], (int)batch_size, 
 +									refin, &inembed[3 - dim], (int)inStrides[0], (int)fftVectorSizePadded, 
 +									refout, &outembed[3 - dim], (int)outStrides[0], (int)outfftVectorSizePadded,
 +									FFTW_ESTIMATE);
 +
 +	// set zero (only every inStrides[0]-th element is written)
 +	for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0] )
 +	{
 +		refin[ i ][0] = 0; refin[ i ][1] = 0;
 +	}
 +
 +	// impulse test case: real part of the first element of each batch is set to outfftVectorSize
 +	for(size_t b = 0; b < batch_size; b++)
 +	{
 +		size_t p3 = b * strides[3];
 +		refin[ p3 ][0] = static_cast<float>(outfftVectorSize);
 +	}
 +
 +	int scalar;
 +	for(size_t b = 0; b < batch_size; b++) // apply the multiplier SCALAR + i to every element; NOTE(review): assuming fftVectorSizePadded == strides[3], only i == 0 holds a non-zero value
 +	{
 +		size_t p3 = b * strides[3];
 +	
 +		for( size_t i = 0; i < fftVectorSizePadded; i = i + inStrides[0])
 +		{
 +			scalar = SCALAR + i;
 +			
 +			refin[p3 + i][0] *= (float)(scalar);
 +			refin[p3 + i][1] *= (float)(scalar);
 +		}
 +	}
 +
 +	fftwf_execute(refPlan);
 +
 +	fftw_free(refin);
 +
 +	fftwf_destroy_plan(refPlan);
 +
 +	return refout;
 +}
 +// Stream extraction for clfftLayout: reads an unsigned integer from the stream and converts it to the enum, returning the stream.
 +//	This is used with the program_options class so that the user can type an integer on the command line
 +//	and we store it into an enum variable
 +template<class _Elem, class _Traits>
 +std::basic_istream<_Elem, _Traits> & operator>> (std::basic_istream<_Elem, _Traits> & stream, clfftLayout & layout)
 +{
 +	cl_uint tmp;
 +	stream >> tmp;
 +	layout = clfftLayout(tmp); // converted without a range check and without testing whether the read succeeded
 +	return stream;
 +}
 +// Throws std::runtime_error when either layout is not one of the five clfftLayout values listed below, or when an in-place transform pairs two layouts that the
 +// checks below reject; returns normally otherwise. Layout pairs are only examined when place == CLFFT_INPLACE and the two layouts differ.
 +void validateDataLayout(clfftLayout in_layout, clfftLayout out_layout, clfftResultLocation place)
 +{
 +	switch( in_layout )
 +	{
 +	case CLFFT_COMPLEX_INTERLEAVED:
 +	case CLFFT_COMPLEX_PLANAR:
 +	case CLFFT_HERMITIAN_INTERLEAVED:
 +	case CLFFT_HERMITIAN_PLANAR:
 +	case CLFFT_REAL:
 +		break;
 +	default:
 +		//	Don't recognize input layout
 +		{
 +			throw std::runtime_error( "Un-recognized data layout" );
 +		}
 +		break;
 +	}
 +
 +	switch( out_layout )
 +	{
 +	case CLFFT_COMPLEX_INTERLEAVED:
 +	case CLFFT_COMPLEX_PLANAR:
 +	case CLFFT_HERMITIAN_INTERLEAVED:
 +	case CLFFT_HERMITIAN_PLANAR:
 +	case CLFFT_REAL:
 +		break;
 +	default:
 +		//	Don't recognize output layout
 +		{
 +			throw std::runtime_error( "Un-recognized data layout" );
 +		}
 +		break;
 +	}
 +
 +	if (( place == CLFFT_INPLACE ) &&  ( in_layout != out_layout )) 
 +	{
 +		switch( in_layout )
 +		{
 +		case CLFFT_COMPLEX_INTERLEAVED:
 +			{
 +				if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
 +				{
 +					throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
 +				}
 +				break;
 +			}
 +		case CLFFT_COMPLEX_PLANAR:
 +			{
 +				if( (out_layout == CLFFT_COMPLEX_INTERLEAVED) || (out_layout == CLFFT_HERMITIAN_INTERLEAVED) )
 +				{
 +					throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
 +				}
 +				break;
 +			}
 +		case CLFFT_HERMITIAN_INTERLEAVED:
 +			{
 +				if( out_layout != CLFFT_REAL ) // only REAL output is accepted in place; NOTE(review): the message below also fires for interleaved outputs, so its wording does not match every rejected case
 +				{
 +					throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
 +				}
 +				break;
 +			}
 +		case CLFFT_HERMITIAN_PLANAR:
 +			{
 +				// every differing output layout is rejected in place; NOTE(review): the message names planar->interleaved regardless of the actual output layout
 +				throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
 +				break;
 +			}
 +		case CLFFT_REAL:
 +			{
 +				if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) ) // NOTE(review): the input is real here, yet the message below says interleaved->planar
 +				{
 +					throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
 +				}
 +				break;
 +			}
 +		default:
 +			{
 +				throw std::runtime_error( "Input layout format not yet supported" );
 +			}
 +			break;
 +		}
 +	}
 +}
 +// FFT data initialization: normalises lengths, derives the dimension and the element/byte sizes (all written through the pointer parameters), sets up the OpenCL
 +// context, queue and buffers through initializeCL / createOpenCLCommandQueue, and uploads a host-generated test input chosen by in_layout. T is the real scalar type.
 +template < typename T >
 +cl_int dataInitialize(size_t* lengths, clfftDim *dim, size_t batch_size,
 +				  const size_t *inStrides, size_t *strides, const size_t *outStrides, size_t *o_strides,
 +				  size_t *fftBatchSize, size_t *outfftBatchSize, size_t *fftVectorSizePadded, clfftLayout in_layout, clfftLayout out_layout,
 +				  size_t *outfftVectorSizePadded, size_t *fftVectorSize, size_t *outfftVectorSize,
 +				  clfftResultLocation place, size_t *size_of_output_buffers_in_bytes,
 +				  size_t *size_of_input_buffers_in_bytes, cl_mem *input_cl_mem_buffers, cl_mem *output_cl_mem_buffers,
 +				  cl_context *context, cl_command_queue *queue, 
 +				  cl_device_type deviceType, cl_int deviceId, cl_int platformId, cl_uint command_queue_flags)
 +{
 +	cl_event outEvent = NULL; // NOTE(review): receives an event from every write below; no release of it is visible in this function
 +	cl_uint number_of_output_buffers = 0;
 +	const size_t max_dimensions = 3;
 +	std::vector< cl_device_id > device_id;
 +	
 +	for (unsigned u = 0; u < max_dimensions; ++u) { // a length of 0 is treated as 1
 +		if (0 != lengths[u])
 +			continue;
 +		lengths[u] = 1;
 +	}
 +// *dim is only ever raised here (to 2D, then 3D); otherwise it keeps the value the caller passed in
 +	if( lengths[ 1 ] > 1 )
 +	{
 +		*dim	= CLFFT_2D;
 +	}
 +	if( lengths[ 2 ] > 1 )
 +	{
 +		*dim	= CLFFT_3D;
 +	}
 +// copy the caller's strides into the writable arrays: four entries each, the last one being the distance between batches
 +	strides[ 0 ] = inStrides[0];
 +	strides[ 1 ] = inStrides[1];
 +	strides[ 2 ] = inStrides[2];
 +	strides[ 3 ] = inStrides[3];
 +
 +	o_strides[ 0 ] = outStrides[0];
 +	o_strides[ 1 ] = outStrides[1];
 +	o_strides[ 2 ] = outStrides[2];
 +	o_strides[ 3 ] = outStrides[3];
 +// sizes in elements: logical points per transform, padded points per transform (strides[3]) and padded points for the whole batch
 +	*fftVectorSize = lengths[0] * lengths[1] * lengths[2];
 +	*fftVectorSizePadded = strides[3];
 +	*fftBatchSize = *fftVectorSizePadded * batch_size;
 +
 +	if(place == CLFFT_INPLACE)
 +	{
 +		*outfftVectorSize = *fftVectorSize;
 +		*outfftVectorSizePadded = *fftVectorSizePadded;
 +		*outfftBatchSize = *fftBatchSize;
 +	}
 +	else
 +	{
 +		*outfftVectorSize = lengths[0] * lengths[1] * lengths[2];
 +		*outfftVectorSizePadded = o_strides[3];
 +		*outfftBatchSize = *outfftVectorSizePadded * batch_size;
 +	}
 +
 +	// Real input or real output: the output sizes always come from o_strides, even when the transform is in place
 +	if( (in_layout == CLFFT_REAL) || (out_layout == CLFFT_REAL) )
 +	{
 +		*fftVectorSizePadded = strides[3];
 +		*fftBatchSize = *fftVectorSizePadded * batch_size;
 +
 +		*outfftVectorSizePadded = o_strides[3];
 +		*outfftBatchSize = *outfftVectorSizePadded * batch_size;
 +
 +		*fftVectorSize = lengths[0] * lengths[1] * lengths[2];
 +		*outfftVectorSize = *fftVectorSize;
 +	}
 +// output buffer count and bytes per buffer; NOTE(review): there is no default case, so an unlisted layout leaves the count at 0 and the size unwritten
 +	switch( out_layout )
 +	{
 +	case CLFFT_COMPLEX_INTERLEAVED:
 +		number_of_output_buffers = 1;
 +		*size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof( std::complex< T > );
 +		break;
 +	case CLFFT_COMPLEX_PLANAR:
 +		number_of_output_buffers = 2;
 +		*size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof(T);
 +		break;
 +	case CLFFT_HERMITIAN_INTERLEAVED:
 +		number_of_output_buffers = 1;
 +		*size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof( std::complex< T > );
 +		break;
 +	case CLFFT_HERMITIAN_PLANAR:
 +		number_of_output_buffers = 2;
 +		*size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof(T);
 +		break;
 +	case CLFFT_REAL:
 +		number_of_output_buffers = 1;
 +		*size_of_output_buffers_in_bytes = *outfftBatchSize * sizeof(T);
 +		break;
 +	}
 +
 +
 +	// Fill the input buffers (each case also creates the context, queue and device buffers before uploading)
 +	switch( in_layout )
 +	{
 +	case CLFFT_COMPLEX_INTERLEAVED:
 +		{
 +			//	This call creates our openCL context and sets up our devices; expected to throw on error
 +			*size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( std::complex< T > );
 +
 +			device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
 +			createOpenCLCommandQueue( *context,
 +				command_queue_flags, *queue,
 +				device_id,
 +				*size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
 +				*size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
 +
 +			std::vector< std::complex< T > > input( *fftBatchSize );
 +
 +			// set zero
 +			for( cl_uint i = 0; i < *fftBatchSize; ++i )
 +			{
 +				input[ i ] = 0;
 +			}
 +
 +			// constant input: every logical element is set to 1, padding between strided elements stays 0
 +			for(size_t b = 0; b < batch_size; b++)
 +			{
 +				size_t p3 = b * strides[3];
 +				for(size_t k = 0; k < lengths[2]; k++)
 +				{
 +					size_t p2 = p3 + k * strides[2];
 +					for(size_t j = 0; j < lengths[1]; j++)
 +					{
 +						size_t p1 = p2 + j * strides[1];
 +						for(size_t i = 0; i < lengths[0]; i++)
 +						{
 +							size_t p0 = p1 + i * strides[0];
 +							input[p0] = 1;
 +						}
 +					}
 +				}
 +			}
 +
 +			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &input[ 0 ],
 +				0, NULL, &outEvent ),
 +				"clEnqueueWriteBuffer failed" );
 +
 +		}
 +		break;
 +	case CLFFT_COMPLEX_PLANAR:
 +		{
 +			//	This call creates our openCL context and sets up our devices; expected to throw on error
 +			*size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( T );
 +
 +			device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
 +			createOpenCLCommandQueue( *context,
 +				command_queue_flags, *queue,
 +				device_id,
 +				*size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
 +				*size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
 +
 +			std::vector< T > real( *fftBatchSize );
 +			std::vector< T > imag( *fftBatchSize );
 +
 +			// set zero
 +			for( cl_uint i = 0; i < *fftBatchSize; ++i )
 +			{
 +				real[ i ] = 0;
 +				imag[ i ] = 0;
 +			}
 +
 +			// constant input: every logical real element is set to 1, imaginary parts and padding stay 0
 +			for(size_t b = 0; b < batch_size; b++)
 +			{
 +				size_t p3 = b * strides[3];
 +				for(size_t k = 0; k < lengths[2]; k++)
 +				{
 +					size_t p2 = p3 + k * strides[2];
 +					for(size_t j = 0; j < lengths[1]; j++)
 +					{
 +						size_t p1 = p2 + j * strides[1];
 +						for(size_t i = 0; i < lengths[0]; i++)
 +						{
 +							size_t p0 = p1 + i * strides[0];
 +							real[p0] = 1;
 +						}
 +					}
 +				}
 +			}
 +
 +			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &real[ 0 ],
 +				0, NULL, &outEvent ),
 +				"clEnqueueWriteBuffer failed" );
 +			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &imag[ 0 ],
 +				0, NULL, &outEvent ),
 +				"clEnqueueWriteBuffer failed" );
 +		}
 +		break;
 +	case CLFFT_HERMITIAN_INTERLEAVED:
 +		{
 +			//	This call creates our openCL context and sets up our devices; expected to throw on error
 +			*size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( std::complex< T > );
 +
 +			device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
 +			createOpenCLCommandQueue( *context,
 +				command_queue_flags, *queue,
 +				device_id,
 +				*size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
 +				*size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
 +
 +			std::vector< std::complex< T > > input( *fftBatchSize );
 +
 +			// set zero
 +			for( cl_uint i = 0; i < *fftBatchSize; ++i )
 +			{
 +				input[ i ] = 0;
 +			}
 +
 +			// impulse test case: only the first element of each batch is non-zero, with value *outfftVectorSize
 +			for(size_t b = 0; b < batch_size; b++)
 +			{
 +				size_t p3 = b * strides[3];
 +				input[p3] = static_cast<T>(*outfftVectorSize);
 +
 +			}
 +
 +			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &input[ 0 ],
 +				0, NULL, &outEvent ),
 +				"clEnqueueWriteBuffer failed" );
 +		}
 +		break;
 +	case CLFFT_HERMITIAN_PLANAR:
 +		{
 +			//	This call creates our openCL context and sets up our devices; expected to throw on error
 +			*size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( T );
 +
 +			device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
 +			createOpenCLCommandQueue( *context,
 +				command_queue_flags, *queue,
 +				device_id,
 +				*size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
 +				*size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
 +
 +			std::vector< T > real( *fftBatchSize );
 +			std::vector< T > imag( *fftBatchSize );
 +
 +			// set zero
 +			for( cl_uint i = 0; i < *fftBatchSize; ++i )
 +			{
 +				real[ i ] = 0;
 +				imag[ i ] = 0;
 +			}
 +
 +			// impulse test case: only the first real element of each batch is non-zero, with value *outfftVectorSize
 +			for(size_t b = 0; b < batch_size; b++)
 +			{
 +				size_t p3 = b * strides[3];
 +				real[p3] = static_cast<T>(*outfftVectorSize);
 +			}
 +
 +			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &real[ 0 ],
 +				0, NULL, &outEvent ),
 +				"clEnqueueWriteBuffer failed" );
 +			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &imag[ 0 ],
 +				0, NULL, &outEvent ),
 +				"clEnqueueWriteBuffer failed" );
 +		}
 +		break;
 +	case CLFFT_REAL:
 +		{
 +			//	This call creates our openCL context and sets up our devices; expected to throw on error
 +			*size_of_input_buffers_in_bytes = *fftBatchSize * sizeof( T );
 +
 +			device_id = initializeCL( deviceType, deviceId, platformId, *context, false );
 +			createOpenCLCommandQueue( *context,
 +				command_queue_flags, *queue,
 +				device_id,
 +				*size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
 +				*size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
 +
 +			std::vector< T > real( *fftBatchSize );
 +
 +			// set zero
 +			for( cl_uint i = 0; i < *fftBatchSize; ++i )
 +			{
 +				real[ i ] = 0;
 +			}
 +
 +			// constant input: every logical element is set to 1, padding between strided elements stays 0
 +			for(size_t b = 0; b < batch_size; b++)
 +			{
 +				size_t p3 = b * strides[3];
 +				for(size_t k = 0; k < lengths[2]; k++)
 +				{
 +					size_t p2 = p3 + k * strides[2];
 +					for(size_t j = 0; j < lengths[1]; j++)
 +					{
 +						size_t p1 = p2 + j * strides[1];
 +						for(size_t i = 0; i < lengths[0]; i++)
 +						{
 +							size_t p0 = p1 + i * strides[0];
 +							real[p0] = 1;
 +						}
 +					}
 +				}
 +			}
 +
 +			OPENCL_V_THROW( clEnqueueWriteBuffer( *queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, *size_of_input_buffers_in_bytes, &real[ 0 ],
 +				0, NULL, &outEvent ),
 +				"clEnqueueWriteBuffer failed" );
 +		}
 +		break;
 +	default:
 +		{
 +			throw std::runtime_error( "Input layout format not yet supported" );
 +		}
 +		break;
 +	}
 +
 +	return 0; // the only return path; NOTE(review): failures are presumably reported through exceptions (the throw above and OPENCL_V_THROW, which is defined outside this view)
 +}
 +
 +//Compare output with reference C/FFTW code
 +template < typename T >
 +void compareWithReference(clfftLayout in_layout, clfftLayout out_layout, size_t outfftBatchSize, clfftResultLocation place, clfftPrecision precision,
 +						  cl_command_queue queue, cl_mem *input_cl_mem_buffers, size_t size_of_input_buffers_in_bytes, size_t size_of_output_buffers_in_bytes,
 +						  cl_mem *BuffersOut, size_t* lengths, size_t * strides, const size_t *inStrides, const size_t *outStrides, size_t *o_strides,
 +						  size_t batch_size, size_t fftBatchSize, size_t fftVectorSizePadded, size_t outfftVectorSize,
 +						  size_t outfftVectorSizePadded, size_t fftVectorSize, clfftDim dim, clfftDirection dir, bool hasPrecallback)
 +{
 +	bool checkflag= false;
 +
 +	switch( out_layout )
 +	{
 +	case CLFFT_HERMITIAN_INTERLEAVED:
 +	case CLFFT_COMPLEX_INTERLEAVED:
 +		{
 +			std::vector< std::complex< T > > output( outfftBatchSize );
 +
 +			if( place == CLFFT_INPLACE )
 +			{
 +				OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &output[ 0 ],
 +					0, NULL, NULL ),
 +					"Reading the result buffer failed" );
 +			}
 +			else
 +			{
 +				OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &output[ 0 ],
 +					0, NULL, NULL ),
 +					"Reading the result buffer failed" );
 +			}
 +
 +			//check output data
 +			if (hasPrecallback)
 +			{
 +				switch(in_layout)
 +				{
 +				case CLFFT_HERMITIAN_INTERLEAVED:
 +				case CLFFT_COMPLEX_INTERLEAVED:
 +					{
 +						if (precision == CLFFT_SINGLE)
 +						{
 +							fftwf_complex *refout;
 +
 +							refout = get_fftwf_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
 +														in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
 +
 +							if (!compare<fftwf_complex, T>(refout, output, outfftBatchSize))
 +								checkflag = true;
 +
- 							//for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
- 							//{
- 							//	std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
- 							//}
- 							
- 							/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
- 							{
- 								std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
- 							}*/
- 
 +							fftwf_free(refout);
 +						}
 +						else if (precision == CLFFT_DOUBLE)
 +						{
 +							fftw_complex *refout;
 +							
 +							refout = get_fftw_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
 +														in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
 +
 +							if (!compare<fftw_complex, T>(refout, output, outfftBatchSize))
 +								checkflag = true;
 +
- 							/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
- 							{
- 								std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
- 							}*/
- 							
 +							fftw_free(refout);
 +						}
 +					}
 +					break;
 +				}
 +			}
 +			else
 +			{
 +				for( cl_uint i = 0; i < outfftBatchSize; ++i )
 +				{
 +					if (0 == (i % outfftVectorSizePadded))
 +					{
 +						if (output[i].real() != outfftVectorSize)
 +						{
 +							checkflag = true;
 +							break;
 +						}
 +							
 +					}
 +					else
 +					{
 +						if (output[ i ].real() != 0)
 +						{
 +							checkflag = true;
 +							break;
 +						}
 +					}
 +
 +					if (output[ i ].imag() != 0)
 +					{
 +						checkflag = true;
 +						break;
 +					}
 +				}
 +			}
 +		}
 +		break;
 +	case CLFFT_HERMITIAN_PLANAR:
 +	case CLFFT_COMPLEX_PLANAR:
 +		{
 +			std::valarray< T > real( outfftBatchSize );
 +			std::valarray< T > imag( outfftBatchSize );
 +
 +			if( place == CLFFT_INPLACE )
 +			{
 +				OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
 +					0, NULL, NULL ),
 +					"Reading the result buffer failed" );
 +				OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
 +					0, NULL, NULL ),
 +					"Reading the result buffer failed" );
 +			}
 +			else
 +			{
 +				OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
 +					0, NULL, NULL ),
 +					"Reading the result buffer failed" );
 +				OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 1 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &imag[ 0 ],
 +					0, NULL, NULL ),
 +					"Reading the result buffer failed" );
 +			}
 +
 +			//  Check output data
 +			if (hasPrecallback)
 +			{
 +				switch(in_layout)
 +				{
 +				case CLFFT_COMPLEX_PLANAR:
 +					{
 +						if (precision == CLFFT_SINGLE)
 +						{
 +							fftwf_complex *refout;
 +
 +							refout = get_fftwf_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
 +														in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
 +
 +							if (!compare<fftwf_complex, T>(refout, real, imag, outfftBatchSize))
 +								checkflag = true;
 +
 +							/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
 +							{
 +								std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
 +							}*/
 +							
 +							fftwf_free(refout);
 +						}
 +						else if (precision == CLFFT_DOUBLE)
 +						{
 +							fftw_complex *refout;
 +
 +							refout = get_fftw_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
 +														in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
 +
 +							if (!compare<fftw_complex, T>(refout, real, imag, outfftBatchSize))
 +								checkflag = true;
 +
 +							/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
 +							{
 +								std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
 +							}*/
 +							
 +							fftw_free(refout);
 +						}
 +					}
 +					break;
 +				}
 +			}
 +			else
 +			{
 +				for( cl_uint i = 0; i < outfftBatchSize; ++i )
 +				{
 +					if (0 == (i % outfftVectorSizePadded))
 +					{
 +						if (real[i] != outfftVectorSize)
 +						{
 +							checkflag = true;
 +							break;
 +						}
 +					}
 +					else
 +					{
 +						if (real[i] != 0)
 +						{
 +							checkflag = true;
 +							break;
 +						}
 +					}
 +
 +					if (imag[i] != 0)
 +					{
 +						checkflag = true;
 +						break;
 +					}
 +				}
 +			}
 +		}
 +		break;
 +	case CLFFT_REAL:
 +		{
 +			std::valarray< T > real( outfftBatchSize );
 +
 +			if( place == CLFFT_INPLACE )
 +			{
 +				OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
 +					0, NULL, NULL ),
 +					"Reading the result buffer failed" );
 +			}
 +			else
 +			{
 +				OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
 +					0, NULL, NULL ),
 +					"Reading the result buffer failed" );
 +			}
 +
 +			//  Check output data
 +			if (hasPrecallback)
 +			{
 +				if (precision == CLFFT_SINGLE)
 +				{
 +					float *refout;
 +
 +					refout = get_fftwf_output_c2r(lengths, strides,  inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
 +												in_layout, outfftVectorSizePadded, outfftVectorSize, dim, dir, place);
 +
 +					if (!compare<float, T>(refout, real, batch_size, o_strides, lengths))
 +						checkflag = true;
 +
 +					/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
 +					{
 +						std::cout << "i " << i << " refreal " << refout[i] << " clreal " << (real[i] * outfftVectorSize) << std::endl;
 +					}*/
 +					
 +					if (refout)
 +						free(refout);
 +				}
 +			}
 +			else
 +			{
 +				for(size_t b = 0; b < batch_size; b++)
 +				{
 +					size_t p3 = b * o_strides[3];
 +					for(size_t k = 0; k < lengths[2]; k++)
 +					{
 +						size_t p2 = p3 + k * o_strides[2];
 +						for(size_t j = 0; j < lengths[1]; j++)
 +						{
 +							size_t p1 = p2 + j * o_strides[1];
 +							for(size_t i = 0; i < lengths[0]; i++)
 +							{
 +								size_t p0 = p1 + i * o_strides[0];
 +
 +								if (real[p0] != 1)
 +								{
 +									checkflag = true;
 +									break;
 +								}							
 +							}
 +						}
 +					}
 +				}
 +			}
 +		}
 +		break;
 +	default:
 +		{
 +			throw std::runtime_error( "Input layout format not yet supported" );
 +		}
 +		break;
 +	}
 +
 +	if (checkflag)
 +	{
 +		std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
 +	}
 +	else
 +	{
 +		std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
 +	}
 +}
 +
 +template < typename T >
 +int transform( size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
 +				clfftLayout in_layout, clfftLayout out_layout,
 +				clfftResultLocation place, clfftPrecision precision, clfftDirection dir,
 +				cl_device_type deviceType, cl_int deviceId, cl_int platformId, bool printInfo,
 +				cl_uint command_queue_flags, cl_uint profile_count,
 +				std::auto_ptr< clfftSetupData > setupData,
 +				bool hasPrecallback)
 +{
 +	//	Our command line does not specify what dimension FFT we wish to transform; we decode
 +	//	this from the lengths that the user specifies for X, Y, Z.  A length of one means that
 +	//	the user does not want that dimension.
 +
 +	size_t strides[ 4 ];
 +	size_t o_strides[ 4 ];
 +	size_t fftVectorSize = 0;
 +	size_t fftVectorSizePadded = 0;
 +	size_t fftBatchSize = 0;
 +	size_t outfftVectorSize = 0;
 +	size_t outfftVectorSizePadded = 0;
 +	size_t outfftBatchSize = 0;
 +	size_t size_of_input_buffers_in_bytes = 0;
 +	size_t size_of_output_buffers_in_bytes = 0;
 +	
 +	clfftDim	dim = CLFFT_1D;
 +	cl_mem input_cl_mem_buffers [2] = { NULL, NULL };
 +	cl_mem output_cl_mem_buffers[2] = { NULL, NULL };
 +	cl_context context;
 +	cl_command_queue queue;
 +	cl_event outEvent = NULL;
 +	clfftPlanHandle plan_handle;
 +
 +	//Validate input and output data layout
 +	validateDataLayout(in_layout, out_layout, place);
 +	
- 	if (hasPrecallback && !(in_layout == CLFFT_COMPLEX_INTERLEAVED || in_layout == CLFFT_COMPLEX_PLANAR || in_layout == CLFFT_HERMITIAN_INTERLEAVED || in_layout == CLFFT_HERMITIAN_PLANAR))
- 	{
- 		terr << _T("Pre-callback feature is currently supported only for Complex-Complex and Complex-Real FFT " ) << std::endl;
- 		return 1;
- 	}
- 
 +	//Initializations
 +	OPENCL_V_THROW( dataInitialize<T>(lengths, &dim, batch_size, inStrides, strides, outStrides, o_strides, &fftBatchSize, &outfftBatchSize, 
 +						&fftVectorSizePadded, in_layout, out_layout, &outfftVectorSizePadded, &fftVectorSize, &outfftVectorSize, place, 
 +						&size_of_output_buffers_in_bytes, &size_of_input_buffers_in_bytes, input_cl_mem_buffers, output_cl_mem_buffers, &context, &queue,
 +						deviceType, deviceId, platformId, command_queue_flags), "Data Initialization failed");
 +
 +	//	Discover and load the timer module if present
 +	void* timerLibHandle = LoadSharedLibrary( "lib", "StatTimer", false );
 +	if( timerLibHandle == NULL )
 +	{
 +		terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl;
 +	}
 +
 +
 +	//	Timer module discovered and loaded successfully
 +	//	Initialize function pointers to call into the shared module
 +	PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) );
 +
 +	//	Create and initialize our timer class, if the external timer shared library loaded
 +	baseStatTimer* timer = NULL;
 +	size_t	clFFTID = 0;
 +	if( get_timer )
 +	{
 +		timer = get_timer( CLFFT_GPU );
 +		timer->Reserve( 1, profile_count );
 +		timer->setNormalize( true );
 +
 +		clFFTID	= timer->getUniqueID( "clFFT", 0 );
 +	}
 +
 +	OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
 +	OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" );
 +
 +	//	Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
 +	OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
 +	OPENCL_V_THROW( clfftSetLayout( plan_handle, in_layout, out_layout ), "clfftSetLayout failed" );
 +	OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batch_size ), "clfftSetPlanBatchSize failed" );
 +	OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
 +
 +	OPENCL_V_THROW (clfftSetPlanInStride  ( plan_handle, dim, strides ), "clfftSetPlanInStride failed" );
 +	OPENCL_V_THROW (clfftSetPlanOutStride ( plan_handle, dim, o_strides ), "clfftSetPlanOutStride failed" );
 +	OPENCL_V_THROW (clfftSetPlanDistance  ( plan_handle, strides[ 3 ], o_strides[ 3 ]), "clfftSetPlanDistance failed" );
 +
 +	// Set backward scale factor to 1.0 for non real FFTs to do correct output checks
 +	if(dir == CLFFT_BACKWARD && in_layout != CLFFT_REAL && out_layout != CLFFT_REAL)
 +		OPENCL_V_THROW (clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, (cl_float)1.0f ), "clfftSetPlanScale failed" );
 +
 +	//Check for Precallback
 +	//Currently the test only covers 1D
 +	if (hasPrecallback)
 +	{
 +		cl_mem userdata;
 +
 +		//C2C 1D Interleaved 
 +		if (in_layout == CLFFT_COMPLEX_INTERLEAVED )
 +		{
 +			char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL) : STRINGIFY(MULVAL_DP);
 +
 +			int *h_userdata = (int*)malloc(sizeof(int)*fftBatchSize);
 +			for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
 +			{
 +				h_userdata[ i ] = SCALAR + (i % fftVectorSize);
 +			}
 +			userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * fftBatchSize, (void*)h_userdata, NULL);
 +
 +			//Register the callback
 +			OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
 +		}
 +		else if (in_layout == CLFFT_HERMITIAN_INTERLEAVED)
 +		{	
 +			char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_C2R) : STRINGIFY(MULVAL_DP);
 +			
 +			int *h_userdata = (int*)malloc(sizeof(int)*fftBatchSize);
 +			for(size_t b = 0; b < batch_size; b++)
 +			{
 +				size_t p3 = b * strides[3];
 +	
 +				for( size_t i = 0; i < fftVectorSizePadded; i = i + inStrides[0])
 +				{
 +					h_userdata[ p3 + i ] = SCALAR + i;
 +				}
 +			}
 +			userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * fftBatchSize, (void*)h_userdata, NULL);
 +
 +			//Register the callback
 +			OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
 +		}
 +		else if (in_layout == CLFFT_COMPLEX_PLANAR)
 +		{	
 +			//C2C PLANAR 
 +			char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_PLANAR) : STRINGIFY(MULVAL_PLANAR_DP);
 +			USER_DATA *h_userdata = (USER_DATA*)malloc(sizeof(USER_DATA) * fftBatchSize);
 +			for( size_t i = 0; i < fftBatchSize; i = i + inStrides[0])
 +			{
 +				h_userdata[i].scalar1 = SCALAR + (int)(i % fftVectorSize);
 +				h_userdata[i].scalar2 = SCALAR + (int)(i % fftVectorSize) + 1;
 +			}
 +			userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA) * fftBatchSize, (void*)h_userdata, NULL);
 +
 +			//Register the callback
 +			OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, STRINGIFY(STRUCT_USERDATA), 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
 +		}
 +		else if (in_layout == CLFFT_HERMITIAN_PLANAR)
 +		{	
 +			//C2R PLANAR (Hermitian planar input)
 +			char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_PLANAR) : STRINGIFY(MULVAL_PLANAR_DP);
 +			USER_DATA *h_userdata = (USER_DATA*)malloc(sizeof(USER_DATA) * fftBatchSize);
 +			for(size_t b = 0; b < batch_size; b++)
 +			{
 +				size_t p3 = b * strides[3];
 +	
 +				for( size_t i = 0; i < fftVectorSizePadded; i = i + inStrides[0])
 +				{
 +					h_userdata[p3 + i].scalar1 = SCALAR + i ;
 +					h_userdata[p3 + i].scalar2 = 0;
 +				}
 +			}
 +			userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA) * fftBatchSize, (void*)h_userdata, NULL);
 +
 +			//Register the callback
 +			OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, STRINGIFY(STRUCT_USERDATA), 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
 +		}
 +	}
 +
 +	OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &queue, NULL, NULL ), "clfftBakePlan failed" );
 +
 +	//get the buffersize
 +	size_t buffersize=0;
 +	OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" );
 +
 +	//allocate the intermediate buffer
 +	cl_mem clMedBuffer=NULL;
 +
 +	if (buffersize)
 +	{
 +		cl_int medstatus;
 +		clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
 +		OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
 +	}
 +
 +
 +	cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &output_cl_mem_buffers[ 0 ];
 +
 +	Timer tr;
 +	tr.Start();
 +
 +	//	Loop as many times as the user specifies to average out the timings
 +	for( cl_uint i = 0; i < profile_count; ++i )
 +	{
 +		if( timer ) timer->Start( clFFTID );
 +
 +		OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent,
 +			&input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ),
 +			"clfftEnqueueTransform failed" );
 +
 +		if( timer ) timer->Stop( clFFTID );
 +	}
 +	OPENCL_V_THROW( clFinish( queue ), "clFinish failed" );
 +	if(clMedBuffer) clReleaseMemObject(clMedBuffer);
 +
 +	double wtime = tr.Sample()/((double)profile_count);
 +	size_t totalLen = 1;
 +	for(int i=0; i<dim; i++) totalLen *= lengths[i];
 +	double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
 +
 +	if(profile_count > 1)
 +	{
 +		tout << "\nExecution wall time: " << 1000.0*wtime << " ms" << std::endl;
 +		tout << "Execution gflops: " << ((double)batch_size * opsconst)/(1000000000.0*wtime) << std::endl;
 +	}
 +
 +	if( timer && (command_queue_flags & CL_QUEUE_PROFILING_ENABLE) )
 +	{
 +		//	Remove all timings that are outside of 2 stddev (keep 65% of samples); we ignore outliers to get a more consistent result
 +		timer->pruneOutliers( 2.0 );
 +		timer->Print( );
 +		timer->Reset( );
 +	}
 +
 +	/*****************/
 +	FreeSharedLibrary( timerLibHandle );
 +
 +		// Read and check output data
 +	// This check is not valid if the FFT is executed multiple times inplace.
 +	//
 +	if (( place == CLFFT_OUTOFPLACE )
 +	||  ( profile_count == 1))
 +	{
 +		compareWithReference<T>(in_layout, out_layout, outfftBatchSize, place, precision, queue, input_cl_mem_buffers, size_of_input_buffers_in_bytes, size_of_output_buffers_in_bytes,
 +								BuffersOut, lengths, strides, inStrides, outStrides, o_strides, batch_size, fftBatchSize, fftVectorSizePadded, outfftVectorSize, outfftVectorSizePadded, fftVectorSize,
 +								dim, dir, hasPrecallback);
 +	}
 +
 +	OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
 +	OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );
 +
 +	cleanupCL( &context, &queue, countOf( input_cl_mem_buffers ), input_cl_mem_buffers, countOf( output_cl_mem_buffers ), output_cl_mem_buffers, &outEvent );
 +	return 0;
 +}
 +
 +int main(int argc, char **argv)
 +{
 +	//	OpenCL state 
 +	cl_device_type		deviceType	= CL_DEVICE_TYPE_ALL;
 +	cl_int				deviceId = 0;
 +	cl_int				platformId = 0;
 +
 +	//	FFT state
 +
 +	clfftResultLocation	place = CLFFT_INPLACE;
 +	clfftLayout	inLayout  = CLFFT_COMPLEX_INTERLEAVED;
 +	clfftLayout	outLayout = CLFFT_COMPLEX_INTERLEAVED;
 +	clfftPrecision precision = CLFFT_SINGLE;
 +	clfftDirection dir = CLFFT_FORWARD;
 +	size_t lengths[ 3 ] = {1,1,1};
 +	size_t iStrides[ 4 ] = {0,0,0,0};
 +	size_t oStrides[ 4 ] = {0,0,0,0};
 +	cl_uint profile_count = 0;
 +
 +	cl_uint command_queue_flags = 0;
 +	size_t batchSize = 1;
 +
 +	//callback
 +	bool hasPrecallback = true;
 +
 +	//	Initialize flags for FFT library
 +	std::auto_ptr< clfftSetupData > setupData( new clfftSetupData );
 +	OPENCL_V_THROW( clfftInitSetupData( setupData.get( ) ),
 +		"clfftInitSetupData failed" );
 +
 +	try
 +	{
 +		// Declare the supported options.
 +		po::options_description desc( "clFFT client command line options" );
 +		desc.add_options()
 +			( "help,h",        "produces this help message" )
 +			( "gpu,g",         "Force selection of OpenCL GPU devices only" )
 +			( "cpu,c",         "Force selection of OpenCL CPU devices only" )
 +			( "all,a",         "Force selection of all OpenCL devices (default)" )
 +			( "outPlace,o",    "Out of place FFT transform (default: in place)" )
 +			( "double",		   "Double precision transform (default: single)" )
 +			( "inv",			"Backward transform (default: forward)" )
 +			( "dumpKernels,d", "FFT engine will dump generated OpenCL FFT kernels to disk (default: dump off)" )
 +			( "noprecall",		"Disable Precallback (default: precallback on)" )
 +			( "lenX,x",        po::value< size_t >( &lengths[ 0 ] )->default_value( 1024 ),   "Specify the length of the 1st dimension of a test array" )
 +			( "lenY,y",        po::value< size_t >( &lengths[ 1 ] )->default_value( 1 ),      "Specify the length of the 2nd dimension of a test array" )
 +			( "lenZ,z",        po::value< size_t >( &lengths[ 2 ] )->default_value( 1 ),      "Specify the length of the 3rd dimension of a test array" )
 +			( "isX",   po::value< size_t >( &iStrides[ 0 ] )->default_value( 1 ),						"Specify the input stride of the 1st dimension of a test array" )
 +			( "isY",   po::value< size_t >( &iStrides[ 1 ] )->default_value( 0 ),	"Specify the input stride of the 2nd dimension of a test array" )
 +			( "isZ",   po::value< size_t >( &iStrides[ 2 ] )->default_value( 0 ),	"Specify the input stride of the 3rd dimension of a test array" )
 +			( "iD", po::value< size_t >( &iStrides[ 3 ] )->default_value( 0 ), "input distance between subsequent sets of data when batch size > 1" )
 +			( "osX",   po::value< size_t >( &oStrides[ 0 ] )->default_value( 1 ),						"Specify the output stride of the 1st dimension of a test array" )
 +			( "osY",   po::value< size_t >( &oStrides[ 1 ] )->default_value( 0 ),	"Specify the output stride of the 2nd dimension of a test array" )
 +			( "osZ",   po::value< size_t >( &oStrides[ 2 ] )->default_value( 0 ),	"Specify the output stride of the 3rd dimension of a test array" )
 +			( "oD", po::value< size_t >( &oStrides[ 3 ] )->default_value( 0 ), "output distance between subsequent sets of data when batch size > 1" )
 +			( "batchSize,b",   po::value< size_t >( &batchSize )->default_value( 1 ), "If this value is greater than one, arrays will be used " )
 +			( "profile,p",     po::value< cl_uint >( &profile_count )->default_value( 1 ), "Time and report the kernel speed of the FFT (default: profiling off)" )
 +			( "inLayout",      po::value< clfftLayout >( &inLayout )->default_value( CLFFT_COMPLEX_INTERLEAVED ), "Layout of input data:\n1) interleaved\n2) planar\n3) hermitian interleaved\n4) hermitian planar\n5) real" )
 +			( "outLayout",     po::value< clfftLayout >( &outLayout )->default_value( CLFFT_COMPLEX_INTERLEAVED ), "Layout of input data:\n1) interleaved\n2) planar\n3) hermitian interleaved\n4) hermitian planar\n5) real" )
 +			;
 +
 +		po::variables_map vm;
 +		po::store( po::parse_command_line( argc, argv, desc ), vm );
 +		po::notify( vm );
 +
 +		if( vm.count( "help" ) )
 +		{
 +			std::cout << desc << std::endl;
 +			return 0;
 +		}
 +
 +		size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0)
 +			| ((vm.count( "cpu" ) > 0) ? 2 : 0)
 +			| ((vm.count( "all" ) > 0) ? 4 : 0);
 +		if ((mutex & (mutex-1)) != 0) {
 +			terr << _T("You have selected mutually-exclusive OpenCL device options:") << std::endl;
 +			if (vm.count ( "gpu" )  > 0) terr << _T("    gpu,g   Force selection of OpenCL GPU devices only" ) << std::endl;
 +			if (vm.count ( "cpu" )  > 0) terr << _T("    cpu,c   Force selection of OpenCL CPU devices only" ) << std::endl;
 +			if (vm.count ( "all" )  > 0) terr << _T("    all,a   Force selection of all OpenCL devices (default)" ) << std::endl;
 +			return 1;
 +		}
 +
 +		if( vm.count( "gpu" ) )
 +		{
 +			deviceType	= CL_DEVICE_TYPE_GPU;
 +		}
 +
 +		if( vm.count( "cpu" ) )
 +		{
 +			deviceType	= CL_DEVICE_TYPE_CPU;
 +		}
 +
 +		if( vm.count( "all" ) )
 +		{
 +			deviceType	= CL_DEVICE_TYPE_ALL;
 +		}
 +
 +		if( vm.count( "outPlace" ) )
 +		{
 +			place = CLFFT_OUTOFPLACE;
 +		}
 +
 +		if( vm.count( "double" ) )
 +		{
 +			precision = CLFFT_DOUBLE;
 +		}
 +
 +		if( vm.count( "inv" ) )
 +		{
 +			dir = CLFFT_BACKWARD;
 +		}
 +
 +		if( profile_count > 1 )
 +		{
 +			command_queue_flags |= CL_QUEUE_PROFILING_ENABLE;
 +		}
 +
 +		if( vm.count( "dumpKernels" ) )
 +		{
 +			setupData->debugFlags	|= CLFFT_DUMP_PROGRAMS;
 +		}
 +
 +		if( vm.count( "noprecall" ) )
 +		{
 +			hasPrecallback = false;
 +		}
 +
 +		int inL = (int)inLayout;
 +		int otL = (int)outLayout;
 +
 +		// input output layout support matrix
 +		int ioLayoutSupport[5][5] =		{
 +										{ 1, 1, 0, 0, 1 },
 +										{ 1, 1, 0, 0, 1 },
 +										{ 0, 0, 0, 0, 1 },
 +										{ 0, 0, 0, 0, 1 },
 +										{ 1, 1, 1, 1, 0 },
 +										};
 +
 +		if((inL < 1) || (inL > 5)) throw std::runtime_error( "Invalid Input layout format" );
 +		if((otL < 1) || (otL > 5)) throw std::runtime_error( "Invalid Output layout format" );
 +
 +		if(ioLayoutSupport[inL-1][otL-1] == 0) throw std::runtime_error( "Invalid combination of Input/Output layout formats" );
 +
 +		if( ((inL == 1) || (inL == 2)) && ((otL == 1) || (otL == 2)) ) // Complex-Complex cases
 +		{
 +			iStrides[1] = iStrides[1] ? iStrides[1] : lengths[0] * iStrides[0];
 +			iStrides[2] = iStrides[2] ? iStrides[2] : lengths[1] * iStrides[1];
 +			iStrides[3] = iStrides[3] ? iStrides[3] : lengths[2] * iStrides[2];
 +			
 +			if(place == CLFFT_INPLACE)
 +			{
 +				oStrides[0] = iStrides[0];
 +				oStrides[1] = iStrides[1];
 +				oStrides[2] = iStrides[2];
 +				oStrides[3] = iStrides[3];
 +			}
 +			else
 +			{
 +				oStrides[1] = oStrides[1] ? oStrides[1] : lengths[0] * oStrides[0];
 +				oStrides[2] = oStrides[2] ? oStrides[2] : lengths[1] * oStrides[1];
 +				oStrides[3] = oStrides[3] ? oStrides[3] : lengths[2] * oStrides[2];
 +			}
 +		}
 +		else // Real-Complex and Complex-Real cases
 +		{
 +			size_t *rst, *cst;
 +			size_t N = lengths[0];
 +			size_t Nt = 1 + lengths[0]/2;
 +			bool iflag = false;
 +			bool rcFull = (inL == 1) || (inL == 2) || (otL == 1) || (otL == 2);
 +
 +			if(inLayout == CLFFT_REAL) { iflag = true; rst = iStrides; }
 +			else { rst = oStrides; } // either in or out should be REAL
 +
 +			// Set either in or out strides whichever is real
 +			if(place == CLFFT_INPLACE)
 +			{
 +				if(rcFull)	{ rst[1] = rst[1] ? rst[1] :  N * 2 * rst[0]; }
 +				else		{ rst[1] = rst[1] ? rst[1] : Nt * 2 * rst[0]; }
 +
 +				rst[2] = rst[2] ? rst[2] : lengths[1] * rst[1];
 +				rst[3] = rst[3] ? rst[3] : lengths[2] * rst[2];
 +			}
 +			else
 +			{
 +				rst[1] = rst[1] ? rst[1] : lengths[0] * rst[0];
 +				rst[2] = rst[2] ? rst[2] : lengths[1] * rst[1];
 +				rst[3] = rst[3] ? rst[3] : lengths[2] * rst[2];
 +			}
 +
 +			// Set the remaining of in or out strides that is not real
 +			if(iflag) { cst = oStrides; }
 +			else	  { cst = iStrides; }
 +
 +			if(rcFull)	{ cst[1] = cst[1] ? cst[1] :  N * cst[0]; }
 +			else		{ cst[1] = cst[1] ? cst[1] : Nt * cst[0]; }
 +
 +			cst[2] = cst[2] ? cst[2] : lengths[1] * cst[1];
 +			cst[3] = cst[3] ? cst[3] : lengths[2] * cst[2];
 +		}
 +
 +		if( precision == CLFFT_SINGLE )
 +			transform<float>( lengths, iStrides, oStrides, batchSize, inLayout, outLayout, place, precision, dir, deviceType, deviceId, platformId, false, command_queue_flags, profile_count, setupData, hasPrecallback );
 +		else
 +			transform<double>( lengths, iStrides, oStrides, batchSize, inLayout, outLayout, place, precision, dir, deviceType, deviceId, platformId, false, command_queue_flags, profile_count, setupData, hasPrecallback );
 +	}
 +	catch( std::exception& e )
 +	{
 +		terr << _T( "clFFT error condition reported:" ) << std::endl << e.what() << std::endl;
 +		return 1;
 +	}
 +	return 0;
 +}
diff --cc src/library/generator.copy.cpp
index 6f4eda0,b5d89ef..68cb77b
--- a/src/library/generator.copy.cpp
+++ b/src/library/generator.copy.cpp
@@@ -274,10 -246,13 +274,10 @@@ namespace CopyGenerato
  					str += "__global "; str += rType; str += " *lwbOutIm2;\n\n";
  				}
  			}
 -
 -
 -
 +			
  			// Setup registers
  			str += "\t"; str += RegBaseType<PR>(2); str += " R;\n\n";
 -
--
++			
  			size_t NtRounded64 = DivRoundingUp<size_t>(Nt,64) * 64;
  
  			if(!general)
diff --cc src/library/generator.stockham.cpp
index 56b6d48,4161279..1f7d7cd
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@@ -727,7 -723,7 +727,7 @@@ namespace StockhamGenerato
  		void SweepRegs(	size_t flag, bool fwd, bool interleaved, size_t stride, size_t component,
  						double scale, bool frontTwiddle,
  						const std::string &bufferRe, const std::string &bufferIm, const std::string &offset,
--						size_t regC, size_t numB, size_t numPrev, std::string &passStr) const
++						size_t regC, size_t numB, size_t numPrev, std::string &passStr, bool isPrecallVector = false) const
  		{
  			assert( (flag == SR_READ )			||
  					(flag == SR_TWMUL)			||
@@@ -825,7 -821,150 +825,217 @@@
  				return;
  			}
  
 +			int hid;
++
+ 			// block to rearrange reads of adjacent memory locations together
+ 			if(linearRegs && (flag == SR_READ))
+ 			{
+ 				for(size_t r=0; r<radix; r++)
+ 				{
+ 					for(size_t i=0; i<numB; i++)
+ 					{
+ 						for(size_t c=cStart; c<cEnd; c++) // component loop: 0 - real, 1 - imaginary
+ 						{
+ 							std::string tail;
+ 							std::string regIndex;
++							std::string regIndexC;
+ 							regIndex = "(*R";
+ 							std::string buffer;
+ 
+ 							// Read real & imag at once
+ 							if(interleaved && (component == SR_COMP_BOTH))
+ 							{
+ 								assert(bufferRe.compare(bufferIm) == 0); // Make sure Real & Imag buffer strings are same for interleaved data
+ 								buffer = bufferRe;
+ 								RegBaseAndCountAndPos("", i*radix + r, regIndex); regIndex += ")";
+ 								tail = ";";
+ 							}
+ 							else
+ 							{
+ 								if(c == 0)
+ 								{
 -									RegBaseAndCountAndPos("", i*radix + r, regIndex); regIndex += ").x";
++									RegBaseAndCountAndPos("", i*radix + r, regIndex); 
++									
++									hid = (i * radix + r) / (numB * radix / 2);
++									if (fft_doPreCallback && c2r && component == SR_COMP_REAL && hid != 0)
++									{
++										regIndexC = regIndex; regIndexC += ").y";
++									}
++									
++									regIndex += ").x";
+ 									buffer = bufferRe;
+ 									tail = interleaved ? ".x;" : ";";
+ 								}
+ 								else
+ 								{
+ 									RegBaseAndCountAndPos("", i*radix + r, regIndex); regIndex += ").y";
+ 									buffer = bufferIm;
+ 									tail = interleaved ? ".y;" : ";";
+ 								}
+ 							}
+ 
++							//get offset 
++							std::string bufOffset;
++							bufOffset += offset; bufOffset += " + ( "; bufOffset += SztToStr(numPrev); bufOffset += " + ";
++							bufOffset += "me*"; bufOffset += SztToStr(numButterfly); bufOffset += " + ";
++							bufOffset += SztToStr(i); bufOffset += " + ";
++							bufOffset += SztToStr(r*length/radix); bufOffset += " )*";
++							bufOffset += SztToStr(stride);
++
++							//If precallback is set invoke callback function
++							//Invoke callback only once in Planar data layout (i.e.c==0)
++							if (fft_doPreCallback && c == 0 && component == SR_COMP_BOTH)
++							{
++								passStr += "\n\t";
++								passStr += "retPrecallback = "; passStr += fft_preCallback.funcname; passStr += "("; 
++								if(interleaved)
++								{
++									passStr += buffer; passStr += ", ";
++								}
++								else
++								{
++									passStr += bufferRe; passStr += ", "; passStr += bufferIm; passStr += ", ";
++								}
++								passStr += bufOffset; passStr += ", userdata";
++								if (fft_preCallback.localMemSize > 0)
++								{
++									passStr += ", localmem";
++								}
++								passStr += ");";
++							}
++
++							if (fft_doPreCallback && c2r && component == SR_COMP_REAL && hid != 0)
++							{
++								passStr += "\n\t";
++								passStr += regIndexC; passStr += " = "; passStr += regIndex; passStr += ";";
++							}
+ 
+ 							passStr += "\n\t";
+ 							passStr += regIndex;
 -							passStr += " = "; passStr += buffer;
 -							passStr += "["; passStr += offset; passStr += " + ( "; passStr += SztToStr(numPrev); passStr += " + ";
 -							passStr += "me*"; passStr += SztToStr(numButterfly); passStr += " + ";
 -							passStr += SztToStr(i); passStr += " + ";
 -							passStr += SztToStr(r*length/radix); passStr += " )*";
 -							passStr += SztToStr(stride); passStr += "]"; passStr += tail;
 -
 -								// Since we read real & imag at once, we break the loop
++							passStr += " = ";
++
++							//Use the return value from precallback if set
++							if (fft_doPreCallback && (component == SR_COMP_BOTH || r2c))
++							{
++								if (component == SR_COMP_BOTH)
++								{
++									passStr += "retPrecallback"; 
++									passStr += interleaved ? tail : (c == 0) ? ".x;" : ".y;";
++								}
++								else if (r2c)
++								{
++									passStr += fft_preCallback.funcname; passStr += "("; passStr += buffer; passStr += ", ";
++									passStr += bufOffset; passStr += ", userdata";
++
++									if (fft_preCallback.localMemSize > 0)
++									{
++										passStr += ", localmem";
++									}
++									passStr += ");";
++								}
++							}
++							else
++							{
++								passStr += buffer;
++								passStr += "["; passStr += bufOffset; passStr += "]"; passStr += tail;
++							}
++
++							// Since we read real & imag at once, we break the loop
+ 							if(interleaved && (component == SR_COMP_BOTH) )
+ 								break;
+ 						}
+ 					}
+ 				}
+ 				return;
+ 			}
+ 
+ 			// block to rearrange writes of adjacent memory locations together
+ 			if(linearRegs && (flag == SR_WRITE) && (nextPass == NULL))
+ 			{
+ 				for(size_t r=0; r<radix; r++)
+ 				{
+ 					butterflyIndex = numPrev;
+ 
+ 					for(size_t i=0; i<numB; i++)
+ 					{
+ 						if(realSpecial && (nextPass == NULL) && (r > (radix/2)))
+ 							break;
+ 
+ 						if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i != 0))
+ 							break;
+ 
+ 						if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i == 0))
+ 							passStr += "\n\t}\n\tif( rw && !me)\n\t{";
+ 
+ 						for(size_t c=cStart; c<cEnd; c++) // component loop: 0 - real, 1 - imaginary
+ 						{
+ 							std::string tail;
+ 							std::string regIndex;
+ 							regIndex = "(*R";
+ 							std::string buffer;
+ 
+ 							// Write real & imag at once
+ 							if(interleaved && (component == SR_COMP_BOTH))
+ 							{
+ 								assert(bufferRe.compare(bufferIm) == 0); // Make sure Real & Imag buffer strings are same for interleaved data
+ 								buffer = bufferRe;
+ 								RegBaseAndCountAndPos("", i*radix + r, regIndex); regIndex += ")";
+ 								tail = "";
+ 							}
+ 							else
+ 							{
+ 								if(c == 0)
+ 								{
+ 									RegBaseAndCountAndPos("", i*radix + r, regIndex); regIndex += ").x";
+ 									buffer = bufferRe;
+ 									tail = interleaved ? ".x" : "";
+ 								}
+ 								else
+ 								{
+ 									RegBaseAndCountAndPos("", i*radix + r, regIndex); regIndex += ").y";
+ 									buffer = bufferIm;
+ 									tail = interleaved ? ".y" : "";
+ 								}
+ 							}
+ 
+ 							passStr += "\n\t";
+ 							passStr += buffer; passStr += "["; passStr += offset; passStr += " + ( ";
+ 
+ 							if( (numButterfly * workGroupSize) > algLS )
+ 							{
+ 								passStr += "(("; passStr += SztToStr(numButterfly);
+ 								passStr += "*me + "; passStr += SztToStr(butterflyIndex); passStr += ")/";
+ 								passStr += SztToStr(algLS); passStr += ")*"; passStr += SztToStr(algL); passStr += " + (";
+ 								passStr += SztToStr(numButterfly); passStr += "*me + "; passStr += SztToStr(butterflyIndex);
+ 								passStr += ")%"; passStr += SztToStr(algLS); passStr += " + ";
+ 							}
+ 							else
+ 							{
+ 								passStr += SztToStr(numButterfly); passStr += "*me + "; passStr += SztToStr(butterflyIndex);
+ 								passStr += " + ";
+ 							}
+ 
+ 							passStr += SztToStr(r*algLS); passStr += " )*"; passStr += SztToStr(stride); passStr += "]";
+ 							passStr += tail; passStr += " = "; passStr += regIndex;
+ 							if(scale != 1.0f) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix<PR>(); }
+ 							passStr += ";";
+ 
+ 							// Since we write real & imag at once, we break the loop
+ 							if(interleaved && (component == SR_COMP_BOTH))
+ 								break;
+ 						}
+ 
+ 						if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i == 0))
+ 							passStr += "\n\t}\n\tif(rw)\n\t{";
+ 
+ 						butterflyIndex++;
+ 					}
+ 				}
+ 
+ 				return;
+ 			}
 -
 -
++			
++			
  			for(size_t i=0; i<numB; i++)
  			{
  				std::string regBaseCount = regBase;
@@@ -891,71 -1019,14 +1101,83 @@@
  									regIndexSub += SztToStr(v);
  								}
  
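 +								//Build the buffer offset expression once; it is reused by the precallback call and by the direct buffer load below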
 +								std::string bufOffset;
 +								bufOffset += offset; bufOffset += " + ( "; bufOffset += SztToStr(numPrev); bufOffset += " + ";
 +								bufOffset += "me*"; bufOffset += SztToStr(numButterfly); bufOffset += " + ";
 +								bufOffset += SztToStr(i*regC + v); bufOffset += " + ";
 +								bufOffset += SztToStr(r*length/radix); bufOffset += " )*";
 +								bufOffset += SztToStr(stride);
 +
 +								//If precallback is set, invoke the callback function
 +								//Invoke the callback only once in planar data layout (i.e. when c == 0)
 +								if (fft_doPreCallback && c == 0 && component == SR_COMP_BOTH)
 +								{
 +									passStr += "\n\t";
- 									passStr += "retPrecallback["; passStr += SztToStr(v); passStr += "] = "; passStr += fft_preCallback.funcname; passStr += "("; 
++									passStr += "retPrecallback"; 
++									
++									if (isPrecallVector)
++									{
++										passStr += "["; passStr += SztToStr(v); passStr += "]"; 
++									}
++
++									passStr += " = "; passStr += fft_preCallback.funcname; passStr += "("; 
 +									if(interleaved)
 +									{
 +										passStr += buffer; passStr += ", ";
 +									}
 +									else
 +									{
 +										passStr += bufferRe; passStr += ", "; passStr += bufferIm; passStr += ", ";
 +									}
 +									passStr += bufOffset; passStr += ", userdata";
 +									if (fft_preCallback.localMemSize > 0)
 +									{
 +										passStr += ", localmem";
 +									}
 +									passStr += ");";
 +								}
 +
 +								if (fft_doPreCallback && c2r && component == SR_COMP_REAL && hid != 0)
 +								{
 +									passStr += "\n\t";
 +									passStr += regIndexC; passStr += " = "; passStr += regIndexSub; passStr += ";";
 +								}
 +
  								passStr += "\n\t";
  								passStr += regIndexSub;
 -								passStr += " = "; passStr += buffer;
 -								passStr += "["; passStr += offset; passStr += " + ( "; passStr += SztToStr(numPrev); passStr += " + ";
 -								passStr += "me*"; passStr += SztToStr(numButterfly); passStr += " + ";
 -								passStr += SztToStr(i*regC + v); passStr += " + ";
 -								passStr += SztToStr(r*length/radix); passStr += " )*";
 -								passStr += SztToStr(stride); passStr += "]"; passStr += tail;
 +								passStr += " = "; 
 +
 +								//Use the return value from precallback if set
 +								if (fft_doPreCallback && (component == SR_COMP_BOTH || r2c))
 +								{
 +									if (component == SR_COMP_BOTH)
 +									{
- 										passStr += "retPrecallback["; passStr += SztToStr(v); passStr += "]"; 
++										passStr += "retPrecallback"; 
++									
++										if (isPrecallVector)
++										{
++											passStr += "["; passStr += SztToStr(v); passStr += "]"; 
++										} 
 +										passStr += interleaved ? tail : (c == 0) ? ".x;" : ".y;";
 +									}
 +									else if (r2c)
 +									{
 +										passStr += fft_preCallback.funcname; passStr += "("; passStr += buffer; passStr += ", ";
 +										passStr += bufOffset; passStr += ", userdata";
 +
 +										if (fft_preCallback.localMemSize > 0)
 +										{
 +											passStr += ", localmem";
 +										}
 +										passStr += ");";
 +									}
 +								}
 +								else
 +								{
 +									passStr += buffer;
 +									passStr += "["; passStr += bufOffset; passStr += "]"; passStr += tail;
 +								}
  							}
  
  							// Since we read real & imag at once, we break the loop
@@@ -2052,17 -2037,10 +2295,26 @@@
  			{
  				if( (!halfLds) || (halfLds && (position == 0)) )
  				{
++					bool isPrecallVector = false;
 +					//If precallback is set, emit the declaration of retPrecallback, which receives the callback's return value(s)
 +					if (fft_doPreCallback)
 +					{
- 						passStr += "\n\t"; passStr += regB2Type; passStr += " retPrecallback["; 
- 						passStr += (numB4 > 0) ? "4" : (numB2 > 0) ? "2" : "1"; 
- 						passStr += "];";
++						passStr += "\n\t"; passStr += regB2Type; passStr += " retPrecallback"; 
++						
++						if (numB4 > 0 || numB2 > 0)
++						{
++							passStr += "["; 
++							passStr += (numB4 > 0) ? "4" : (numB2 > 0) ? "2" : "1"; 
++							passStr += "]";
++
++							isPrecallVector = true;
++						}
++						passStr += ";";
 +					}
  					passStr += "\n\tif(rw)\n\t{";
--					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
--					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 2, numB2, numB1, passStr);
--					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 4, numB4, 2*numB2 + numB1, passStr);
++					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr, isPrecallVector);
++					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 2, numB2, numB1, passStr, isPrecallVector);
++					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 4, numB4, 2*numB2 + numB1, passStr, isPrecallVector);
  					passStr += "\n\t}\n";
  				}
  			}
diff --cc src/tests/accuracy_test_precallback.cpp
index f2a9a9c,0000000..a09d2f6
mode 100644,000000..100644
--- a/src/tests/accuracy_test_precallback.cpp
+++ b/src/tests/accuracy_test_precallback.cpp
@@@ -1,1515 -1,0 +1,1581 @@@
 +/* ************************************************************************
 + * Copyright 2015 Advanced Micro Devices, Inc.
 + *
 + * Licensed under the Apache License, Version 2.0 (the "License");
 + * you may not use this file except in compliance with the License.
 + * You may obtain a copy of the License at
 + *
 + * http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + * ************************************************************************/
 +
 +#include <gtest/gtest.h>
 +#include<math.h>
 +
 +#include "test_constants.h"
 +#include "fftw_transform.h"
 +#include "cl_transform.h"
 +#include "typedefs.h"
 +#include "accuracy_test_common.h"
 +#include <stdexcept>
 +#include <vector>
 +
 +/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
 +/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
 +class accuracy_test_precallback_single : public ::testing::Test {
 +protected:
 +	accuracy_test_precallback_single(){}
 +	virtual ~accuracy_test_precallback_single(){}
 +	virtual void SetUp(){}
 +	virtual void TearDown(){
 +	}
 +};
 +
 +class accuracy_test_precallback_double : public ::testing::Test {
 +protected:
 +	accuracy_test_precallback_double(){}
 +	virtual ~accuracy_test_precallback_double(){}
 +	virtual void SetUp(){}
 +	virtual void TearDown(){
 +	}
 +};
 +
 +class mixed_radix_precallback : public ::testing::TestWithParam<size_t> {
 +	protected:
 +		mixed_radix_precallback(){}
 +		virtual ~mixed_radix_precallback(){}
 +		virtual void SetUp(){}
 +		virtual void TearDown(){}
 +};
 +
 +class Supported_Fft_Sizes_precallback
 +{
 +public:
 +	std::vector<size_t> sizes;
 +	const size_t max_mixed_radices_to_test;
 +
 +	Supported_Fft_Sizes_precallback()
 +	: max_mixed_radices_to_test( 4096 )
 +	{
 +		size_t i=0, j=0, k=0;
 +		size_t sum, sumi, sumj, sumk;
 +
 +		sumi = 1; i = 0;
 +		while(1)
 +		{
 +			sumj = 1; j = 0;
 +			while(1)
 +			{
 +				sumk = 1; k = 0;
 +				while(1)
 +				{
 +					sum = (sumi*sumj*sumk);
 +					if( sum > max_mixed_radices_to_test ) break;
 +
 +					sizes.push_back(sum);
 +					k++;
 +					sumk *= 2;
 +				}
 +
 +				if(k == 0) break;
 +				j++;
 +				sumj *= 3;
 +			}
 +
 +			if( (j == 0) && (k == 0) ) break;
 +			i++;
 +			sumi *= 5;
 +		}
 +	}
 +} supported_sizes;
 +
 +INSTANTIATE_TEST_CASE_P(
 +	mixed_radices_precallback,
 +	mixed_radix_precallback,
 +	::testing::ValuesIn( supported_sizes.sizes )
 +);
 +
 +namespace precallback
 +{
 +
++/**********************************************************************************************
++**********************Mixed-Radix Real To Hermitian and Complex To Real**********************
++**********************************************************************************************/
++#pragma region Complex_To_Real
++
 +template< typename T, typename cl_T, typename fftw_T >
 +void mixed_radix_real_to_hermitian( size_t problem_size )
 +{
 +	try
 +	{
 +		if(verbose) std::cout << "Now testing problem size " << problem_size << std::endl;
 +
 +		std::vector<size_t> lengths;
 +		lengths.push_back( problem_size );
 +		size_t batch = 1;
 +
 +		std::vector<size_t> input_strides;
 +		std::vector<size_t> output_strides;
 +
 +		size_t input_distance = 0;
 +		size_t output_distance = 0;
 +
 +		layout::buffer_layout_t layout = layout::hermitian_interleaved;
 +
 +		placeness::placeness_t placeness = placeness::in_place;
 +
 +		data_pattern pattern = sawtooth;
 +		precallback_real_to_complex<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
 +	}
 +	catch( const std::exception& err ) {
 +		handle_exception(err);
 +	}
 +}
 +
 +TEST_P( mixed_radix_precallback, single_precision_real_to_hermitian_auto_generated ) {
 +	size_t problem_size = GetParam();
 +	RecordProperty("problem_size", (int)problem_size);
 +	mixed_radix_real_to_hermitian<float, cl_float, fftwf_complex>(problem_size);
 +}
 +
 +TEST_P( mixed_radix_precallback, double_precision_real_to_hermitian_auto_generated ) {
 +	size_t problem_size = GetParam();
 +	RecordProperty("problem_size", (int)problem_size);
 +	mixed_radix_real_to_hermitian<double, cl_double, fftw_complex>(problem_size);
 +}
 +
 +template< class T, class cl_T, class fftw_T >
 +void pow2_large_1D_in_place_hermitian_interleaved_to_real()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( large2 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t layout = layout::hermitian_interleaved;
 +	placeness::placeness_t placeness = placeness::in_place;
 +
 +	data_pattern pattern = sawtooth;
 +	precallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_large_1D_in_place_hermitian_interleaved_to_real)
 +{
 +	try { pow2_large_1D_in_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_large_1D_in_place_hermitian_interleaved_to_real)
 +{
 +	try { pow2_large_1D_in_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +template< class T, class cl_T, class fftw_T >
 +void pow2_large_1D_out_of_place_hermitian_planar_to_real()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( large2 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t layout = layout::hermitian_planar;
 +	placeness::placeness_t placeness = placeness::out_of_place;
 +
 +	data_pattern pattern = sawtooth;
 +	precallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_large_1D_out_of_place_hermitian_planar_to_real)
 +{
 +	try { pow2_large_1D_out_of_place_hermitian_planar_to_real< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_large_1D_out_of_place_hermitian_planar_to_real)
 +{
 +	try { pow2_large_1D_out_of_place_hermitian_planar_to_real< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
++#pragma endregion
++
++/**********************************************************************************************
++**************************************Complex To Complex***************************************
++**********************************************************************************************/
++#pragma region Complex_To_Complex
++
++template< typename T, typename cl_T, typename fftw_T >
++void mixed_radix_complex_to_complex( size_t problem_size )
++{
++	try
++	{
++		if(verbose) std::cout << "Now testing problem size " << problem_size << std::endl;
++
++		std::vector<size_t> lengths;
++		lengths.push_back( problem_size );
++		size_t batch = 1;
++
++		std::vector<size_t> input_strides;
++		std::vector<size_t> output_strides;
++
++		size_t input_distance = 0;
++		size_t output_distance = 0;
++
++		layout::buffer_layout_t in_layout = layout::complex_planar;
++		layout::buffer_layout_t out_layout = layout::complex_planar;
++
++		placeness::placeness_t placeness = placeness::in_place;
++
++		direction::direction_t direction = direction::forward;
++
++		data_pattern pattern = sawtooth;
++		precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
++	}
++	catch( const std::exception& err ) {
++		handle_exception(err);
++	}
++}
++
++TEST_P( mixed_radix_precallback, single_precision_complex_to_complex_auto_generated ) {
++	size_t problem_size = GetParam();
++	RecordProperty("problem_size", (int)problem_size);
++	mixed_radix_complex_to_complex<float, cl_float, fftwf_complex>(problem_size);
++}
++
++TEST_P( mixed_radix_precallback, double_precision_complex_to_complex_auto_generated ) {
++	size_t problem_size = GetParam();
++	RecordProperty("problem_size", (int)problem_size);
++	mixed_radix_complex_to_complex<double, cl_double, fftw_complex>(problem_size);
++}
++
++
 +// *****************************************************
 +// *****************************************************
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( normal2 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t in_layout = layout::complex_interleaved;
 +	layout::buffer_layout_t out_layout = layout::complex_interleaved;
 +	placeness::placeness_t placeness = placeness::in_place;
 +	direction::direction_t direction = direction::forward;
 +
 +	data_pattern pattern = impulse;
 +	precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
 +{
 +	try { pow2_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
 +{
 +	try { pow2_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +// *****************************************************
 +// *****************************************************
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_1D_forward_in_place_complex_planar_to_complex_planar()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( normal2 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t in_layout = layout::complex_planar;
 +	layout::buffer_layout_t out_layout = layout::complex_planar;
 +	placeness::placeness_t placeness = placeness::in_place;
 +	direction::direction_t direction = direction::forward;
 +
 +	data_pattern pattern = impulse;
 +	precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_normal_1D_forward_in_place_complex_planar_to_complex_planar)
 +{
 +	try { pow2_normal_1D_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_normal_1D_forward_in_place_complex_planar_to_complex_planar)
 +{
 +	try { pow2_normal_1D_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +template< class T, class cl_T, class fftw_T >
 +void pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( large2 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t in_layout = layout::complex_interleaved;
 +	layout::buffer_layout_t out_layout = layout::complex_interleaved;
 +	placeness::placeness_t placeness = placeness::in_place;
 +	direction::direction_t direction = direction::forward;
 +
 +	data_pattern pattern = impulse;
 +	precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
 +{
 +	try { pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
 +{
 +	try { pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( normal2 );
 +	lengths.push_back( normal2 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t in_layout = layout::complex_interleaved;
 +	layout::buffer_layout_t out_layout = layout::complex_interleaved;
 +	placeness::placeness_t placeness = placeness::in_place;
 +	direction::direction_t direction = direction::forward;
 +
 +	data_pattern pattern = sawtooth;
 +	precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved)
 +{
 +	try { pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved)
 +{
 +	try { pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +// *****************************************************
 +// *****************************************************
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_1D_forward_in_place_complex_to_complex_userdatatype()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( normal2 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t in_layout = layout::complex_interleaved;
 +	layout::buffer_layout_t out_layout = layout::complex_interleaved;
 +	placeness::placeness_t placeness = placeness::in_place;
 +	direction::direction_t direction = direction::forward;
 +
 +	data_pattern pattern = impulse;
 +	precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness, true );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_normal_1D_forward_in_place_complex_to_complex_userdatatype)
 +{
 +	try { pow2_normal_1D_forward_in_place_complex_to_complex_userdatatype< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
++#pragma endregion
++
++/**********************************************************************************************
++**************************************Real To Complex***************************************
++**********************************************************************************************/
++#pragma region Real_To_Complex
++
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_1D_forward_in_place_real_to_hermitian_interleaved()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( normal2 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t placeness = placeness::in_place;
 +
 +	data_pattern pattern = sawtooth;
 +	precallback_real_to_complex<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, out_layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_normal_1D_forward_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_normal_1D_forward_in_place_real_to_hermitian_interleaved< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_normal_1D_forward_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_normal_1D_forward_in_place_real_to_hermitian_interleaved< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +template< class T, class cl_T, class fftw_T >
 +void pow2_small_1D_forward_in_place_real_to_hermitian_interleaved()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( small2 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t placeness = placeness::in_place;
 +
 +	data_pattern pattern = sawtooth;
 +	precallback_real_to_complex<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, out_layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_small_1D_forward_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_small_1D_forward_in_place_real_to_hermitian_interleaved< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_small_1D_forward_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_small_1D_forward_in_place_real_to_hermitian_interleaved< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +template< class T, class cl_T, class fftw_T >
 +void pow2_large_1D_forward_in_place_real_to_hermitian_interleaved()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( large2 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t placeness = placeness::in_place;
 +
 +	data_pattern pattern = sawtooth;
 +	precallback_real_to_complex<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, out_layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_large_1D_forward_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_large_1D_forward_in_place_real_to_hermitian_interleaved< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_large_1D_forward_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_large_1D_forward_in_place_real_to_hermitian_interleaved< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +template< class T, class cl_T, class fftw_T >
 +void pow2_large_1D_4M_in_place_real_to_hermitian_interleaved()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( 4194304 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t layout = layout::hermitian_interleaved;
 +	placeness::placeness_t placeness = placeness::in_place;
 +
 +	data_pattern pattern = sawtooth;
 +	precallback_real_to_complex<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_large_1D_4M_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_large_1D_4M_in_place_real_to_hermitian_interleaved< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_large_1D_4M_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_large_1D_4M_in_place_real_to_hermitian_interleaved< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_1D_array_real_to_hermitian()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( normal2 );
 +	size_t batch = 8;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t layout = layout::hermitian_interleaved;
 +	placeness::placeness_t placeness = placeness::in_place;
 +
 +	data_pattern pattern = sawtooth;
 +	precallback_real_to_complex<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_normal_1D_array_real_to_hermitian)
 +{
 +	try { pow2_normal_1D_array_real_to_hermitian< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_normal_1D_array_real_to_hermitian)
 +{
 +	try { pow2_normal_1D_array_real_to_hermitian< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_1D_array_real_to_hermitian_with_odd_batch_size()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( normal2 );
 +	size_t batch = 5;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t layout = layout::hermitian_interleaved;
 +	placeness::placeness_t placeness = placeness::in_place;
 +
 +	data_pattern pattern = sawtooth;
 +	precallback_real_to_complex<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_normal_1D_array_real_to_hermitian_with_odd_batch_size)
 +{
 +	try { pow2_normal_1D_array_real_to_hermitian_with_odd_batch_size< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_normal_1D_array_real_to_hermitian_with_odd_batch_size)
 +{
 +	try { pow2_normal_1D_array_real_to_hermitian_with_odd_batch_size< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_1D_out_of_place_real_to_hermitian_interleaved()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( normal2 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t layout = layout::hermitian_interleaved;
 +	placeness::placeness_t placeness = placeness::out_of_place;
 +
 +	data_pattern pattern = sawtooth;
 +	precallback_real_to_complex<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_normal_1D_out_of_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_normal_1D_out_of_place_real_to_hermitian_interleaved< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_normal_1D_out_of_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_normal_1D_out_of_place_real_to_hermitian_interleaved< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_1D_out_of_place_real_to_hermitian_planar()
 +{
 +	std::vector<size_t> lengths;
 +	lengths.push_back( normal2 );
 +	size_t batch = 1;
 +	std::vector<size_t> input_strides;
 +	std::vector<size_t> output_strides;
 +	size_t input_distance = 0;
 +	size_t output_distance = 0;
 +	layout::buffer_layout_t layout = layout::hermitian_planar;
 +	placeness::placeness_t placeness = placeness::out_of_place;
 +
 +	data_pattern pattern = sawtooth;
 +	precallback_real_to_complex<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
 +}
 +
 +TEST_F(accuracy_test_precallback_single, pow2_normal_1D_out_of_place_real_to_hermitian_planar)
 +{
 +	try { pow2_normal_1D_out_of_place_real_to_hermitian_planar< float, cl_float, fftwf_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +TEST_F(accuracy_test_precallback_double, pow2_normal_1D_out_of_place_real_to_hermitian_planar)
 +{
 +	try { pow2_normal_1D_out_of_place_real_to_hermitian_planar< double, cl_double, fftw_complex >(); }
 +	catch( const std::exception& err ) { handle_exception(err);	}
 +}
 +
 +// Precallback real-to-hermitian case: 1D length normal3, batch 1, hermitian_interleaved, in_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow3_normal_1D_in_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 1, normal3 );
 +	size_t transforms = 1;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::in_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow3_normal_1D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow3_normal_1D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow3_normal_1D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow3_normal_1D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length small3, batch 1, hermitian_interleaved, in_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow3_small_1D_in_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 1, small3 );
 +	size_t transforms = 1;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::in_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow3_small_1D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow3_small_1D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow3_small_1D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow3_small_1D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length large3, batch 1, hermitian_interleaved, in_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow3_large_1D_in_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 1, large3 );
 +	size_t transforms = 1;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::in_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow3_large_1D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow3_large_1D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow3_large_1D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow3_large_1D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length normal3, batch 1, hermitian_interleaved, out_of_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow3_normal_1D_out_of_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 1, normal3 );
 +	size_t transforms = 1;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::out_of_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow3_normal_1D_out_of_place_real_to_hermitian_interleaved)
 +{
 +	try { pow3_normal_1D_out_of_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow3_normal_1D_out_of_place_real_to_hermitian_interleaved)
 +{
 +	try { pow3_normal_1D_out_of_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length normal3 with a batch of 5, hermitian_interleaved, in_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow3_normal_1D_array_real_to_hermitian_with_odd_batch_size()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 1, normal3 );
 +	size_t transforms = 5;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::in_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow3_normal_1D_array_real_to_hermitian_with_odd_batch_size)
 +{
 +	try { pow3_normal_1D_array_real_to_hermitian_with_odd_batch_size<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow3_normal_1D_array_real_to_hermitian_with_odd_batch_size)
 +{
 +	try { pow3_normal_1D_array_real_to_hermitian_with_odd_batch_size<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length 9, batch 2, hermitian_interleaved, in_place, impulse pattern,
 +// with input and output using different non-unit strides and distances.
 +template< class T, class cl_T, class fftw_T >
 +void pow3_very_small_1D_in_place_different_input_output_non_unit_stride_and_distance_real_to_complex()
 +{
 +	data_pattern signal = impulse;
 +	size_t transforms = 2;
 +	std::vector<size_t> dims( 1, 9 );
 +
 +	// Input: stride 16, distance = last length * last stride + 128 (9 * 16 + 128 = 272).
 +	std::vector<size_t> strides_in( 1, 16 );
 +	size_t dist_in = dims.back() * strides_in.back() + 128;
 +
 +	// Output: stride 2, distance = last length * last stride + 2 (9 * 2 + 2 = 20).
 +	std::vector<size_t> strides_out( 1, 2 );
 +	size_t dist_out = dims.back() * strides_out.back() + 2;
 +
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::in_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow3_very_small_1D_in_place_different_input_output_non_unit_stride_and_distance_real_to_complex)
 +{
 +	try { pow3_very_small_1D_in_place_different_input_output_non_unit_stride_and_distance_real_to_complex<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow3_very_small_1D_in_place_different_input_output_non_unit_stride_and_distance_real_to_complex)
 +{
 +	try { pow3_very_small_1D_in_place_different_input_output_non_unit_stride_and_distance_real_to_complex<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length 9, batch 2, hermitian_interleaved, out_of_place, impulse pattern,
 +// with the same non-unit stride and distance used for input and output.
 +template< class T, class cl_T, class fftw_T >
 +void pow3_very_small_1D_non_unit_stride_and_distance_real_to_complex()
 +{
 +	data_pattern signal = impulse;
 +	size_t transforms = 2;
 +	std::vector<size_t> dims( 1, 9 );
 +
 +	// Input: stride 2, distance = last length * last stride + 2 (9 * 2 + 2 = 20).
 +	std::vector<size_t> strides_in( 1, 2 );
 +	size_t dist_in = dims.back() * strides_in.back() + 2;
 +
 +	// Output mirrors the input stride and distance.
 +	std::vector<size_t> strides_out = strides_in;
 +	size_t dist_out = dist_in;
 +
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::out_of_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow3_very_small_1D_non_unit_stride_and_distance_real_to_complex)
 +{
 +	try { pow3_very_small_1D_non_unit_stride_and_distance_real_to_complex<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow3_very_small_1D_non_unit_stride_and_distance_real_to_complex)
 +{
 +	try { pow3_very_small_1D_non_unit_stride_and_distance_real_to_complex<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length 9, batch 2, hermitian_interleaved, out_of_place, impulse pattern,
 +// with input and output using different non-unit strides and distances.
 +template< class T, class cl_T, class fftw_T >
 +void pow3_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex()
 +{
 +	data_pattern signal = impulse;
 +	size_t transforms = 2;
 +	std::vector<size_t> dims( 1, 9 );
 +
 +	// Input: stride 16, distance = last length * last stride + 128 (9 * 16 + 128 = 272).
 +	std::vector<size_t> strides_in( 1, 16 );
 +	size_t dist_in = dims.back() * strides_in.back() + 128;
 +
 +	// Output: stride 2, distance = last length * last stride + 2 (9 * 2 + 2 = 20).
 +	std::vector<size_t> strides_out( 1, 2 );
 +	size_t dist_out = dims.back() * strides_out.back() + 2;
 +
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::out_of_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow3_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex)
 +{
 +	try { pow3_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow3_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex)
 +{
 +	try { pow3_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length normal3, batch 1, hermitian_planar, out_of_place, impulse pattern.
 +// Unlike the neighbouring cases, a trailing 42.0f argument is passed to the harness
 +// (presumably the user-defined scale named in the test; confirm against precallback_real_to_complex).
 +template< class T, class cl_T, class fftw_T >
 +void pow3_normal_1D_user_defined_scale_real_to_hermitian()
 +{
 +	data_pattern signal = impulse;
 +	std::vector<size_t> dims( 1, normal3 );
 +	size_t transforms = 1;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_planar;
 +	placeness::placeness_t where = placeness::out_of_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where, 42.0f );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow3_normal_1D_user_defined_scale_real_to_hermitian)
 +{
 +	try { pow3_normal_1D_user_defined_scale_real_to_hermitian<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow3_normal_1D_user_defined_scale_real_to_hermitian)
 +{
 +	try { pow3_normal_1D_user_defined_scale_real_to_hermitian<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Round-trip precallback case: hands the impulse pattern, a 1D length of normal3 and a batch of 1
 +// to precallback_real_to_complex_round_trip.
 +template< class T, class cl_T, class fftw_T >
 +void pow3_normal_1D_round_trip_real_to_complex()
 +{
 +	data_pattern signal = impulse;
 +	std::vector<size_t> dims( 1, normal3 );
 +	size_t transforms = 1;
 +
 +	precallback_real_to_complex_round_trip<T, cl_T, fftw_T>( signal, dims, transforms );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow3_normal_1D_round_trip_real_to_complex)
 +{
 +	try { pow3_normal_1D_round_trip_real_to_complex<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow3_normal_1D_round_trip_real_to_complex)
 +{
 +	try { pow3_normal_1D_round_trip_real_to_complex<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Round-trip precallback case: hands the impulse pattern, a 1D length of large3 and a batch of 1
 +// to precallback_real_to_complex_round_trip.
 +template< class T, class cl_T, class fftw_T >
 +void pow3_large_1D_round_trip_real_to_complex()
 +{
 +	data_pattern signal = impulse;
 +	std::vector<size_t> dims( 1, large3 );
 +	size_t transforms = 1;
 +
 +	precallback_real_to_complex_round_trip<T, cl_T, fftw_T>( signal, dims, transforms );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow3_large_1D_round_trip_real_to_complex)
 +{
 +	try { pow3_large_1D_round_trip_real_to_complex<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow3_large_1D_round_trip_real_to_complex)
 +{
 +	try { pow3_large_1D_round_trip_real_to_complex<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 2D lengths normal2 x normal2, batch 1, hermitian_interleaved, in_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_2D_in_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 2, normal2 );
 +	size_t transforms = 1;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::in_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow2_normal_2D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_normal_2D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow2_normal_2D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_normal_2D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 2D lengths normal2 x normal2, batch 1, hermitian_interleaved, out_of_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_2D_out_of_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 2, normal2 );
 +	size_t transforms = 1;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::out_of_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow2_normal_2D_out_of_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_normal_2D_out_of_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow2_normal_2D_out_of_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_normal_2D_out_of_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 2D lengths MaxLength2D<T>(2) x normal2, batch 1, hermitian_planar, out_of_place,
 +// sawtooth pattern. Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow2_large_2D_out_of_place_real_to_hermitian_planar()
 +{
 +	data_pattern signal = sawtooth;
 +	size_t transforms = 1;
 +
 +	// First length comes from MaxLength2D for this precision; the second is normal2.
 +	std::vector<size_t> dims;
 +	dims.push_back( MaxLength2D<T>(2) );
 +	dims.push_back( normal2 );
 +
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_planar;
 +	placeness::placeness_t where = placeness::out_of_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow2_large_2D_out_of_place_real_to_hermitian_planar)
 +{
 +	try { pow2_large_2D_out_of_place_real_to_hermitian_planar<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow2_large_2D_out_of_place_real_to_hermitian_planar)
 +{
 +	try { pow2_large_2D_out_of_place_real_to_hermitian_planar<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 2D lengths small2 x small2, batch 1, hermitian_interleaved, in_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow2_small_2D_in_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 2, small2 );
 +	size_t transforms = 1;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::in_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow2_small_2D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_small_2D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow2_small_2D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow2_small_2D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 2D lengths 4 x 4, batch 2, hermitian_interleaved, out_of_place, sawtooth pattern,
 +// with non-unit strides and distances that differ between input and output.
 +template< class T, class cl_T, class fftw_T >
 +void pow2_small_2D_non_unit_stride_and_distance_real_to_hermitian()
 +{
 +	data_pattern signal = sawtooth;
 +	size_t transforms = 2;
 +	std::vector<size_t> dims( 2, 4 );
 +
 +	// Input strides: 5, then first length * first stride + 1 (4 * 5 + 1 = 21).
 +	std::vector<size_t> strides_in;
 +	strides_in.push_back( 5 );
 +	strides_in.push_back( dims.front() * strides_in.front() + 1 );
 +
 +	// Output strides: 2, then first length * first stride + 2 (4 * 2 + 2 = 10).
 +	std::vector<size_t> strides_out;
 +	strides_out.push_back( 2 );
 +	strides_out.push_back( dims.front() * strides_out.front() + 2 );
 +
 +	// Distances: last length * last stride plus a fixed pad (4 * 21 + 30 = 114 in, 4 * 10 + 42 = 82 out).
 +	size_t dist_in = dims.back() * strides_in.back() + 30;
 +	size_t dist_out = dims.back() * strides_out.back() + 42;
 +
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::out_of_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow2_small_2D_non_unit_stride_and_distance_real_to_hermitian)
 +{
 +	try { pow2_small_2D_non_unit_stride_and_distance_real_to_hermitian<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow2_small_2D_non_unit_stride_and_distance_real_to_hermitian)
 +{
 +	try { pow2_small_2D_non_unit_stride_and_distance_real_to_hermitian<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 2D lengths 30 x 10125, batch 1, hermitian_interleaved, in_place, sawtooth pattern,
 +// with explicit strides and distances on both sides.
 +template< class T, class cl_T, class fftw_T >
 +void mixed_another_targeted_real_to_hermitian_transform()
 +{
 +	data_pattern signal = sawtooth;
 +	size_t transforms = 1;
 +
 +	std::vector<size_t> dims;
 +	dims.push_back( 30 );
 +	dims.push_back( 10125 );
 +
 +	// Input: strides { 1, 32 }, distance 324000 (= 32 * 10125).
 +	std::vector<size_t> strides_in;
 +	strides_in.push_back( 1 );
 +	strides_in.push_back( 32 );
 +	size_t dist_in = 324000;
 +
 +	// Output: strides { 1, 16 }, distance 162000 (= 16 * 10125).
 +	std::vector<size_t> strides_out;
 +	strides_out.push_back( 1 );
 +	strides_out.push_back( 16 );
 +	size_t dist_out = 162000;
 +
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::in_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, mixed_another_targeted_real_to_hermitian_transform)
 +{
 +	try { mixed_another_targeted_real_to_hermitian_transform<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, mixed_another_targeted_real_to_hermitian_transform)
 +{
 +	try { mixed_another_targeted_real_to_hermitian_transform<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 2D lengths 15 x 4500, batch 2, hermitian_interleaved, in_place, sawtooth pattern,
 +// with explicit strides and distances on both sides.
 +template< class T, class cl_T, class fftw_T >
 +void mixed_larger_targeted_real_to_hermitian_transform()
 +{
 +	data_pattern signal = sawtooth;
 +	size_t transforms = 2;
 +
 +	std::vector<size_t> dims;
 +	dims.push_back( 15 );
 +	dims.push_back( 4500 );
 +
 +	// Input: strides { 1, 16 }, distance 72000 (= 16 * 4500).
 +	std::vector<size_t> strides_in;
 +	strides_in.push_back( 1 );
 +	strides_in.push_back( 16 );
 +	size_t dist_in = 72000;
 +
 +	// Output: strides { 1, 8 }, distance 36000 (= 8 * 4500).
 +	std::vector<size_t> strides_out;
 +	strides_out.push_back( 1 );
 +	strides_out.push_back( 8 );
 +	size_t dist_out = 36000;
 +
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::in_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, mixed_larger_targeted_real_to_hermitian_transform)
 +{
 +	try { mixed_larger_targeted_real_to_hermitian_transform<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, mixed_larger_targeted_real_to_hermitian_transform)
 +{
 +	try { mixed_larger_targeted_real_to_hermitian_transform<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length 375, batch 1, hermitian_planar, out_of_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void mixed_1D_length_375_real_to_hermitian()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 1, 375 );
 +	size_t transforms = 1;
 +
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +
 +	layout::buffer_layout_t out_layout = layout::hermitian_planar;
 +	placeness::placeness_t where = placeness::out_of_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, mixed_1D_length_375_real_to_hermitian)
 +{
 +	try { mixed_1D_length_375_real_to_hermitian<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, mixed_1D_length_375_real_to_hermitian)
 +{
 +	try { mixed_1D_length_375_real_to_hermitian<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length normal5, batch 1, hermitian_interleaved, in_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow5_normal_1D_in_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 1, normal5 );
 +	size_t transforms = 1;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::in_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow5_normal_1D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow5_normal_1D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow5_normal_1D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow5_normal_1D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length small5, batch 1, hermitian_interleaved, out_of_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow5_small_1D_out_of_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 1, small5 );
 +	size_t transforms = 1;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::out_of_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow5_small_1D_out_of_place_real_to_hermitian_interleaved)
 +{
 +	try { pow5_small_1D_out_of_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow5_small_1D_out_of_place_real_to_hermitian_interleaved)
 +{
 +	try { pow5_small_1D_out_of_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length large5, batch 1, hermitian_planar, out_of_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow5_large_1D_out_of_place_real_to_hermitian_planar()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 1, large5 );
 +	size_t transforms = 1;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_planar;
 +	placeness::placeness_t where = placeness::out_of_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow5_large_1D_out_of_place_real_to_hermitian_planar)
 +{
 +	try { pow5_large_1D_out_of_place_real_to_hermitian_planar<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow5_large_1D_out_of_place_real_to_hermitian_planar)
 +{
 +	try { pow5_large_1D_out_of_place_real_to_hermitian_planar<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 2D lengths normal5 x normal5, batch 1, hermitian_interleaved, in_place, sawtooth pattern.
 +// Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow5_normal_2D_in_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern signal = sawtooth;
 +	std::vector<size_t> dims( 2, normal5 );
 +	size_t transforms = 1;
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::in_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow5_normal_2D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow5_normal_2D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow5_normal_2D_in_place_real_to_hermitian_interleaved)
 +{
 +	try { pow5_normal_2D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 2D lengths MaxLength2D<T>(5) x normal5, batch 1, hermitian_planar, out_of_place,
 +// sawtooth pattern. Stride vectors are passed empty and both distances are passed as 0.
 +template< class T, class cl_T, class fftw_T >
 +void pow5_large_2D_out_of_place_real_to_hermitian_planar()
 +{
 +	data_pattern signal = sawtooth;
 +	size_t transforms = 1;
 +
 +	// First length comes from MaxLength2D for this precision; the second is normal5.
 +	std::vector<size_t> dims;
 +	dims.push_back( MaxLength2D<T>(5) );
 +	dims.push_back( normal5 );
 +
 +	std::vector<size_t> strides_in;
 +	std::vector<size_t> strides_out;
 +	size_t dist_in = 0;
 +	size_t dist_out = 0;
 +	layout::buffer_layout_t out_layout = layout::hermitian_planar;
 +	placeness::placeness_t where = placeness::out_of_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow5_large_2D_out_of_place_real_to_hermitian_planar)
 +{
 +	try { pow5_large_2D_out_of_place_real_to_hermitian_planar<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow5_large_2D_out_of_place_real_to_hermitian_planar)
 +{
 +	try { pow5_large_2D_out_of_place_real_to_hermitian_planar<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Precallback real-to-hermitian case: 1D length 25, batch 2, hermitian_interleaved, out_of_place, impulse pattern,
 +// with input and output using different non-unit strides and distances.
 +template< class T, class cl_T, class fftw_T >
 +void pow5_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex()
 +{
 +	data_pattern signal = impulse;
 +	size_t transforms = 2;
 +	std::vector<size_t> dims( 1, 25 );
 +
 +	// Input: stride 16, distance = last length * last stride + 128 (25 * 16 + 128 = 528).
 +	std::vector<size_t> strides_in( 1, 16 );
 +	size_t dist_in = dims.back() * strides_in.back() + 128;
 +
 +	// Output: stride 2, distance = last length * last stride + 2 (25 * 2 + 2 = 52).
 +	std::vector<size_t> strides_out( 1, 2 );
 +	size_t dist_out = dims.back() * strides_out.back() + 2;
 +
 +	layout::buffer_layout_t out_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t where = placeness::out_of_place;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>(
 +		signal, dims, transforms, strides_in, strides_out, dist_in, dist_out, out_layout, where );
 +}
 +
 +// float / cl_float / fftwf_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow5_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex)
 +{
 +	try { pow5_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex<float, cl_float, fftwf_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// double / cl_double / fftw_complex instantiation; a std::exception is handed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow5_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex)
 +{
 +	try { pow5_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex<double, cl_double, fftw_complex>(); }
 +	catch( const std::exception& failure ) { handle_exception( failure ); }
 +}
 +
 +// Test body: 3D transform with lengths small2 x normal2 x small2, batch of 1,
 +// in place. Both stride vectors are left empty and both distances are 0.
 +// Parameters are forwarded to precallback_real_to_complex with the sawtooth
 +// data pattern and the hermitian_interleaved layout.
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_3D_in_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern input_pattern = sawtooth;
 +	layout::buffer_layout_t buffer_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t transform_placeness = placeness::in_place;
 +
 +	std::vector<size_t> fft_lengths;
 +	fft_lengths.push_back( small2 );
 +	fft_lengths.push_back( normal2 );
 +	fft_lengths.push_back( small2 );
 +
 +	// Strides stay empty and distances stay zero for this case.
 +	std::vector<size_t> in_strides;
 +	std::vector<size_t> out_strides;
 +	size_t in_distance = 0;
 +	size_t out_distance = 0;
 +	size_t batch_count = 1;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>( input_pattern, fft_lengths, batch_count, in_strides, out_strides, in_distance, out_distance, buffer_layout, transform_placeness );
 +}
 +
 +// Runs pow2_normal_3D_in_place_real_to_hermitian_interleaved with
 +// < float, cl_float, fftwf_complex >; any std::exception thrown by it is
 +// passed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow2_normal_3D_in_place_real_to_hermitian_interleaved)
 +{
 +	try
 +	{
 +		pow2_normal_3D_in_place_real_to_hermitian_interleaved< float, cl_float, fftwf_complex >();
 +	}
 +	catch( const std::exception& failure )
 +	{
 +		handle_exception(failure);
 +	}
 +}
 +
 +// Runs pow2_normal_3D_in_place_real_to_hermitian_interleaved with
 +// < double, cl_double, fftw_complex >; any std::exception thrown by it is
 +// passed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow2_normal_3D_in_place_real_to_hermitian_interleaved)
 +{
 +	try
 +	{
 +		pow2_normal_3D_in_place_real_to_hermitian_interleaved< double, cl_double, fftw_complex >();
 +	}
 +	catch( const std::exception& failure )
 +	{
 +		handle_exception(failure);
 +	}
 +}
 +
 +// Test body: 3D transform with lengths normal2 x small2 x small2, batch of 1,
 +// out of place. Both stride vectors are left empty and both distances are 0.
 +// Parameters are forwarded to precallback_real_to_complex with the sawtooth
 +// data pattern and the hermitian_planar layout.
 +template< class T, class cl_T, class fftw_T >
 +void pow2_normal_3D_out_of_place_real_to_hermitian_planar()
 +{
 +	data_pattern input_pattern = sawtooth;
 +	layout::buffer_layout_t buffer_layout = layout::hermitian_planar;
 +	placeness::placeness_t transform_placeness = placeness::out_of_place;
 +
 +	std::vector<size_t> fft_lengths;
 +	fft_lengths.push_back( normal2 );
 +	fft_lengths.push_back( small2 );
 +	fft_lengths.push_back( small2 );
 +
 +	// Strides stay empty and distances stay zero for this case.
 +	std::vector<size_t> in_strides;
 +	std::vector<size_t> out_strides;
 +	size_t in_distance = 0;
 +	size_t out_distance = 0;
 +	size_t batch_count = 1;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>( input_pattern, fft_lengths, batch_count, in_strides, out_strides, in_distance, out_distance, buffer_layout, transform_placeness );
 +}
 +
 +// Runs pow2_normal_3D_out_of_place_real_to_hermitian_planar with
 +// < float, cl_float, fftwf_complex >; any std::exception thrown by it is
 +// passed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow2_normal_3D_out_of_place_real_to_hermitian_planar)
 +{
 +	try
 +	{
 +		pow2_normal_3D_out_of_place_real_to_hermitian_planar< float, cl_float, fftwf_complex >();
 +	}
 +	catch( const std::exception& failure )
 +	{
 +		handle_exception(failure);
 +	}
 +}
 +
 +// Runs pow2_normal_3D_out_of_place_real_to_hermitian_planar with
 +// < double, cl_double, fftw_complex >; any std::exception thrown by it is
 +// passed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow2_normal_3D_out_of_place_real_to_hermitian_planar)
 +{
 +	try
 +	{
 +		pow2_normal_3D_out_of_place_real_to_hermitian_planar< double, cl_double, fftw_complex >();
 +	}
 +	catch( const std::exception& failure )
 +	{
 +		handle_exception(failure);
 +	}
 +}
 +
 +// Test body: 3D transform with lengths small2 x small2 x small2, batch of 1,
 +// out of place. Both stride vectors are left empty and both distances are 0.
 +// Parameters are forwarded to precallback_real_to_complex with the sawtooth
 +// data pattern and the hermitian_interleaved layout.
 +template< class T, class cl_T, class fftw_T >
 +void pow2_small_3D_out_of_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern input_pattern = sawtooth;
 +	layout::buffer_layout_t buffer_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t transform_placeness = placeness::out_of_place;
 +
 +	std::vector<size_t> fft_lengths;
 +	fft_lengths.push_back( small2 );
 +	fft_lengths.push_back( small2 );
 +	fft_lengths.push_back( small2 );
 +
 +	// Strides stay empty and distances stay zero for this case.
 +	std::vector<size_t> in_strides;
 +	std::vector<size_t> out_strides;
 +	size_t in_distance = 0;
 +	size_t out_distance = 0;
 +	size_t batch_count = 1;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>( input_pattern, fft_lengths, batch_count, in_strides, out_strides, in_distance, out_distance, buffer_layout, transform_placeness );
 +}
 +
 +// Runs pow2_small_3D_out_of_place_real_to_hermitian_interleaved with
 +// < float, cl_float, fftwf_complex >; any std::exception thrown by it is
 +// passed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow2_small_3D_out_of_place_real_to_hermitian_interleaved)
 +{
 +	try
 +	{
 +		pow2_small_3D_out_of_place_real_to_hermitian_interleaved< float, cl_float, fftwf_complex >();
 +	}
 +	catch( const std::exception& failure )
 +	{
 +		handle_exception(failure);
 +	}
 +}
 +
 +// Runs pow2_small_3D_out_of_place_real_to_hermitian_interleaved with
 +// < double, cl_double, fftw_complex >; any std::exception thrown by it is
 +// passed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow2_small_3D_out_of_place_real_to_hermitian_interleaved)
 +{
 +	try
 +	{
 +		pow2_small_3D_out_of_place_real_to_hermitian_interleaved< double, cl_double, fftw_complex >();
 +	}
 +	catch( const std::exception& failure )
 +	{
 +		handle_exception(failure);
 +	}
 +}
 +
 +// Test body: 3D transform with lengths 2 x 2 x large2, batch of 1, in place.
 +// Both stride vectors are left empty and both distances are 0. Parameters
 +// are forwarded to precallback_real_to_complex with the sawtooth data
 +// pattern and the hermitian_interleaved layout.
 +template< class T, class cl_T, class fftw_T >
 +void pow2_large_3D_in_place_real_to_hermitian_interleaved()
 +{
 +	data_pattern input_pattern = sawtooth;
 +	layout::buffer_layout_t buffer_layout = layout::hermitian_interleaved;
 +	placeness::placeness_t transform_placeness = placeness::in_place;
 +
 +	std::vector<size_t> fft_lengths;
 +	fft_lengths.push_back( 2 );
 +	fft_lengths.push_back( 2 );
 +	fft_lengths.push_back( large2 );
 +
 +	// Strides stay empty and distances stay zero for this case.
 +	std::vector<size_t> in_strides;
 +	std::vector<size_t> out_strides;
 +	size_t in_distance = 0;
 +	size_t out_distance = 0;
 +	size_t batch_count = 1;
 +
 +	precallback_real_to_complex<T, cl_T, fftw_T>( input_pattern, fft_lengths, batch_count, in_strides, out_strides, in_distance, out_distance, buffer_layout, transform_placeness );
 +}
 +
 +// Runs pow2_large_3D_in_place_real_to_hermitian_interleaved with
 +// < float, cl_float, fftwf_complex >; any std::exception thrown by it is
 +// passed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, pow2_large_3D_in_place_real_to_hermitian_interleaved)
 +{
 +	try
 +	{
 +		pow2_large_3D_in_place_real_to_hermitian_interleaved< float, cl_float, fftwf_complex >();
 +	}
 +	catch( const std::exception& failure )
 +	{
 +		handle_exception(failure);
 +	}
 +}
 +
 +// Runs pow2_large_3D_in_place_real_to_hermitian_interleaved with
 +// < double, cl_double, fftw_complex >; any std::exception thrown by it is
 +// passed to handle_exception.
 +TEST_F(accuracy_test_precallback_double, pow2_large_3D_in_place_real_to_hermitian_interleaved)
 +{
 +	try
 +	{
 +		pow2_large_3D_in_place_real_to_hermitian_interleaved< double, cl_double, fftw_complex >();
 +	}
 +	catch( const std::exception& failure )
 +	{
 +		handle_exception(failure);
 +	}
 +}
 +
++#pragma endregion
++
 +// Test body: 1D transform of length 64, batch of 1, in place, forward
 +// direction, with complex_interleaved as both input and output layout. Both
 +// stride vectors are left empty and both distances are 0. Parameters are
 +// forwarded to precallback_complex_to_complex_lds with the impulse data
 +// pattern.
 +template< class T, class cl_T, class fftw_T >
 +void lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved()
 +{
 +	data_pattern input_pattern = impulse;
 +	direction::direction_t transform_direction = direction::forward;
 +	placeness::placeness_t transform_placeness = placeness::in_place;
 +	layout::buffer_layout_t source_layout = layout::complex_interleaved;
 +	layout::buffer_layout_t result_layout = layout::complex_interleaved;
 +
 +	std::vector<size_t> fft_lengths;
 +	fft_lengths.push_back( 64 );
 +
 +	// Strides stay empty and distances stay zero for this case.
 +	std::vector<size_t> in_strides;
 +	std::vector<size_t> out_strides;
 +	size_t in_distance = 0;
 +	size_t out_distance = 0;
 +	size_t batch_count = 1;
 +
 +	precallback_complex_to_complex_lds<T, cl_T, fftw_T>( input_pattern, transform_direction, fft_lengths, batch_count, in_strides, out_strides, in_distance, out_distance, source_layout, result_layout, transform_placeness );
 +}
 +
 +// Runs lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved
 +// with < float, cl_float, fftwf_complex >; any std::exception thrown by it
 +// is passed to handle_exception.
 +TEST_F(accuracy_test_precallback_single, lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved)
 +{
 +	try
 +	{
 +		lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >();
 +	}
 +	catch( const std::exception& failure )
 +	{
 +		handle_exception(failure);
 +	}
 +}
 +
 +}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list