[clfft] 47/74: merging post call back changes from Pradeep's branch.

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Jan 14 19:52:16 UTC 2016


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository clfft.

commit 2197426973b1eb509ab9ca0c5ef1444bea9a5229
Author: santanu-thangaraj <t.santanu at gmail.com>
Date:   Fri Dec 18 11:44:48 2015 +0530

    merging post call back changes from Pradeep's branch.
---
 src/callback-client/callback-client.cpp            |  149 +-
 src/callback-client/client.h                       |   18 +-
 src/library/accessors.cpp                          |   21 +
 src/library/action.cpp                             |   23 +-
 src/library/generator.copy.cpp                     |   85 +-
 src/library/generator.stockham.cpp                 |  603 ++++-
 src/library/generator.stockham.h                   |    3 +
 src/library/generator.transpose.gcn.cpp            |   92 +-
 src/library/generator.transpose.nonsquare.cpp      |  205 +-
 src/library/generator.transpose.square.cpp         |  225 +-
 src/library/mainpage.h                             |   83 +-
 src/library/plan.cpp                               |  151 +-
 src/library/plan.h                                 |   12 +
 src/tests/CMakeLists.txt                           |    3 +-
 src/tests/accuracy_test_common.h                   |  403 +++
 ...llback.cpp => accuracy_test_mixed_callback.cpp} |  237 +-
 src/tests/accuracy_test_postcallback.cpp           | 2738 ++++++++++++++++++++
 src/tests/cl_transform.h                           |   65 +-
 src/tests/fftw_transform.h                         |   34 +
 src/tests/test_constants.h                         |   82 +-
 20 files changed, 4884 insertions(+), 348 deletions(-)

diff --git a/src/callback-client/callback-client.cpp b/src/callback-client/callback-client.cpp
index 2082aff..39032c6 100644
--- a/src/callback-client/callback-client.cpp
+++ b/src/callback-client/callback-client.cpp
@@ -29,7 +29,7 @@ int main(int argc, char **argv)
 			( "help,h",        "produces this help message" )
 			( "dumpKernels,d", "FFT engine will dump generated OpenCL FFT kernels to disk (default: dump off)" )
 			( "batchSize,b",   po::value< size_t >( &batchSize )->default_value( 1 ), "If this value is greater than one, arrays will be used " )
-			( "profile,p",     po::value< cl_uint >( &profile_count )->default_value( 10 ), "Time and report the kernel speed of the FFT (default: profiling off)" )
+			( "profile,p",     po::value< cl_uint >( &profile_count )->default_value( 10 ), "Time and report the kernel speed of the FFT (default: profiling on)" )
 			;
 
 		po::variables_map vm;
@@ -48,14 +48,8 @@ int main(int argc, char **argv)
 		}
 			
 		clfftDim dim = CLFFT_1D;
-		if( lengths[ 1 ] > 1 )
-		{
-			dim	= CLFFT_2D;
-		}
-		if( lengths[ 2 ] > 1 )
-		{
-			dim	= CLFFT_3D;
-		}
+		
+		tout << "\nRunning FFT for length " << BATCH_LENGTH << " and batch size " << batchSize << std::endl;
 
 		 // Real-Complex cases, SP
 		
@@ -100,11 +94,11 @@ void R2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths,
 	if (precision == CLFFT_SINGLE)
 	{
 		//Run clFFT with seaparate Pre-process Kernel
-		runR2CPreprocessKernelFFT<float>(setupData, context, commandQueue, device_id[0], inlengths, dim, precision, 
+		runR2C_FFT_PreAndPostprocessKernel<float>(setupData, context, commandQueue, device_id[0], inlengths, dim, precision, 
 										batchSize, vectorLength, fftLength, profile_count);
 
 		//Run clFFT using pre-callback 
-		runR2CPrecallbackFFT<float>(setupData, context, commandQueue, inlengths, dim, precision, 
+		runR2C_FFT_WithCallback<float>(setupData, context, commandQueue, inlengths, dim, precision, 
 									batchSize, vectorLength, fftLength, profile_count);
 	}
 
@@ -113,7 +107,7 @@ void R2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths,
 }
 
 template < typename T >
-void runR2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
+void runR2C_FFT_WithCallback(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
 						size_t* inlengths, clfftDim dim, clfftPrecision precision,
 						size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
 {
@@ -158,8 +152,12 @@ void runR2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context
 	//Precallback setup
 	char* precallbackstr = STRINGIFY(ConvertToFloat);
 
+	//Postcallback setup
+	char* postcallbackstr = STRINGIFY(MagnitudeExtraction);
+
 	//Register the callback
 	OPENCL_V_THROW (clfftSetPlanCallback(plan_handle, "convert24To32bit", precallbackstr, 0, PRECALLBACK, NULL, 0), "clFFTSetPlanCallback failed");
+	OPENCL_V_THROW (clfftSetPlanCallback(plan_handle, "extractMagnitude", postcallbackstr, 0, POSTCALLBACK, NULL, 0), "clFFTSetPlanCallback failed");
 
 	//	Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
 	OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
@@ -209,14 +207,14 @@ void runR2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context
 		double wtimesample = tr.Sample();
 		double wtime = wtimesample/((double)profile_count);
 	
-		tout << "\nExecution wall time (with clFFT Pre-callback): " << 1000.0*wtime << " ms" << std::endl;
+		tout << "\nExecution wall time (with clFFT Callback): " << 1000.0*wtime << " ms" << std::endl;
 	}
 
 	if(clMedBuffer) clReleaseMemObject(clMedBuffer);
 	
 	if (profile_count == 1)
 	{
-		std::vector< std::complex< T > > output( fftLength/2 );
+		std::vector< T > output( fftLength );
 
 		OPENCL_V_THROW( clEnqueueReadBuffer( commandQueue, outfftbuffer, CL_TRUE, 0, out_size_of_buffers, &output[ 0 ],
 			0, NULL, NULL ), "Reading the result buffer failed" );
@@ -228,11 +226,11 @@ void runR2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context
 
 		if (!compare<fftwf_complex, T>(refout, output, fftLength/2))
 		{
-			std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****FAIL*****" << std::endl;
+			std::cout << "\n\n\t\tInternal Client Test (with clFFT Callback) *****FAIL*****" << std::endl;
 		}
 		else
 		{
-			std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****PASS*****" << std::endl;
+			std::cout << "\n\n\t\tInternal Client Test (with clFFT Callback) *****PASS*****" << std::endl;
 		}
 
 		fftwf_free(refout);
@@ -247,7 +245,7 @@ void runR2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context
 }
 
 template < typename T >
-void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, 
+void runR2C_FFT_PreAndPostprocessKernel(std::auto_ptr< clfftSetupData > setupData, cl_context context, 
 							cl_command_queue commandQueue, cl_device_id device_id,
 							size_t* inlengths, clfftDim dim, clfftPrecision precision,
 							size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
@@ -282,6 +280,10 @@ void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_con
 	cl_mem outfftbuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE, out_size_of_buffers, NULL, &status);
     OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(oufftbuffer) )" );
 
+	//Output buffer that receives the magnitudes extracted from the FFT result
+	cl_mem magoutfftbuffer = ::clCreateBuffer( context, CL_MEM_WRITE_ONLY, out_size_of_buffers/2, NULL, &status);
+    OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(magoutfftbuffer) )" );
+
 	//clFFT initializations
 	
 	//	FFT state
@@ -317,8 +319,13 @@ void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_con
 		OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
 	}
 
-	//Pre-process kernel string
-	const char* source = STRINGIFY(ConvertToFloat_KERNEL);
+	//Pre and post process kernel string
+	std::string sourceStr;
+	sourceStr += STRINGIFY(ConvertToFloat_KERNEL);
+	sourceStr += "\n";
+	sourceStr += STRINGIFY(MagnitudeExtraction_KERNEL);
+
+	const char* source = sourceStr.c_str();
 	
 	cl_program program = clCreateProgramWithSource( context, 1, &source, NULL, &status );
 	OPENCL_V_THROW( status, "clCreateProgramWithSource failed." );
@@ -351,22 +358,29 @@ void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_con
 	}
 #endif
 
-	cl_kernel kernel = clCreateKernel( program, "convert24To32bit", &status );
-	OPENCL_V_THROW( status, "clCreateKernel failed" );
+	//For functional test
 
-	//for functional test
-	cl_uint uarg = 0;
+	//Pre-process kernel
+	cl_kernel prekernel = clCreateKernel( program, "convert24To32bit", &status );
+	OPENCL_V_THROW( status, "clCreateKernel convert24To32bit failed" );
 
 	//Input 24bit Buffer 
-	OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&in24bitfftbuffer ), "clSetKernelArg failed" );
+	OPENCL_V_THROW( clSetKernelArg( prekernel, 0, sizeof( cl_mem ), (void*)&in24bitfftbuffer ), "clSetKernelArg failed" );
 	
 	//output 32bit Buffer 
-	OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&in32bitfftbuffer ), "clSetKernelArg failed" );
+	OPENCL_V_THROW( clSetKernelArg( prekernel, 1, sizeof( cl_mem ), (void*)&in32bitfftbuffer ), "clSetKernelArg failed" );
+
+	//Post-process kernel
+	cl_kernel postkernel = clCreateKernel( program, "extractMagnitude", &status );
+	OPENCL_V_THROW( status, "clCreateKernel extractMagnitude failed" );
+
+	OPENCL_V_THROW( clSetKernelArg( postkernel, 0, sizeof( cl_mem ), (void*)&outfftbuffer ), "clSetKernelArg failed" );
+	OPENCL_V_THROW( clSetKernelArg( postkernel, 1, sizeof( cl_mem ), (void*)&magoutfftbuffer ), "clSetKernelArg failed" );
 
 	//Launch pre-process kernel
-	size_t gSize = fftLength;
-	status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
-											NULL, &gSize, NULL, 0, NULL, NULL );
+	size_t gSize_pre = fftLength;
+	status = clEnqueueNDRangeKernel( commandQueue, prekernel, 1,
+							NULL, &gSize_pre, NULL, 0, NULL, NULL );
 	OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
 	
 	OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
@@ -376,6 +390,12 @@ void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_con
 		&in32bitfftbuffer, &outfftbuffer, clMedBuffer ),
 		"clfftEnqueueTransform failed" );
 		
+	size_t gSize_post = fftLength/2;
+	//Launch post-process kernel
+	status = clEnqueueNDRangeKernel( commandQueue, postkernel, 1,
+						NULL, &gSize_post, NULL, 0, NULL, NULL );
+	OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
+
 	OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
 	
 	if (profile_count > 1)
@@ -386,17 +406,16 @@ void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_con
 		//	Loop as many times as the user specifies to average out the timings
 		for( cl_uint i = 0; i < profile_count; ++i )
 		{
-			uarg = 0;
+			//Launch pre-process kernel
 
 			//Input 24bit Buffer 
-			OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&in24bitfftbuffer ), "clSetKernelArg failed" );
+			OPENCL_V_THROW( clSetKernelArg( prekernel, 0, sizeof( cl_mem ), (void*)&in24bitfftbuffer ), "clSetKernelArg failed" );
 	
 			//output 32bit Buffer 
-			OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&in32bitfftbuffer ), "clSetKernelArg failed" );
+			OPENCL_V_THROW( clSetKernelArg( prekernel, 1, sizeof( cl_mem ), (void*)&in32bitfftbuffer ), "clSetKernelArg failed" );
 
-			//Launch pre-process kernel
-			status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
-													NULL, &gSize, NULL, 0, NULL, NULL );
+			status = clEnqueueNDRangeKernel( commandQueue, prekernel, 1,
+										NULL, &gSize_pre, NULL, 0, NULL, NULL );
 			OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
 	
 			OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
@@ -406,25 +425,34 @@ void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_con
 				&in32bitfftbuffer,  &outfftbuffer, clMedBuffer ),
 				"clfftEnqueueTransform failed" );
 		
+			//Launch post-process kernel
+			OPENCL_V_THROW( clSetKernelArg( postkernel, 0, sizeof( cl_mem ), (void*)&outfftbuffer ), "clSetKernelArg failed" );
+			OPENCL_V_THROW( clSetKernelArg( postkernel, 1, sizeof( cl_mem ), (void*)&magoutfftbuffer ), "clSetKernelArg failed" );
+
+			status = clEnqueueNDRangeKernel( commandQueue, postkernel, 1,
+								NULL, &gSize_post, NULL, 0, NULL, NULL );
+			OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
+
 			OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
 		}
 		double wtimesample = tr.Sample();
 		double wtime = wtimesample/((double)profile_count);
 	
-		tout << "\nExecution wall time (Separate Pre-process Kernel): " << 1000.0*wtime << " ms" << std::endl;
+		tout << "\nExecution wall time (Separate Pre and Post process kernels): " << 1000.0*wtime << " ms" << std::endl;
 	}
 
 	//cleanup preprocess kernel opencl objects
 	OPENCL_V_THROW( clReleaseProgram( program ), "Error: In clReleaseProgram\n" );
-	OPENCL_V_THROW( clReleaseKernel( kernel ), "Error: In clReleaseKernel\n" );
+	OPENCL_V_THROW( clReleaseKernel( prekernel ), "Error: In clReleaseKernel\n" );
+	OPENCL_V_THROW( clReleaseKernel( postkernel ), "Error: In clReleaseKernel\n" );
 
 	if(clMedBuffer) clReleaseMemObject(clMedBuffer);
 
 	if (profile_count == 1)
 	{
-		std::vector< std::complex< T > > output( fftLength/2 );
+		std::vector< T > output( fftLength/2 );
 
-		OPENCL_V_THROW( clEnqueueReadBuffer( commandQueue, outfftbuffer, CL_TRUE, 0, out_size_of_buffers, &output[ 0 ],
+		OPENCL_V_THROW( clEnqueueReadBuffer( commandQueue, magoutfftbuffer, CL_TRUE, 0, out_size_of_buffers/2, &output[ 0 ],
 			0, NULL, NULL ), "Reading the result buffer failed" );
 
 		//Reference fftw output
@@ -432,17 +460,13 @@ void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_con
 
 		refout = get_R2C_fftwf_output(inlengths, fftLength, (int)batchSize, inLayout, dim);
 
-		/*for( cl_uint i = 0; i < fftLength/2; i++)
-		{
-			std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
-		}*/
 		if (!compare<fftwf_complex, T>(refout, output, fftLength/2))
 		{
-			std::cout << "\n\n\t\tInternal Client Test (Separate Pre-process Kernel) *****FAIL*****" << std::endl;
+			std::cout << "\n\n\t\tInternal Client Test (Separate Pre and Post process kernels) *****FAIL*****" << std::endl;
 		}
 		else
 		{
-			std::cout << "\n\n\t\tInternal Client Test (Separate Pre-process Kernel) *****PASS*****" << std::endl;
+			std::cout << "\n\n\t\tInternal Client Test (Separate Pre and Post process kernels) *****PASS*****" << std::endl;
 		}
 
 		fftwf_free(refout);
@@ -455,11 +479,12 @@ void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_con
 	OPENCL_V_THROW( clReleaseMemObject( in24bitfftbuffer ), "Error: In clReleaseMemObject\n" );
 	OPENCL_V_THROW( clReleaseMemObject( in32bitfftbuffer ), "Error: In clReleaseMemObject\n" );
 	OPENCL_V_THROW( clReleaseMemObject( outfftbuffer ), "Error: In clReleaseMemObject\n" );
+	OPENCL_V_THROW( clReleaseMemObject( magoutfftbuffer ), "Error: In clReleaseMemObject\n" );
 }
 
 //Compare reference and opencl output 
 template < typename T1, typename T2>
-bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
+bool compare(T1 *refData, std::vector< T2 > data,
              size_t length, const float epsilon)
 {
     float error = 0.0f;
@@ -470,7 +495,7 @@ bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
 
     for(size_t i = 0; i < length; ++i)
     {
-        diff[0] = refData[i][0] - data[i].real();
+        diff[0] = refData[i][0] - data[i];
         error += (float)(diff[0] * diff[0]);
         ref[0] += refData[i][0] * refData[i][0];
     }
@@ -488,33 +513,8 @@ bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
 			return false;
 	}
 
-	//imag
-	error = 0.0f;
-	ref[1] = 0.0;
-	for(size_t i = 0; i < length; ++i)
-    {
-        diff[1] = refData[i][1] - data[i].imag();
-        error += (float)(diff[1] * diff[1]);
-        ref[1] += refData[i][1] * refData[i][1];
-    }
-	
-	if (error == 0)
 		return true;
-
-	normRef =::sqrtf((float) ref[1]);
-    if (::fabs((float) ref[1]) < 1e-7f)
-    {
-        return false;
     }
-	normError = ::sqrtf((float) error);
-    error = normError / normRef;
-    
-	if (error > epsilon)
-		return false;
-
-	return true;
-}
-
 
 // Compute reference output using fftw for float type
 fftwf_complex* get_R2C_fftwf_output(size_t* lengths, size_t fftbatchLength, int batch_size,
@@ -565,5 +565,12 @@ fftwf_complex* get_R2C_fftwf_output(size_t* lengths, size_t fftbatchLength, int
 
 	fftwf_destroy_plan(refPlan);
 
+	//Execute post-process code
+	for (size_t idx = 0; idx < (outfftVectorLength*batch_size); ++idx)
+	{
+		float magnitude = sqrtf(pow(refout[idx][0], 2) + pow(refout[idx][1], 2));
+		refout[idx][0] = magnitude;
+	}
+
 	return refout;
 }
diff --git a/src/callback-client/client.h b/src/callback-client/client.h
index 1b75e29..474456a 100644
--- a/src/callback-client/client.h
+++ b/src/callback-client/client.h
@@ -52,18 +52,30 @@ typedef unsigned char uint24_t[3];
 								*((__global float*)output + inoffset) = val;  \n \
 							} \n
 
+#define MagnitudeExtraction void extractMagnitude(__global void *output, uint outoffset, __global void *userdata, float2 fftoutput) \n \
+							{ \n \
+								float magnitude = sqrt(fftoutput.x * fftoutput.x + fftoutput.y * fftoutput.y); \n \
+								*((__global float*)output + outoffset) = magnitude; \n \
+							} \n
+
+#define MagnitudeExtraction_KERNEL __kernel void extractMagnitude(__global float2 *output, __global float *magoutput) \n \
+							{ \n \
+								uint outoffset = get_global_id(0); \n \
+								float magnitude = sqrt(output[outoffset].x * output[outoffset].x + output[outoffset].y * output[outoffset].y); \n \
+								*(magoutput + outoffset) = magnitude; \n \
+							} \n
 
 template < typename T >
 void R2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize, 
 				   clfftDim dim, clfftPrecision precision,  cl_uint profile_count);
 
 template < typename T >
-void runR2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
+void runR2C_FFT_WithCallback(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
 						size_t* inlengths, clfftDim dim, clfftPrecision precision,
 						size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
 
 template < typename T >
-void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, 
+void runR2C_FFT_PreAndPostprocessKernel(std::auto_ptr< clfftSetupData > setupData, cl_context context, 
 							cl_command_queue commandQueue, cl_device_id device_id,
 							size_t* inlengths, clfftDim dim, clfftPrecision precision,
 							size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
@@ -72,7 +84,7 @@ fftwf_complex* get_R2C_fftwf_output(size_t* lengths, size_t fftbatchLength, int
 									clfftLayout in_layout, clfftDim dim);
 
 template < typename T1, typename T2>
-bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
+bool compare(T1 *refData, std::vector< T2 > data,
              size_t length, const float epsilon = 1e-6f);
 
 #ifdef WIN32
diff --git a/src/library/accessors.cpp b/src/library/accessors.cpp
index 1cc3ee9..f7ccbcc 100644
--- a/src/library/accessors.cpp
+++ b/src/library/accessors.cpp
@@ -806,7 +806,28 @@ clfftStatus clfftSetPlanCallback(clfftPlanHandle plHandle, const char* funcName,
 
 		break;
 	case POSTCALLBACK:
+		{
+			ARG_CHECK(funcName != NULL);
+			ARG_CHECK(funcString != NULL);
+			ARG_CHECK(numUserdataBuffers >= 0);
+
+			//	We do not currently support multiple user data buffers
+			if( numUserdataBuffers > 1 )
 		return CLFFT_NOTIMPLEMENTED;
+
+			fftPlan->hasPostCallback = true;
+			fftPlan->postCallbackParam.funcname = funcName;
+			fftPlan->postCallbackParam.funcstring = funcString;
+			fftPlan->postCallbackParam.localMemSize = (localMemSize > 0) ? localMemSize : 0;
+
+			cl_mem userdataBuf = NULL;
+			
+			if (userdata)
+				userdataBuf = userdata[0];
+
+			fftPlan->postcallUserData = userdataBuf;
+		}
+		break;
 	default:
 		ARG_CHECK (false);
 	}
diff --git a/src/library/action.cpp b/src/library/action.cpp
index a68f272..8501825 100644
--- a/src/library/action.cpp
+++ b/src/library/action.cpp
@@ -593,16 +593,31 @@ clfftStatus FFTAction::enqueue(clfftPlanHandle plHandle,
         OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&outputBuff[o] ), _T( "clSetKernelArg failed" ) );
     }
 
-	//If pre-callback function is set for the plan, pass the appropriate aruments
+	//If callback function is set for the plan, pass the appropriate arguments
+	if (this->plan->hasPreCallback || this->plan->hasPostCallback)
+	{
 	if (this->plan->hasPreCallback)
 	{
 		OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&this->plan->precallUserData ), _T( "clSetKernelArg failed" ) );
+		}
+
+		//If post-callback function is set for the plan, pass the appropriate arguments
+		if (this->plan->hasPostCallback)
+		{
+			OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&this->plan->postcallUserData ), _T( "clSetKernelArg failed" ) );
+		}
 
 		//Pass LDS size arument if set
-		if (this->plan->preCallback.localMemSize > 0)
+		if ((this->plan->hasPreCallback && this->plan->preCallback.localMemSize > 0) || 
+			(this->plan->hasPostCallback && this->plan->postCallbackParam.localMemSize > 0))
 		{
-			//TODO: Check for available LDS beyond what FFT already uses
-			OPENCL_V( clSetKernelArg( kern, uarg++, this->plan->preCallback.localMemSize, NULL ), _T( "clSetKernelArg failed" ) );
+			int localmemSize = 0;
+			if (this->plan->hasPreCallback && this->plan->preCallback.localMemSize > 0)
+				localmemSize = this->plan->preCallback.localMemSize;
+			if (this->plan->hasPostCallback && this->plan->postCallbackParam.localMemSize > 0)
+				localmemSize += this->plan->postCallbackParam.localMemSize;
+
+			OPENCL_V( clSetKernelArg( kern, uarg++, localmemSize, NULL ), _T( "clSetKernelArg failed" ) );
 		}
 	}
 
diff --git a/src/library/generator.copy.cpp b/src/library/generator.copy.cpp
index 67d5d90..b55f599 100644
--- a/src/library/generator.copy.cpp
+++ b/src/library/generator.copy.cpp
@@ -169,6 +169,14 @@ namespace CopyGenerator
 				str += "\n\n";
 			}
 
+			//if postcallback is set
+			if (params.fft_hasPostCallback)
+			{
+				//Insert callback function code at the beginning 
+				str += params.fft_postCallback.funcstring;
+				str += "\n\n";
+			}
+
 			// Copy kernel begin
 			str += "__kernel void ";
 
@@ -205,13 +213,26 @@ namespace CopyGenerator
 
 			if (params.fft_hasPreCallback && h2c)
 			{
-				str += ", __global void* userdata";
+				assert(!params.fft_hasPostCallback);
+
+				str += ", __global void* pre_userdata";
 				if (params.fft_preCallback.localMemSize > 0)
 				{
 					str += ", __local void* localmem";
 				}
 			}
 
+			if (params.fft_hasPostCallback)
+			{
+				assert(!params.fft_hasPreCallback);
+
+				str += ", __global void* post_userdata";
+				if (params.fft_postCallback.localMemSize > 0)
+				{
+					str += ", __local void* localmem";
+				}
+			}
+
 			str += ")\n";
 
 			str += "{\n";
@@ -249,7 +270,10 @@ namespace CopyGenerator
 			// output
 			if(outIlvd)
 			{
+				if (!params.fft_hasPostCallback)
+				{
 					str += "__global "; str += r2Type; str += " *lwbOut;\n";
+				}
 				if(h2c)
 				{
 					str += "\t";
@@ -258,8 +282,11 @@ namespace CopyGenerator
 			}
 			else
 			{
+				if (!params.fft_hasPostCallback)
+				{
 					str += "__global "; str += rType; str += " *lwbOutRe;\n\t";
 					str += "__global "; str += rType; str += " *lwbOutIm;\n";
+				}
 				if(h2c)
 				{
 					str += "\t";
@@ -321,7 +348,10 @@ namespace CopyGenerator
 			// outputs
 			if(outIlvd)
 			{
+				if (!params.fft_hasPostCallback)
+				{
 					str += "lwbOut = gbOut + oOffset"; str += outF; str += ";\n";
+				}
 				if(h2c)
 				{
 					str += "\t";
@@ -330,8 +360,11 @@ namespace CopyGenerator
 			}
 			else
 			{
+				if (!params.fft_hasPostCallback)
+				{
 					str += "lwbOutRe = gbOutRe + oOffset"; str += outF; str += ";\n\t";
 					str += "lwbOutIm = gbOutIm + oOffset"; str += outF; str += ";\n";
+				}
 				if(h2c)
 				{
 					str += "\t";
@@ -386,25 +419,52 @@ namespace CopyGenerator
 				
 					if(outIlvd)
 					{
+						if (params.fft_hasPostCallback)
+						{
+							str += params.fft_postCallback.funcname; str += "(gbOut, oOffset"; str += outF;
+							str += ", post_userdata, R";
+							if (params.fft_postCallback.localMemSize > 0)
+							{
+								str += ", localmem";
+							}
+							str += ");\n\n";
+						}
+						else
+						{
 						str += "lwbOut[0] = R;\n\n";
 					}
+					}
+					else
+					{
+						if (params.fft_hasPostCallback)
+						{
+							str += params.fft_postCallback.funcname; str += "(gbOutRe, gbOutIm, oOffset"; str += outF;
+							str += ", post_userdata, R.x, R.y";
+
+							if (params.fft_postCallback.localMemSize > 0)
+							{
+								str += ", localmem";
+							}
+							str += ");\n\t";
+						}
 					else
 					{
 						str += "lwbOutRe[0] = R.x;\n\t";
 						str += "lwbOutIm[0] = R.y;\n\t";
 					}
 				}
+				}
 				else
 				{
 					if (params.fft_hasPreCallback)
 					{
 						if(inIlvd)
 						{
-							str += "R = "; str += params.fft_preCallback.funcname; str += "( gbIn, (iOffset"; str += inF; str += "), userdata"; 
+							str += "R = "; str += params.fft_preCallback.funcname; str += "( gbIn, (iOffset"; str += inF; str += "), pre_userdata"; 
 						}
 						else
 						{
-							str += "R = "; str += params.fft_preCallback.funcname; str += "( gbInRe, gbInIm, (iOffset"; str += inF; str += "), userdata";
+							str += "R = "; str += params.fft_preCallback.funcname; str += "( gbInRe, gbInIm, (iOffset"; str += inF; str += "), pre_userdata";
 						}
 						if (params.fft_preCallback.localMemSize > 0)
 						{
@@ -486,6 +546,8 @@ clfftStatus FFTGeneratedCopyAction::initParams ()
 	//Set callback if specified
 	if (this->plan->hasPreCallback)
 	{
+		assert(!this->plan->hasPostCallback);
+
 		this->signature.fft_hasPreCallback = true;
 		this->signature.fft_preCallback = this->plan->preCallback;
 
@@ -497,6 +559,23 @@ clfftStatus FFTGeneratedCopyAction::initParams ()
 		}
 	}
 
+	if (this->plan->hasPostCallback)
+	{
+		assert(!this->plan->hasPreCallback);
+
+		this->signature.fft_hasPostCallback = true;
+		this->signature.fft_postCallback = this->plan->postCallbackParam;
+
+		//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
+		//Assumes copy kernel does not have both pre and post callback 
+		if (this->plan->postCallbackParam.localMemSize > this->plan->envelope.limit_LocalMemSize)
+		{
+			fprintf(stderr, "Requested local memory size not available\n");
+			return CLFFT_INVALID_ARG_VALUE;
+		}
+	}
+	this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
+
     return CLFFT_SUCCESS;
 }
 
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 49a7ff4..2dedf4c 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -622,6 +622,9 @@ namespace StockhamGenerator
 		bool fft_doPreCallback;
 		clfftCallbackParam fft_preCallback;
 
+		bool fft_doPostCallback;
+		clfftCallbackParam fft_postCallback;
+
 		inline void RegBase(size_t regC, std::string &str) const
 		{
 			str += "B";
@@ -784,13 +787,14 @@ namespace StockhamGenerator
 			std::string rType  = RegBaseType<PR>(1);
 
 			size_t butterflyIndex = numPrev;
+			std::string bufOffset;
 
 			std::string regBase;
 			RegBase(regC, regBase);
 
 			// special write back to global memory with float4 grouping, writing 2 complex numbers at once
 			if( numB && (numB%2 == 0) && (regC == 1) && (stride == 1) && (numButterfly%2 == 0) && (algLS%2 == 0) && (flag == SR_WRITE) &&
-				(nextPass == NULL) && interleaved && (component == SR_COMP_BOTH) && linearRegs && enableGrouping )
+				(nextPass == NULL) && interleaved && (component == SR_COMP_BOTH) && linearRegs && enableGrouping && !fft_doPostCallback )
 			{
 				assert((numButterfly * workGroupSize) == algLS);
 				assert(bufferRe.compare(bufferIm) == 0); // Make sure Real & Imag buffer strings are same for interleaved data
@@ -887,7 +891,7 @@ namespace StockhamGenerator
 							}
 
 							//get offset 
-							std::string bufOffset;
+							bufOffset.clear();
 							bufOffset += offset; bufOffset += " + ( "; bufOffset += SztToStr(numPrev); bufOffset += " + ";
 							bufOffset += "me*"; bufOffset += SztToStr(numButterfly); bufOffset += " + ";
 							bufOffset += SztToStr(i); bufOffset += " + ";
@@ -908,7 +912,7 @@ namespace StockhamGenerator
 								{
 									passStr += bufferRe; passStr += ", "; passStr += bufferIm; passStr += ", ";
 								}
-								passStr += bufOffset; passStr += ", userdata";
+								passStr += bufOffset; passStr += ", pre_userdata";
 								if (fft_preCallback.localMemSize > 0)
 								{
 									passStr += ", localmem";
@@ -937,7 +941,7 @@ namespace StockhamGenerator
 								else if (r2c)
 								{
 									passStr += fft_preCallback.funcname; passStr += "("; passStr += buffer; passStr += ", ";
-									passStr += bufOffset; passStr += ", userdata";
+									passStr += bufOffset; passStr += ", pre_userdata";
 
 									if (fft_preCallback.localMemSize > 0)
 									{
@@ -979,6 +983,7 @@ namespace StockhamGenerator
 						if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i == 0))
 							passStr += "\n\t}\n\tif( rw && !me)\n\t{";
 
+						std::string regIndexC0;
 						for(size_t c=cStart; c<cEnd; c++) // component loop: 0 - real, 1 - imaginary
 						{
 							std::string tail;
@@ -1010,27 +1015,58 @@ namespace StockhamGenerator
 								}
 							}
 
+							bufOffset.clear();
+							bufOffset += offset; bufOffset += " + ( "; 
+							if( (numButterfly * workGroupSize) > algLS )
+							{
+								bufOffset += "(("; bufOffset += SztToStr(numButterfly);
+								bufOffset += "*me + "; bufOffset += SztToStr(butterflyIndex); bufOffset += ")/";
+								bufOffset += SztToStr(algLS); bufOffset += ")*"; bufOffset += SztToStr(algL); bufOffset += " + (";
+								bufOffset += SztToStr(numButterfly); bufOffset += "*me + "; bufOffset += SztToStr(butterflyIndex);
+								bufOffset += ")%"; bufOffset += SztToStr(algLS); bufOffset += " + ";
+							}
+							else
+							{
+								bufOffset += SztToStr(numButterfly); bufOffset += "*me + "; bufOffset += SztToStr(butterflyIndex);
+								bufOffset += " + ";
+							}
+							bufOffset += SztToStr(r*algLS); bufOffset += " )*"; bufOffset += SztToStr(stride);
+
+							if(scale != 1.0f) { regIndex += " * "; regIndex += FloatToStr(scale); regIndex += FloatSuffix<PR>(); }
+							if (c == cStart)	regIndexC0 = regIndex;
+
+							if (fft_doPostCallback && !r2c)
+							{
+								if (interleaved || c == (cEnd - 1))
+								{
 							passStr += "\n\t";
-							passStr += buffer; passStr += "["; passStr += offset; passStr += " + ( ";
+									passStr += fft_postCallback.funcname; passStr += "(";
 
-							if( (numButterfly * workGroupSize) > algLS )
+									if (interleaved || (c2r && bufferRe.compare(bufferIm) == 0))
 							{
-								passStr += "(("; passStr += SztToStr(numButterfly);
-								passStr += "*me + "; passStr += SztToStr(butterflyIndex); passStr += ")/";
-								passStr += SztToStr(algLS); passStr += ")*"; passStr += SztToStr(algL); passStr += " + (";
-								passStr += SztToStr(numButterfly); passStr += "*me + "; passStr += SztToStr(butterflyIndex);
-								passStr += ")%"; passStr += SztToStr(algLS); passStr += " + ";
+										passStr += buffer;
 							}
 							else
 							{
-								passStr += SztToStr(numButterfly); passStr += "*me + "; passStr += SztToStr(butterflyIndex);
-								passStr += " + ";
+										passStr += bufferRe; passStr += ", "; passStr += bufferIm;
 							}
+									passStr += ", ";
+									passStr += bufOffset; passStr += ", post_userdata, ("; passStr += regIndexC0; passStr += ")";
+									if (!(interleaved || (c2r && bufferRe.compare(bufferIm) == 0))) { passStr += ", ("; passStr += regIndex; passStr += ")"; }
 
-							passStr += SztToStr(r*algLS); passStr += " )*"; passStr += SztToStr(stride); passStr += "]";
-							passStr += tail; passStr += " = "; passStr += regIndex;
-							if(scale != 1.0f) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix<PR>(); }
-							passStr += ";";
+									if (fft_postCallback.localMemSize > 0)
+									{
+										passStr += ", post_localmem";
+									}
+									passStr += ");";
+								}
+							}
+							else
+							{	
+								passStr += "\n\t";
+								passStr += buffer; passStr += "["; passStr += bufOffset; passStr += "]";
+								passStr += tail; passStr += " = "; passStr += regIndex; passStr += ";";
+							}
 
 							// Since we write real & imag at once, we break the loop
 							if(interleaved && (component == SR_COMP_BOTH))
@@ -1114,7 +1150,7 @@ namespace StockhamGenerator
 								}
 
 								//get offset 
-								std::string bufOffset;
+								bufOffset.clear();
 								bufOffset += offset; bufOffset += " + ( "; bufOffset += SztToStr(numPrev); bufOffset += " + ";
 								bufOffset += "me*"; bufOffset += SztToStr(numButterfly); bufOffset += " + ";
 								bufOffset += SztToStr(i*regC + v); bufOffset += " + ";
@@ -1142,7 +1178,7 @@ namespace StockhamGenerator
 									{
 										passStr += bufferRe; passStr += ", "; passStr += bufferIm; passStr += ", ";
 									}
-									passStr += bufOffset; passStr += ", userdata";
+									passStr += bufOffset; passStr += ", pre_userdata";
 									if (fft_preCallback.localMemSize > 0)
 									{
 										passStr += ", localmem";
@@ -1176,7 +1212,7 @@ namespace StockhamGenerator
 									else if (r2c)
 									{
 										passStr += fft_preCallback.funcname; passStr += "("; passStr += buffer; passStr += ", ";
-										passStr += bufOffset; passStr += ", userdata";
+										passStr += bufOffset; passStr += ", pre_userdata";
 
 										if (fft_preCallback.localMemSize > 0)
 										{
@@ -1344,6 +1380,8 @@ namespace StockhamGenerator
 							if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i == 0))
 								passStr += "\n\t}\n\tif( rw && !me)\n\t{";
 
+							std::string regIndexC0;
+								
 							for(size_t c=cStart; c<cEnd; c++) // component loop: 0 - real, 1 - imaginary
 							{
 								std::string tail;
@@ -1384,26 +1422,66 @@ namespace StockhamGenerator
 								}
 
 								passStr += "\n\t";
-								passStr += buffer; passStr += "["; passStr += offset; passStr += " + ( ";
 
+								if(scale != 1.0f) { regIndex += " * "; regIndex += FloatToStr(scale); regIndex += FloatSuffix<PR>(); }
+								if (c == 0) regIndexC0 += regIndex;
+
+								bufOffset.clear();
+								bufOffset += offset; bufOffset += " + ( ";
 								if( (numButterfly * workGroupSize) > algLS )
 								{
-									passStr += "(("; passStr += SztToStr(numButterfly);
-									passStr += "*me + "; passStr += SztToStr(butterflyIndex); passStr += ")/";
-									passStr += SztToStr(algLS); passStr += ")*"; passStr += SztToStr(algL); passStr += " + (";
-									passStr += SztToStr(numButterfly); passStr += "*me + "; passStr += SztToStr(butterflyIndex);
-									passStr += ")%"; passStr += SztToStr(algLS); passStr += " + ";
+									bufOffset += "(("; bufOffset += SztToStr(numButterfly);
+									bufOffset += "*me + "; bufOffset += SztToStr(butterflyIndex); bufOffset += ")/";
+									bufOffset += SztToStr(algLS); bufOffset += ")*"; bufOffset += SztToStr(algL); bufOffset += " + (";
+									bufOffset += SztToStr(numButterfly); bufOffset += "*me + "; bufOffset += SztToStr(butterflyIndex);
+									bufOffset += ")%"; bufOffset += SztToStr(algLS); bufOffset += " + ";
 								}
 								else
 								{
-									passStr += SztToStr(numButterfly); passStr += "*me + "; passStr += SztToStr(butterflyIndex);
-									passStr += " + ";
+									bufOffset += SztToStr(numButterfly); bufOffset += "*me + "; bufOffset += SztToStr(butterflyIndex);
+									bufOffset += " + ";
 								}
 
-								passStr += SztToStr(r*algLS); passStr += " )*"; passStr += SztToStr(stride); passStr += "]";
+								bufOffset += SztToStr(r*algLS); bufOffset += " )*"; bufOffset += SztToStr(stride); 
+								
+								if (fft_doPostCallback)
+								{
+									if(interleaved && (component == SR_COMP_BOTH))
+									{
+										if (c == (cEnd - 1))
+										{
+											passStr += "tempC.x = "; passStr += regIndexC0; passStr += ";\n\t";
+											passStr += "tempC.y = "; passStr += regIndex; passStr += ";\n\t";
+
+											passStr += fft_postCallback.funcname; passStr += "("; 
+											passStr += buffer; passStr += ", (";
+											passStr += bufOffset; passStr += "), post_userdata, tempC";
+											if (fft_postCallback.localMemSize > 0)
+											{
+												passStr += ", post_localmem";
+											}
+											passStr += ");";
+										}
+									}
+									else if (c == (cEnd - 1))
+									{
+										passStr += fft_postCallback.funcname; passStr += "("; 
+										passStr += bufferRe; passStr += ", "; passStr += bufferIm; passStr += ", (";
+										passStr += bufOffset; passStr += "), post_userdata, ("; 
+										passStr += regIndexC0; passStr += "), ("; passStr += regIndex; passStr += ")";
+										if (fft_postCallback.localMemSize > 0)
+										{
+											passStr += ", post_localmem";
+										}
+										passStr += ");";
+									}
+								}
+								else
+								{
+									passStr += buffer; passStr += "["; passStr += bufOffset; passStr += "]";
 								passStr += tail; passStr += " = "; passStr += regIndex;
-								if(scale != 1.0f) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix<PR>(); }
 								passStr += ";";
+								}
 
 								// Since we write real & imag at once, we break the loop
 								if(interleaved && (component == SR_COMP_BOTH) && linearRegs)
@@ -1468,6 +1546,8 @@ namespace StockhamGenerator
 
 			for(size_t r=rStart; r<rEnd; r++)
 			{
+				std::string val1StrExt;
+
 				for(size_t c=cStart; c<cEnd; c++) // component loop: 0 - real, 1 - imaginary
 				{
 					if(flag == SR_READ) // read operation
@@ -1581,7 +1661,7 @@ namespace StockhamGenerator
 								
 								if (fft_doPreCallback)
 								{
-									passStr += ", userdata";
+									passStr += ", pre_userdata";
 									passStr += (fft_preCallback.localMemSize > 0) ? ", localmem);" : ");";
 								}
 								else
@@ -1677,10 +1757,30 @@ namespace StockhamGenerator
 
 								std::string val1Str, val2Str;
 
+								if (fft_doPostCallback && !rcFull)
+								{
+									if (interleaved)
+									{
+										val1Str += "\n\t";
+										val1Str += fft_postCallback.funcname; val1Str += "("; val1Str += buffer; val1Str += ", ";
+										val1Str += offset; val1Str += " + ( "; val1Str += idxStr; val1Str += " )*"; val1Str += SztToStr(stride);
+										val1Str += ", post_userdata, ";
+									}
+									else if (c == 0)
+									{
+										val1StrExt += "\n\t";
+										val1StrExt += fft_postCallback.funcname; val1StrExt += "("; val1StrExt += bufferRe; val1StrExt += ", ";
+										val1StrExt += bufferIm; val1StrExt += ", "; val1StrExt += offset; val1StrExt += " + ( "; val1StrExt += idxStr; 
+										val1StrExt += " )*"; val1StrExt += SztToStr(stride); val1StrExt += ", post_userdata, ";
+									}									
+								}
+								else
+								{
 								val1Str += "\n\t";
 								val1Str += buffer; val1Str += "["; val1Str += offset; val1Str += " + ( ";
 								val1Str += idxStr; val1Str += " )*"; val1Str += SztToStr(stride); val1Str += "]";
 								val1Str += tail; val1Str += " = ";
+								}
 
 								val2Str += "\n\t";
 								val2Str += buffer; val2Str += "["; val2Str += offset; val2Str += " + ( ";
@@ -1732,7 +1832,29 @@ namespace StockhamGenerator
 								val1Str += sclStr;
 								val2Str += sclStr;
 
-												passStr += val1Str; passStr += ";";
+								if (fft_doPostCallback && !rcFull) 
+								{
+									if (!interleaved) 
+									{
+										val1StrExt += val1Str;
+										val1Str.clear();
+
+										if(c == 0) val1StrExt += ", ";
+										else	val1Str += val1StrExt;
+									}
+
+									if (interleaved || c == (cEnd - 1))
+									{
+										if (fft_postCallback.localMemSize > 0)	val1Str += ", localmem";
+										val1Str += ");";
+									}
+								}
+								else
+								{
+									val1Str += ";";
+								}
+
+												passStr += val1Str; 
 								if(rcFull)	{	passStr += val2Str; passStr += ";"; }
 							}
 							else
@@ -1847,7 +1969,7 @@ namespace StockhamGenerator
 			r2c(r2cVal), c2r(c2rVal), rcFull(rcFullVal), rcSimple(rcSimpleVal), realSpecial(realSpecialVal),
 			enableGrouping(true),
 			numB1(0), numB2(0), numB4(0),
-			nextPass(NULL), fft_doPreCallback(false)
+			nextPass(NULL), fft_doPreCallback(false), fft_doPostCallback(false)
 		{
 			assert(radix <= length);
 			assert(length%radix == 0);
@@ -1897,6 +2019,12 @@ namespace StockhamGenerator
 			fft_preCallback = precallbackParam;
 		}
 
+		void SetPostcallback(bool hasPostcallback, clfftCallbackParam postcallbackParam) // Records the post-callback enable flag and its parameters on this Pass object
+		{ 
+			fft_doPostCallback = hasPostcallback; // tested by the register-write code generation to emit a post-callback call instead of a direct buffer store
+			fft_postCallback = postcallbackParam; // provides funcname and localMemSize used when the generated call is built
+		}
+
 		void GeneratePass(	bool fwd, std::string &passStr, bool fft_3StepTwiddle,
 							bool inInterleaved, bool outInterleaved,
 							bool inReal, bool outReal,
@@ -2058,7 +2186,9 @@ namespace StockhamGenerator
 				passStr += ", "; passStr += IterRegArgs();
 			}
 
-			//Include callback parameters if callback is set
+			if (fft_doPreCallback || fft_doPostCallback)
+			{
+				//Include pre-callback parameters if pre-callback is set
 			if (fft_doPreCallback )
 			{
 				if ((r2c && !rcSimple) || c2r)
@@ -2066,12 +2196,27 @@ namespace StockhamGenerator
 					passStr += ", uint inOffset2";
 				}
 
-				passStr += ", __global void* userdata";
+					passStr += ", __global void* pre_userdata";
+				}
 
-				if (fft_preCallback.localMemSize > 0)
+				//Include post-callback parameters if post-callback is set
+				if (fft_doPostCallback )
+				{
+					if (r2c || (c2r && !rcSimple))
+					{
+						passStr += ", uint outOffset2";
+					}
+					passStr += ", __global void* post_userdata";
+				}
+
+				if (fft_doPreCallback && fft_preCallback.localMemSize > 0)
 				{
 					passStr += ", __local void* localmem";
 				}
+				if (fft_doPostCallback && fft_postCallback.localMemSize > 0)
+				{
+					passStr += ", __local void* post_localmem";
+				}
 			}
 
 			passStr += ")\n{\n";
@@ -2169,7 +2314,7 @@ namespace StockhamGenerator
 					{
 						passStr += fft_preCallback.funcname; passStr += "("; passStr += bufferInRe; 
 						if (!inInterleaved) { passStr += ", "; passStr += bufferInIm; }
-						passStr += ", inOffset, userdata";
+						passStr += ", inOffset, pre_userdata";
 						passStr += fft_preCallback.localMemSize > 0 ? ", localmem)" : ")";
 					}
 					else
@@ -2259,7 +2404,7 @@ namespace StockhamGenerator
 					{
 						passStr += fft_preCallback.funcname; passStr += "("; passStr += bufferInRe2; 
 						if (!inInterleaved) { passStr += ", "; passStr += bufferInIm2; }
-						passStr += ", inOffset2, userdata";
+						passStr += ", inOffset2, pre_userdata";
 						passStr += fft_preCallback.localMemSize > 0 ? ", localmem)" : ")";
 					}
 					else
@@ -2428,16 +2573,44 @@ namespace StockhamGenerator
 							passStr += "\n\tif(rw && !me)\n\t{\n\t";
 							if(outInterleaved)
 							{
+								if (fft_doPostCallback)
+								{
+									passStr += fft_postCallback.funcname; passStr += "(bufOut, outOffset, post_userdata, ";
+									passStr += "("; passStr += RegBaseType<PR>(2); passStr += ") ( ("; passStr += bufferInRe; passStr += "[inOffset]";
+									if(scale != 1.0) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix<PR>(); } passStr += ") , 0 )";
+									if (fft_postCallback.localMemSize > 0)
+									{
+										passStr += ", localmem";
+									}
+									passStr += ");\n\t}";
+								}
+								else
+								{
 								passStr += bufferOutRe; passStr+= "[outOffset].x = "; passStr += bufferInRe; passStr += "[inOffset]";
 								if(scale != 1.0) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix<PR>(); } passStr += ";\n\t";
 								passStr += bufferOutIm; passStr+= "[outOffset].y = "; passStr += "0;\n\t}";
 							}
+							}
+							else
+							{
+								if (fft_doPostCallback)
+								{
+									passStr += fft_postCallback.funcname; passStr += "("; passStr += bufferOutRe; passStr += ", "; passStr += bufferOutIm;
+									passStr += ", outOffset, post_userdata, "; passStr += bufferInRe; passStr += "[inOffset]";
+									if(scale != 1.0) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix<PR>(); } passStr += ", 0";
+									if (fft_postCallback.localMemSize > 0)
+									{
+										passStr += ", localmem";
+									}
+									passStr += ");\n\t}";
+								}
 							else
 							{
 								passStr += bufferOutRe; passStr+= "[outOffset] = ";   passStr += bufferInRe; passStr += "[inOffset]";
 								if(scale != 1.0) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix<PR>(); } passStr += ";\n\t";
 								passStr += bufferOutIm; passStr+= "[outOffset] = ";   passStr += "0;\n\t}";
 							}
+							}
 							passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
 
 
@@ -2454,16 +2627,44 @@ namespace StockhamGenerator
 							passStr += "\n\tif((rw > 1) && !me)\n\t{\n\t";
 							if(outInterleaved)
 							{
+								if (fft_doPostCallback)
+								{
+									passStr += fft_postCallback.funcname; passStr += "(bufOut2, outOffset2, post_userdata, ";
+									passStr += "("; passStr += RegBaseType<PR>(2); passStr += ") ( ("; passStr += bufferInIm; passStr += "[inOffset]";
+									if(scale != 1.0) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix<PR>(); } passStr += ") , 0 )";
+									if (fft_postCallback.localMemSize > 0)
+									{
+										passStr += ", localmem";
+									}
+									passStr += ");\n\t}";
+								}
+								else
+								{
 								passStr += bufferOutRe2; passStr+= "[outOffset].x = "; passStr += bufferInIm; passStr += "[inOffset]";
 								if(scale != 1.0) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix<PR>(); } passStr += ";\n\t";
 								passStr += bufferOutIm2; passStr+= "[outOffset].y = "; passStr += "0;\n\t}";
 							}
+							}
+							else
+							{
+								if (fft_doPostCallback)
+								{
+									passStr += fft_postCallback.funcname; passStr += "("; passStr += bufferOutRe2; passStr += ", "; passStr += bufferOutIm2;
+									passStr+= ", outOffset2, post_userdata, "; passStr += bufferInIm; passStr += "[inOffset]";
+									if(scale != 1.0) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix<PR>(); } passStr += ", 0";
+									if (fft_postCallback.localMemSize > 0)
+									{
+										passStr += ", localmem";
+									}
+									passStr += ");\n\t}";
+								}
 							else
 							{
 								passStr += bufferOutRe2; passStr+= "[outOffset] = ";   passStr += bufferInIm; passStr += "[inOffset]";
 								if(scale != 1.0) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix<PR>(); } passStr += ";\n\t";
 								passStr += bufferOutIm2; passStr+= "[outOffset] = ";   passStr += "0;\n\t}";
 							}
+							}
 							passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
 						}
 
@@ -2480,13 +2681,18 @@ namespace StockhamGenerator
 						}
 
 						passStr += "\n\n\tif(rw > 1)\n\t{";
-						SweepRegsRC(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, true, false, bufferOutRe2, bufferOutIm2, "outOffset", passStr);
+						
+						std::string outOffset;
+						outOffset += "outOffset";
+						if (fft_doPostCallback) outOffset += "2";
+
+						SweepRegsRC(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, true, false, bufferOutRe2, bufferOutIm2, outOffset, passStr);
 						passStr += "\n\t}\n";
 						if(oddp)
 						{
 							passStr += "\n\n\tbrv = ((rw > 1) & (me%2 == 1));\n\t";
 							passStr += "if(brv)\n\t{";
-							SweepRegsRC(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, true, true, bufferOutRe2, bufferOutIm2, "outOffset", passStr);
+							SweepRegsRC(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, true, true, bufferOutRe2, bufferOutIm2, outOffset, passStr);
 							passStr += "\n\t}\n";
 						}
 
@@ -2499,8 +2705,12 @@ namespace StockhamGenerator
 
 						if(!rcSimple)
 						{
+							std::string outOffset;
+							outOffset += "outOffset";
+							if (fft_doPostCallback) outOffset += "2";
+
 							passStr += "\n\tif(rw > 1)\n\t{";
-							SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, false, bufferOutRe2, bufferOutIm2, "outOffset", 1, numB1, 0, passStr);
+							SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, false, bufferOutRe2, bufferOutIm2, outOffset, 1, numB1, 0, passStr);
 							passStr += "\n\t}\n";
 						}
 					}
@@ -2533,6 +2743,10 @@ namespace StockhamGenerator
 			}
 			else
 			{
+				if (fft_doPostCallback && outInterleaved)
+				{
+					passStr += "\n\t"; passStr += regB2Type; passStr += " tempC;";
+				}
 				passStr += "\n\tif(rw)\n\t{";
 				SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
 				SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, bufferOutRe, bufferOutIm, "outOffset", 2, numB2, numB1, passStr);
@@ -2790,6 +3004,13 @@ namespace StockhamGenerator
 						passes[0].SetPrecallback(params.fft_hasPreCallback, params.fft_preCallback);
 					}
 
+					//Pass post-callback information to Pass object if it's the last pass. 
+					//This will be used in single kernel transforms
+					if (params.fft_hasPostCallback && i == (nPasses - 1) && !params.blockCompute)
+					{
+						passes[i].SetPostcallback(params.fft_hasPostCallback, params.fft_postCallback);
+					}
+
 					LS *= rad;
 				}
 				assert(R == 1); // this has to be true for correct radix composition of the length
@@ -2843,6 +3064,13 @@ namespace StockhamGenerator
 						break;
 				}
 				numPasses = pid;
+
+				//Pass post-callback information to Pass object if it's the last pass. 
+				//This will be used in single kernel transforms
+				if (params.fft_hasPostCallback)
+				{
+					passes[numPasses - 1].SetPostcallback(params.fft_hasPostCallback, params.fft_postCallback);
+				}
 			}
 
 			assert(numPasses == passes.size());
@@ -3045,13 +3273,21 @@ namespace StockhamGenerator
 			std::string callbackstr;
 			if (params.fft_hasPreCallback)
 			{
-				//Insert callback function code at the beginning 
+				//Insert pre-callback function code at the beginning 
 				callbackstr += params.fft_preCallback.funcstring;
 				callbackstr += "\n\n";
 
 				str += callbackstr;
 			}
 
+			//If post-callback is set for the plan
+			if (params.fft_hasPostCallback)
+			{
+				//Insert post-callback function code 
+				str += params.fft_postCallback.funcstring;
+				str += "\n\n";
+			}
+
 			typename std::vector< Pass<PR> >::const_iterator p;
 			if(length > 1)
 			{
@@ -3167,17 +3403,24 @@ namespace StockhamGenerator
 
         delete [] nameVendor;
 
-		//If plan has pre-callback
+		//If plan has pre/post callback
 		callbackstr.clear();
+		bool hasCallback = params.fft_hasPreCallback || params.fft_hasPostCallback;
+
+		if (hasCallback)
+		{
 		if (params.fft_hasPreCallback)
 		{
-			if (params.fft_preCallback.localMemSize > 0)
+				callbackstr += ", __global void* pre_userdata";
+			}
+			if (params.fft_hasPostCallback)
 			{
-				callbackstr += ", __global void* userdata, __local void* localmem";
+				callbackstr += ", __global void* post_userdata";
 			}
-			else
+
+			if (params.fft_preCallback.localMemSize > 0 || params.fft_postCallback.localMemSize > 0)
 			{
-				callbackstr += ", __global void* userdata";
+				callbackstr += ", __local void* localmem";
 			}
 		}
 
@@ -3195,8 +3438,8 @@ namespace StockhamGenerator
 							str += "__global "; str += rType; str += " * restrict gb";
 						}
 
-						//If plan has pre-callback
-						if (params.fft_hasPreCallback)
+						//If plan has callback
+						if (hasCallback)
 						{
 							str += callbackstr;
 						}
@@ -3213,8 +3456,8 @@ namespace StockhamGenerator
 						{
 							str += "__global "; str += r2Type; str += " * restrict gb";
 
-							//If plan has pre-callback
-							if (params.fft_hasPreCallback)
+							//If plan has callback
+							if (hasCallback)
 							{
 								str += callbackstr;
 							}
@@ -3226,8 +3469,8 @@ namespace StockhamGenerator
 							str += "__global "; str += rType; str += " * restrict gbRe, ";
 							str += "__global "; str += rType; str += " * restrict gbIm";
 
-							//If plan has pre-callback
-							if (params.fft_hasPreCallback)
+							//If plan has callback
+							if (hasCallback)
 							{
 								str += callbackstr;
 							}
@@ -3268,8 +3511,8 @@ namespace StockhamGenerator
 							str += "__global const "; str += rType; str += " * restrict gbOutIm";
 						}
 
-						//If plan has pre-callback
-						if (params.fft_hasPreCallback)
+						//If plan has callback
+						if (hasCallback)
 						{
 							str += callbackstr;
 						}
@@ -3298,8 +3541,8 @@ namespace StockhamGenerator
 							str += "__global "; str += rType; str += " * restrict gbOutIm";
 						}
 
-						//If plan has pre-callback
-						if (params.fft_hasPreCallback)
+						//If plan has callback
+						if (hasCallback)
 						{
 							str += callbackstr;
 						}
@@ -3375,24 +3618,31 @@ namespace StockhamGenerator
 
 					if(outInterleaved)
 					{
+						if (!params.fft_hasPostCallback)	
+						{ 
 						if(!rcSimple)	{	str += "__global "; str += r2Type; str += " *lwbOut2;\n\t"; }
-											str += "__global "; str += r2Type; str += " *lwbOut;\n\n";
-
+												str += "__global "; str += r2Type; str += " *lwbOut;\n";
+						}
 					}
 					else if(outReal)
 					{
+						if (!params.fft_hasPostCallback)	
+						{
 						if(!rcSimple)	{	str += "__global "; str += rType; str += " *lwbOut2;\n\t"; }
-											str += "__global "; str += rType; str += " *lwbOut;\n\n";
-
+												str += "__global "; str += rType; str += " *lwbOut;\n";
+						}
 					}
 					else
 					{
+						if (!params.fft_hasPostCallback)	
+						{
 						if(!rcSimple)	{	str += "__global "; str += rType; str += " *lwbOutRe2;\n\t"; }
 						if(!rcSimple)	{	str += "__global "; str += rType; str += " *lwbOutIm2;\n\t"; }
 											str += "__global "; str += rType; str += " *lwbOutRe;\n\t";
-											str += "__global "; str += rType; str += " *lwbOutIm;\n\n";
-
+												str += "__global "; str += rType; str += " *lwbOutIm;\n";
+						}
 					}
+					str += "\n";
 				}
 				else
 				{
@@ -3400,27 +3650,28 @@ namespace StockhamGenerator
 					{
 						str += "uint ioOffset;\n\t";
 
-						//Skip if precallback is set and its blockcompute
-						if (!(blockCompute && params.fft_hasPreCallback))
+						//Skip only if both pre- and post-callback are set (in-place lwb serves as input and output pointer)
+						if (!params.fft_hasPreCallback || !params.fft_hasPostCallback)
 						{
 							if(inInterleaved)
 							{
-								str += "__global "; str += r2Type; str += " *lwb;\n\n";
+								str += "__global "; str += r2Type; str += " *lwb;\n";
 							}
 							else
 							{
 								str += "__global "; str += rType; str += " *lwbRe;\n\t";
-								str += "__global "; str += rType; str += " *lwbIm;\n\n";
+								str += "__global "; str += rType; str += " *lwbIm;\n";
 							}
 						}
+						str += "\n";
 					}
 					else
 					{
 						str += "uint iOffset;\n\t";
 						str += "uint oOffset;\n\t";
 
-						//Skip if precallback is set and its blockcompute
-						if (!(blockCompute && params.fft_hasPreCallback))
+						//Skip if precallback is set 
+						if (!(params.fft_hasPreCallback))
 						{
 							if(inInterleaved)
 							{
@@ -3433,15 +3684,20 @@ namespace StockhamGenerator
 							}
 						}
 
+						//Skip if postcallback is set 
+						if (!params.fft_hasPostCallback)
+						{
 						if(outInterleaved)
 						{
-							str += "__global "; str += r2Type; str += " *lwbOut;\n\n";
+								str += "__global "; str += r2Type; str += " *lwbOut;\n";
 						}
 						else
 						{
 							str += "__global "; str += rType; str += " *lwbOutRe;\n\t";
-							str += "__global "; str += rType; str += " *lwbOutIm;\n\n";
+								str += "__global "; str += rType; str += " *lwbOutIm;\n";
+							}
 						}
+						str += "\n";
 					}
 				}
 
@@ -3536,9 +3792,12 @@ namespace StockhamGenerator
 							}
 						}
 
+						if(!params.fft_hasPostCallback)
+						{
 						if(!rcSimple) {	str += "lwbOut2 = gb + oOffset2;\n\t"; }
-										str += "lwbOut = gb + oOffset;\n\n";
-
+											str += "lwbOut = gb + oOffset;\n";
+						}
+						str += "\n";
 					}
 					else
 					{
@@ -3558,19 +3817,24 @@ namespace StockhamGenerator
 							}
 						}
 
+						if (!params.fft_hasPostCallback)
+						{
 						if(outInterleaved || outReal)
 						{
 							if(!rcSimple) {	str += "lwbOut2 = gbOut + oOffset2;\n\t"; }
-											str += "lwbOut = gbOut + oOffset;\n\n";
+												str += "lwbOut = gbOut + oOffset;\n";
 						}
 						else
 						{
+								
 							if(!rcSimple) {	str += "lwbOutRe2 = gbOutRe + oOffset2;\n\t"; }
 							if(!rcSimple) {	str += "lwbOutIm2 = gbOutIm + oOffset2;\n\t"; }
 											str += "lwbOutRe = gbOutRe + oOffset;\n\t";
-											str += "lwbOutIm = gbOutIm + oOffset;\n\n";
+												str += "lwbOutIm = gbOutIm + oOffset;\n";
 						}
 					}
+						str += "\n";
+					}
 				}
 				else
 				{
@@ -3583,19 +3847,20 @@ namespace StockhamGenerator
 
 						str += "\t";
 
-						//Skip if precallback is set and its blockcompute
-						if (!(blockCompute && params.fft_hasPreCallback))
+						//Skip only if both pre- and post-callback are set (in-place lwb serves as input and output pointer)
+						if (!params.fft_hasPreCallback || !params.fft_hasPostCallback)
 						{
 							if(inInterleaved)
 							{
-								str += "lwb = gb + ioOffset;\n\n";
+								str += "lwb = gb + ioOffset;\n";
 							}
 							else
 							{
 								str += "lwbRe = gbRe + ioOffset;\n\t";
-								str += "lwbIm = gbIm + ioOffset;\n\n";
+								str += "lwbIm = gbIm + ioOffset;\n";
 							}
 						}
+						str += "\n";
 					}
 					else
 					{
@@ -3612,8 +3877,8 @@ namespace StockhamGenerator
 
 						str += "\t";
 
-						//Skip if precallback is set and its blockcompute
-						if (!(blockCompute && params.fft_hasPreCallback))
+						//Skip if precallback is set 
+						if (!(params.fft_hasPreCallback))
 						{
 							if(inInterleaved)
 							{
@@ -3626,26 +3891,34 @@ namespace StockhamGenerator
 							}
 						}
 
+						//Skip if postcallback is set 
+						if (!params.fft_hasPostCallback)
+						{
 						if(outInterleaved)
 						{
-							str += "lwbOut = gbOut + oOffset;\n\n";
+								str += "lwbOut = gbOut + oOffset;\n";
 						}
 						else
 						{
 							str += "lwbOutRe = gbOutRe + oOffset;\n\t";
-							str += "lwbOutIm = gbOutIm + oOffset;\n\n";
+								str += "lwbOutIm = gbOutIm + oOffset;\n";
+							}
 						}
+						str += "\n";
 					}
 				}
 
 				std::string inOffset;
+				std::string outOffset;
 				if (params.fft_placeness == CLFFT_INPLACE && !r2c2r)
 				{
 					inOffset += "ioOffset";
+					outOffset += "ioOffset";
 				}
 				else
 				{
 					inOffset += "iOffset";
+					outOffset += "oOffset";
 				}
 				
 				// Read data into LDS for blocked access
@@ -3695,7 +3968,7 @@ namespace StockhamGenerator
 										str += (params.fft_placeness == CLFFT_INPLACE) ? "gbRe, gbIm, " : "gbInRe, gbInIm, ";
 									}
 
-									str += inOffset; str += " + "; str += bufOffset; str += ", userdata";
+									str += inOffset; str += " + "; str += bufOffset; str += ", pre_userdata";
 									str += (params.fft_preCallback.localMemSize > 0) ? str += ", localmem);\n" : ");\n";
 								}
 
@@ -3752,7 +4025,7 @@ namespace StockhamGenerator
 					{
 						if(inInterleaved || inReal)		inBuf  = params.fft_hasPreCallback ?  "gbIn, " : "lwbIn, ";
 						else							inBuf  = "lwbInRe, lwbInIm, ";
-						if(outInterleaved || outReal)	outBuf = "lwbOut";
+						if(outInterleaved || outReal)	outBuf = params.fft_hasPostCallback ? "gbOut" : "lwbOut";
 						else							outBuf = "lwbOutRe, lwbOutIm";
 					}
 					else
@@ -3778,23 +4051,31 @@ namespace StockhamGenerator
 						}
 						else							inBuf  = (params.fft_hasPreCallback) ? "gbInRe, gbInRe, gbInIm, gbInIm, " : "lwbInRe, lwbInRe2, lwbInIm, lwbInIm2, ";
 
-						if(outInterleaved || outReal)	outBuf = "lwbOut, lwbOut2";
-						else							outBuf = "lwbOutRe, lwbOutRe2, lwbOutIm, lwbOutIm2";
+						if(outInterleaved || outReal)	outBuf = params.fft_hasPostCallback ? ((params.fft_placeness == CLFFT_INPLACE) ? "gb, gb" : "gbOut, gbOut") : "lwbOut, lwbOut2";
+						else							outBuf = params.fft_hasPostCallback ? "gbOutRe, gbOutRe, gbOutIm, gbOutIm" : "lwbOutRe, lwbOutRe2, lwbOutIm, lwbOutIm2";
 					}
 				}
 				else
 				{
 					if(params.fft_placeness == CLFFT_INPLACE)
 					{
-						if(inInterleaved)	{ inBuf = "gb, "; outBuf = "lwb"; }
-						else				{ inBuf = "gbRe, gbIm, "; outBuf = "lwbRe, lwbIm"; }
+						if(inInterleaved)	
+						{
+							inBuf = params.fft_hasPreCallback ? "gb, " : "lwb, ";
+							outBuf = params.fft_hasPostCallback ? "gb" : "lwb"; 
+						}
+						else	
+						{ 
+							inBuf = params.fft_hasPreCallback ? "gbRe, gbIm, " : "lwbRe, lwbIm, ";
+							outBuf = params.fft_hasPostCallback ? "gbRe, gbIm" : "lwbRe, lwbIm"; 
+						}
 					}
 					else
 					{
-						if(inInterleaved)	inBuf  = "gbIn, ";
-						else				inBuf  = "gbInRe, gbInIm, ";
-						if(outInterleaved)	outBuf = "lwbOut";
-						else				outBuf = "lwbOutRe, lwbOutIm";
+						if(inInterleaved)	inBuf  = params.fft_hasPreCallback ? "gbIn, " : "lwbIn, ";
+						else				inBuf  = params.fft_hasPreCallback ? "gbInRe, gbInIm, " : "lwbInRe, lwbInIm, ";
+						if(outInterleaved)	outBuf = params.fft_hasPostCallback ? "gbOut" : "lwbOut";
+						else				outBuf = params.fft_hasPostCallback ? "gbOutRe, gbOutIm" : "lwbOutRe, lwbOutIm";
 					}
 				}
 
@@ -3827,22 +4108,55 @@ namespace StockhamGenerator
 					str += PassName(0, fwd);
 					str += "("; str += rw; str += me;
 					
-					str += (params.fft_hasPreCallback || !r2c2r) ? inOffset : "0";
+					str += (params.fft_hasPreCallback) ? inOffset : "0";
 					
+					if (params.fft_hasPostCallback)
+					{
+						str += ", "; str += outOffset; str += ", ";
+					}
+					else
+					{
 					str += ", 0, ";
+					}
+
 					str += inBuf; str += outBuf;
 					str += IterRegs("&");
 
-					//if precalback set 
+					//If callback is set
+					if (hasCallback)
+					{
+						//if pre-callback set 
 					if (params.fft_hasPreCallback)
 					{
-						str += (r2c2r && !rcSimple) ?  ", iOffset2, userdata" : ", userdata";
+							str += (r2c2r && !rcSimple) ?  ", iOffset2, pre_userdata" : ", pre_userdata";
+						}
+
+						//if post-callback set 
+						if (params.fft_hasPostCallback)
+						{
+							if ((r2c || c2r) && !rcSimple) { str += ", "; str += outOffset; str += "2"; }
+
+							str += ", post_userdata";
+						}
 
 						if (params.fft_preCallback.localMemSize > 0)
 						{
 							str += ", localmem";
 						}
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							//if precallback localmem also requested, send the localmem with the right offset
+							if (params.fft_hasPreCallback && params.fft_preCallback.localMemSize > 0)
+							{
+								str += ", ((__local char *)localmem + "; str += SztToStr(params.fft_preCallback.localMemSize); str += ")";
+							}
+							else
+							{
+								str += ", localmem";
+							}
+						}
 					}
+
 					str += ");\n";
 				}
 				else
@@ -3892,7 +4206,7 @@ namespace StockhamGenerator
 							}
 							else
 							{
-								str += (params.fft_hasPreCallback || !r2c2r) ? inOffset : "0";
+								str += (params.fft_hasPreCallback) ? inOffset : "0";
 							}
 							str += ", ";
 							str += ldsOff;
@@ -3903,7 +4217,7 @@ namespace StockhamGenerator
 							//if precalback set, append additional arguments
 							if (!blockCompute && params.fft_hasPreCallback)
 							{
-								str += (r2c2r && !rcSimple) ?  ", iOffset2, userdata" : ", userdata";
+								str += (r2c2r && !rcSimple) ?  ", iOffset2, pre_userdata" : ", pre_userdata";
 
 								if (params.fft_preCallback.localMemSize > 0)
 								{
@@ -3918,11 +4232,41 @@ namespace StockhamGenerator
 						{
 							str += ldsOff;
 							str += ", ";
-							str += blockCompute ? ldsOff : "0";
+							if (blockCompute)
+							{
+								str += ldsOff;
+							}
+							else
+							{
+								str += (params.fft_hasPostCallback) ? outOffset : "0";
+							}
 							str += ", ";
 							str += ldsArgs; str += ", ";
 							str += outBuf;
-							str += IterRegs("&"); str += ");\n";
+
+							str += IterRegs("&"); 
+							
+							if (!blockCompute && params.fft_hasPostCallback)
+							{
+								if ((c2r || r2c) && !rcSimple) { str += ", "; str += outOffset; str += "2"; }
+
+								str += ", post_userdata";
+
+								if (params.fft_postCallback.localMemSize > 0)
+								{
+									//if precallback localmem also requested, send the localmem with the right offset
+									if (params.fft_hasPreCallback && params.fft_preCallback.localMemSize > 0)
+									{
+										str += ", ((__local char *)localmem + "; str += SztToStr(params.fft_preCallback.localMemSize); str += ")";
+									}
+									else
+									{
+										str += ", localmem";
+									}
+								}
+							}
+							str += ");\n";
+							
 							if (!halfLds) { str += exTab; str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n"; }
 						}
 						else // intermediate pass
@@ -3988,10 +4332,44 @@ namespace StockhamGenerator
 
 						if( (blockComputeType == BCT_C2C) || (blockComputeType == BCT_R2C) )
 						{
+							if (blockComputeType == BCT_R2C && params.fft_hasPostCallback)
+							{
+								if (outInterleaved)
+									writeBuf = (params.fft_placeness == CLFFT_INPLACE) ? "gb" : "gbOut";
+								else
+									writeBuf = (params.fft_placeness == CLFFT_INPLACE) ? "gbRe, gbIm" : "gbOutRe, gbOutIm";
+								
+								str += "\t\t"; str += params.fft_postCallback.funcname; str += "("; str += writeBuf; str += ", (";
+								str += outOffset; str += " + (me%"; str+= SztToStr(blockWidth); str += ") + ";
+								str += "(me/"; str+= SztToStr(blockWidth); str+= ")*"; str += SztToStr(params.fft_outStride[0]);
+								str += " + t*"; str += SztToStr(params.fft_outStride[0]*blockWGS/blockWidth); 
+								str += "), post_userdata, R0"; 
+								if (!outInterleaved) str += ".x, R0.y";
+
+								if (params.fft_postCallback.localMemSize > 0)
+								{
+									if (params.fft_hasPreCallback && params.fft_preCallback.localMemSize > 0)
+									{	//pre-callback localmem also requested: cast to __local char* first, then offset past the pre-callback region
+										str += ", ((__local char *)localmem + "; str += SztToStr(params.fft_preCallback.localMemSize); str += ")";
+									}
+									else
+									{
+										str += ", localmem";
+									}
+								}
+								str += ");\n";
+
+								//in the planar case, break from for loop since both real and imag components are handled
+								//together in post-callback
+								if (!outInterleaved) break;
+							}
+							else
+							{
 							str += "\t\t"; str += writeBuf; str += "[(me%"; str+= SztToStr(blockWidth); str += ") + ";
 							str += "(me/"; str+= SztToStr(blockWidth); str+= ")*"; str += SztToStr(params.fft_outStride[0]);
 							str += " + t*"; str += SztToStr(params.fft_outStride[0]*blockWGS/blockWidth); str += "] = R0"; str+= comp; str += ";\n";
 						}
+						}
 						else
 						{
 							str += "\t\t"; str += writeBuf; str += "[me + t*"; str += SztToStr(blockWGS); str += "] = R0"; str+= comp; str += ";\n";
@@ -4126,13 +4504,21 @@ clfftStatus FFTGeneratedStockhamAction::initParams ()
 	this->signature.fft_R = (nt * this->signature.fft_N[0])/wgs;
 	this->signature.fft_SIMD = wgs;
 
-	//Set callback if specified
+	//Set pre-callback if specified
 	if (this->plan->hasPreCallback)
 	{
 		this->signature.fft_hasPreCallback = true;
 		this->signature.fft_preCallback = this->plan->preCallback;
 	}
 
+	//Set post-callback if specified
+	if (this->plan->hasPostCallback)
+	{
+		this->signature.fft_hasPostCallback = true;
+		this->signature.fft_postCallback = this->plan->postCallbackParam;
+	}
+	this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
+
     if (this->plan->large1D != 0) {
         ARG_CHECK (this->signature.fft_N[0] != 0)
         ARG_CHECK ((this->plan->large1D % this->signature.fft_N[0]) == 0)
@@ -4237,12 +4623,20 @@ clfftStatus FFTGeneratedStockhamAction::generateKernel(FFTRepo& fftRepo, const c
 	}
 
 	//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
-	if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+	if ((this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0) || 
+		(this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0))
 	{
 		bool validLDSSize = false;
+		size_t requestedCallbackLDS = 0;
+
+		if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+			requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
+		if (this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0)
+			requestedCallbackLDS += this->signature.fft_postCallback.localMemSize;
+
 		if (this->plan->blockCompute)
 		{
-			validLDSSize = ((this->signature.blockLDS * this->plan->ElementSize()) +  this->signature.fft_preCallback.localMemSize) < this->plan->envelope.limit_LocalMemSize;
+			validLDSSize = ((this->signature.blockLDS * this->plan->ElementSize()) +  requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
 		}
 		else
 		{
@@ -4263,8 +4657,9 @@ clfftStatus FFTGeneratedStockhamAction::generateKernel(FFTRepo& fftRepo, const c
 			size_t ldsSize = halfLds ? length*numTrans : 2*length*numTrans;
 			size_t elementSize = ((this->signature.fft_precision == CLFFT_DOUBLE) || (this->signature.fft_precision == CLFFT_DOUBLE_FAST)) ? sizeof(double) : sizeof(float);
 
-			validLDSSize = ((ldsSize * elementSize) + this->signature.fft_preCallback.localMemSize) < this->plan->envelope.limit_LocalMemSize;
+			validLDSSize = ((ldsSize * elementSize) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
 		}
+
 		if(!validLDSSize)
 		{
 			fprintf(stderr, "Requested local memory size not available\n");
diff --git a/src/library/generator.stockham.h b/src/library/generator.stockham.h
index 792037d..10e9079 100644
--- a/src/library/generator.stockham.h
+++ b/src/library/generator.stockham.h
@@ -385,6 +385,8 @@ namespace StockhamGenerator
 			// Temporary variables
 			// Allocate temporary variables if we are not using complex registers (cReg = 0) or if cReg is true, then
 			// allocate temporary variables only for non power-of-2 radices
+			if (!(radix == 7 && cReg))
+			{
 			if( (radix & (radix-1)) || (!cReg) )
 			{
 				bflyStr += "\t";
@@ -414,6 +416,7 @@ namespace StockhamGenerator
 				bflyStr += "\t";
 				bflyStr += RegBaseType<PR>(2);
 				bflyStr += " T;";
+				}
 			}
 
 
diff --git a/src/library/generator.transpose.gcn.cpp b/src/library/generator.transpose.gcn.cpp
index 36c3d66..1f1f205 100644
--- a/src/library/generator.transpose.gcn.cpp
+++ b/src/library/generator.transpose.gcn.cpp
@@ -317,13 +317,29 @@ static clfftStatus genTransposePrototype( const FFTGeneratedTransposeGCNAction::
 
 	if (params.fft_hasPreCallback)
 	{
+		assert(!params.fft_hasPostCallback);
+
 		if (params.fft_preCallback.localMemSize > 0)
 		{
-			clKernWrite( transKernel, 0 ) << ", __global void* userdata, __local void* localmem";
+			clKernWrite( transKernel, 0 ) << ", __global void* pre_userdata, __local void* localmem";
+		}
+		else
+		{
+			clKernWrite( transKernel, 0 ) << ", __global void* pre_userdata";
+		}
+	}
+
+	if (params.fft_hasPostCallback)
+	{
+		assert(!params.fft_hasPreCallback);
+
+		if (params.fft_postCallback.localMemSize > 0)
+		{
+			clKernWrite( transKernel, 0 ) << ", __global void* post_userdata, __local void* localmem";
 		}
 		else
 		{
-			clKernWrite( transKernel, 0 ) << ", __global void* userdata";
+			clKernWrite( transKernel, 0 ) << ", __global void* post_userdata";
 		}
 	}
 
@@ -406,6 +422,13 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Sig
 		clKernWrite( transKernel, 0 ) << std::endl;
 	}
 
+	//If post-callback is set for the plan
+	if (params.fft_hasPostCallback)
+	{
+		//Insert callback function code at the beginning 
+		clKernWrite( transKernel, 0 ) << params.fft_postCallback.funcstring << std::endl;
+		clKernWrite( transKernel, 0 ) << std::endl;
+	}
 
 	for(size_t bothDir=0; bothDir<2; bothDir++)
 	{
@@ -677,11 +700,11 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Sig
 					{
 						if (params.fft_preCallback.localMemSize > 0)
 						{
-							clKernWrite( transKernel, 9 ) << "tmp = " << params.fft_preCallback.funcname << "(" << pmComplexIn << ", iOffset + gInd, userdata, localmem);" << std::endl;
+							clKernWrite( transKernel, 9 ) << "tmp = " << params.fft_preCallback.funcname << "(" << pmComplexIn << ", iOffset + gInd, pre_userdata, localmem);" << std::endl;
 						}
 						else
 						{
-							clKernWrite( transKernel, 9 ) << "tmp = " << params.fft_preCallback.funcname << "(" << pmComplexIn << ", iOffset + gInd, userdata);" << std::endl;
+							clKernWrite( transKernel, 9 ) << "tmp = " << params.fft_preCallback.funcname << "(" << pmComplexIn << ", iOffset + gInd, pre_userdata);" << std::endl;
 						}
 					}
 					else
@@ -696,11 +719,11 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Sig
 					{
 						if (params.fft_preCallback.localMemSize > 0)
 						{
-							clKernWrite( transKernel, 9 ) << "retCallback = " << params.fft_preCallback.funcname << "(" << pmRealIn << ", " << pmImagIn << ", iOffset + gInd, userdata, localmem);" << std::endl;
+							clKernWrite( transKernel, 9 ) << "retCallback = " << params.fft_preCallback.funcname << "(" << pmRealIn << ", " << pmImagIn << ", iOffset + gInd, pre_userdata, localmem);" << std::endl;
 						}
 						else
 						{
-							clKernWrite( transKernel, 9 ) << "retCallback = " << params.fft_preCallback.funcname << "(" << pmRealIn << ", " << pmImagIn << ", iOffset + gInd, userdata);" << std::endl;
+							clKernWrite( transKernel, 9 ) << "retCallback = " << params.fft_preCallback.funcname << "(" << pmRealIn << ", " << pmImagIn << ", iOffset + gInd, pre_userdata);" << std::endl;
 						}
 						clKernWrite( transKernel, 9 ) << "tmp.s0 = retCallback.x;" << std::endl;
 						clKernWrite( transKernel, 9 ) << "tmp.s1 = retCallback.y;" << std::endl;
@@ -720,11 +743,11 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Sig
 				{
 					if (params.fft_preCallback.localMemSize > 0)
 					{
-						clKernWrite( transKernel, 9 ) << "tmp = " << params.fft_preCallback.funcname << "(" << pmRealIn << ", iOffset + gInd, userdata, localmem);" << std::endl;
+						clKernWrite( transKernel, 9 ) << "tmp = " << params.fft_preCallback.funcname << "(" << pmRealIn << ", iOffset + gInd, pre_userdata, localmem);" << std::endl;
 					}
 					else
 					{
-						clKernWrite( transKernel, 9 ) << "tmp = " << params.fft_preCallback.funcname << "(" << pmRealIn << ", iOffset + gInd, userdata);" << std::endl;
+						clKernWrite( transKernel, 9 ) << "tmp = " << params.fft_preCallback.funcname << "(" << pmRealIn << ", iOffset + gInd, pre_userdata);" << std::endl;
 					}
 				}
 				else
@@ -764,11 +787,19 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Sig
 		switch( params.fft_outputLayout )
 		{
 		case CLFFT_COMPLEX_INTERLEAVED:
+			//No need of tileOut declaration when postcallback is set as the global buffer is used directly
+			if (!params.fft_hasPostCallback)
+			{
 			clKernWrite( transKernel, 3 ) << "global " << dtOutput << "* tileOut = " << pmComplexOut << " + oOffset;" << std::endl << std::endl;
+			}
 			break;
 		case CLFFT_COMPLEX_PLANAR:
+			//No need of tileOut declaration when postcallback is set as the global buffer is used directly
+			if (!params.fft_hasPostCallback)
+			{
 			clKernWrite( transKernel, 3 ) << "global " << dtOutput << "* realTileOut = " << pmRealOut << " + oOffset;" << std::endl;
 			clKernWrite( transKernel, 3 ) << "global " << dtOutput << "* imagTileOut = " << pmImagOut << " + oOffset;" << std::endl;
+			}
 			break;
 		case CLFFT_HERMITIAN_INTERLEAVED:
 		case CLFFT_HERMITIAN_PLANAR:
@@ -929,11 +960,35 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Sig
 			switch( params.fft_outputLayout )
 			{
 			case CLFFT_COMPLEX_INTERLEAVED:
+				if (params.fft_hasPostCallback)
+				{
+					clKernWrite( transKernel, 9 ) << params.fft_postCallback.funcname << "(" << pmComplexOut << ", (oOffset + gInd), post_userdata, tmp";
+					if (params.fft_postCallback.localMemSize > 0)
+					{
+						clKernWrite( transKernel, 0 ) << ", localmem";
+					}
+					clKernWrite( transKernel, 0 ) << ");" << std::endl;
+				}
+				else
+				{
 				clKernWrite( transKernel, 9 ) << "tileOut[ gInd ] = tmp;" << std::endl;
+				}
 				break;
 			case CLFFT_COMPLEX_PLANAR:
+				if (params.fft_hasPostCallback)
+				{
+					clKernWrite( transKernel, 9 ) << params.fft_postCallback.funcname << "(" << pmRealOut << ", " << pmImagOut << ", (oOffset + gInd), post_userdata, tmp.s0, tmp.s1";
+					if (params.fft_postCallback.localMemSize > 0)
+					{
+						clKernWrite( transKernel, 0 ) << ", localmem";
+					}
+					clKernWrite( transKernel, 0 ) << ");" << std::endl;
+				}
+				else
+				{
 				clKernWrite( transKernel, 9 ) << "realTileOut[ gInd ] = tmp.s0;" << std::endl;
 				clKernWrite( transKernel, 9 ) << "imagTileOut[ gInd ] = tmp.s1;" << std::endl;
+				}
 				break;
 			case CLFFT_HERMITIAN_INTERLEAVED:
 			case CLFFT_HERMITIAN_PLANAR:
@@ -1036,6 +1091,12 @@ clfftStatus FFTGeneratedTransposeGCNAction::initParams ()
 		this->signature.fft_hasPreCallback = true;
 		this->signature.fft_preCallback = this->plan->preCallback;
 	}
+	if (this->plan->hasPostCallback)
+	{
+		this->signature.fft_hasPostCallback = true;
+		this->signature.fft_postCallback = this->plan->postCallbackParam;
+	}
+	this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
 
     return CLFFT_SUCCESS;
 }
@@ -1082,12 +1143,22 @@ clfftStatus FFTGeneratedTransposeGCNAction::generateKernel ( FFTRepo& fftRepo, c
 	OPENCL_V( CalculateBlockSize(this->signature.fft_precision, loopCount, blockSize), _T("CalculateBlockSize() failed!") );
 
 	//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
-	if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+	if ((this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0) || 
+		(this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0))
 	{
+		assert(!(this->signature.fft_hasPreCallback && this->signature.fft_hasPostCallback));
+
 		bool validLDSSize = false;
 		size_t length = blockSize.x * blockSize.y;
 		
-		validLDSSize = ((length * this->plan->ElementSize()) + this->signature.fft_preCallback.localMemSize) < this->plan->envelope.limit_LocalMemSize;
+		size_t requestedCallbackLDS = 0;
+
+		if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+			requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
+		else if (this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0)
+			requestedCallbackLDS = this->signature.fft_postCallback.localMemSize;
+		
+		validLDSSize = ((length * this->plan->ElementSize()) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
 		
 		if(!validLDSSize)
 		{
@@ -1096,7 +1167,6 @@ clfftStatus FFTGeneratedTransposeGCNAction::generateKernel ( FFTRepo& fftRepo, c
 		}
 	}
 
-
     std::string programCode;
     OPENCL_V( genTransposeKernel( this->signature, programCode, lwSize, reShapeFactor, loopCount, blockSize ), _T( "GenerateTransposeKernel() failed!" ) );
 
diff --git a/src/library/generator.transpose.nonsquare.cpp b/src/library/generator.transpose.nonsquare.cpp
index eaa370a..8c12e6b 100644
--- a/src/library/generator.transpose.nonsquare.cpp
+++ b/src/library/generator.transpose.nonsquare.cpp
@@ -227,13 +227,27 @@ static clfftStatus genTransposePrototype(const FFTGeneratedTransposeNonSquareAct
 
     if (params.fft_hasPreCallback)
     {
+		assert(!params.fft_hasPostCallback);
         if (params.fft_preCallback.localMemSize > 0)
         {
-            clKernWrite(transKernel, 0) << ", __global void* userdata, __local void* localmem";
+            clKernWrite(transKernel, 0) << ", __global void* pre_userdata, __local void* localmem";
         }
         else
         {
-            clKernWrite(transKernel, 0) << ", __global void* userdata";
+            clKernWrite(transKernel, 0) << ", __global void* pre_userdata";
+        }
+    }
+	if (params.fft_hasPostCallback)
+	{
+		assert(!params.fft_hasPreCallback);
+
+		if (params.fft_postCallback.localMemSize > 0)
+		{
+			clKernWrite( transKernel, 0 ) << ", __global void* post_userdata, __local void* localmem";
+		}
+		else
+		{
+			clKernWrite( transKernel, 0 ) << ", __global void* post_userdata";
         }
     }
 
@@ -381,6 +395,28 @@ static clfftStatus genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Sig
         local_work_size_swap = (num_elements_loaded < 256) ? num_elements_loaded : 256;
     }
     
+	//If post-callback is set for the plan
+	if (params.fft_hasPostCallback)
+	{
+		//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by swap kernel
+		if (params.fft_postCallback.localMemSize > 0)
+		{
+			bool validLDSSize = false;
+			
+			validLDSSize = ((2 * input_elm_size_in_bytes * (num_elements_loaded * 2)) + params.fft_postCallback.localMemSize) < params.limit_LocalMemSize;
+		
+			if(!validLDSSize)
+			{
+				fprintf(stderr, "Requested local memory size not available\n");
+				return CLFFT_INVALID_ARG_VALUE;
+			}
+		}
+
+		//Insert callback function code at the beginning 
+		clKernWrite( transKernel, 0 ) << params.fft_postCallback.funcstring << std::endl;
+		clKernWrite( transKernel, 0 ) << std::endl;
+	}
+
     /*Generating the  swapping logic*/
     {
         size_t num_reduced_row;
@@ -451,21 +487,32 @@ static clfftStatus genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Sig
         switch (params.fft_inputLayout)
         {
         case CLFFT_COMPLEX_INTERLEAVED:
-            clKernWrite(transKernel, 0) << "void swap(global " << dtComplex << "* inputA, " << tmpBuffType << " " << dtComplex << "* Ls, "<< tmpBuffType << " " << dtComplex << " * Ld, int is, int id, int pos, int end_indx, int work_id){" << std::endl;
+            clKernWrite(transKernel, 0) << "void swap(global " << dtComplex << "* inputA, " << tmpBuffType << " " << dtComplex << "* Ls, "<< tmpBuffType << " " << dtComplex << " * Ld, int is, int id, int pos, int end_indx, int work_id";
             break;
         case CLFFT_COMPLEX_PLANAR:
-            clKernWrite(transKernel, 0) << "void swap(global " << dtPlanar << "* inputA_R, global " << dtPlanar << "* inputA_I, " << tmpBuffType << " " <<dtComplex << "* Ls, "<< tmpBuffType << " " << dtComplex << "* Ld, int is, int id, int pos, int end_indx, int work_id){" << std::endl;
+            clKernWrite(transKernel, 0) << "void swap(global " << dtPlanar << "* inputA_R, global " << dtPlanar << "* inputA_I, " << tmpBuffType << " " <<dtComplex << "* Ls, "<< tmpBuffType << " " << dtComplex << "* Ld, int is, int id, int pos, int end_indx, int work_id";
             break;
         case CLFFT_HERMITIAN_INTERLEAVED:
         case CLFFT_HERMITIAN_PLANAR:
             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
         case CLFFT_REAL:
-            clKernWrite(transKernel, 0) << "void swap(global " << dtPlanar << "* inputA, " << tmpBuffType <<" " << dtPlanar << "* Ls, "<< tmpBuffType <<" " << dtPlanar << "* Ld, int is, int id, int pos, int end_indx, int work_id){" << std::endl;
+            clKernWrite(transKernel, 0) << "void swap(global " << dtPlanar << "* inputA, " << tmpBuffType <<" " << dtPlanar << "* Ls, "<< tmpBuffType <<" " << dtPlanar << "* Ld, int is, int id, int pos, int end_indx, int work_id";
             break;
         default:
             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
         }
 
+		if (params.fft_hasPostCallback)
+		{
+			clKernWrite(transKernel, 0) << ", size_t iOffset, __global void* post_userdata";
+			if (params.fft_postCallback.localMemSize > 0)
+			{
+				clKernWrite(transKernel, 0) << ", __local void* localmem";
+			}
+		}
+
+		clKernWrite(transKernel, 0) << "){" << std::endl;
+
         clKernWrite(transKernel, 3) << "for (int j = get_local_id(0); j < end_indx; j += " << local_work_size_swap << "){" << std::endl;
 
         switch (params.fft_inputLayout)
@@ -473,45 +520,61 @@ static clfftStatus genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Sig
         case CLFFT_REAL:
         case CLFFT_COMPLEX_INTERLEAVED:
 
+			
             clKernWrite(transKernel, 6) << "if (pos == 0){" << std::endl;
             clKernWrite(transKernel, 9) << "Ls[j] = inputA[is *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
             clKernWrite(transKernel, 9) << "Ld[j] = inputA[id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
-            clKernWrite(transKernel, 9) << "inputA[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j];" << std::endl;
             clKernWrite(transKernel, 6) << "}" << std::endl;
 
             clKernWrite(transKernel, 6) << "else if (pos == 1){" << std::endl;
             clKernWrite(transKernel, 9) << "Ld[j] = inputA[id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
-            clKernWrite(transKernel, 9) << "inputA[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j];" << std::endl;
             clKernWrite(transKernel, 6) << "}" << std::endl;
 
-            clKernWrite(transKernel, 6) << "else{" << std::endl;
-            clKernWrite(transKernel, 9) << "inputA[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j];" << std::endl;
-            clKernWrite(transKernel, 6) << "}" << std::endl;
+			if (params.fft_hasPostCallback)
+			{	
+				clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA, (iOffset + id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), post_userdata, Ls[j]";
+				if (params.fft_postCallback.localMemSize > 0)
+				{
+					clKernWrite( transKernel, 0 ) << ", localmem";
+				}
+				clKernWrite( transKernel, 0 ) << ");" << std::endl;
+			}
+			else
+			{
+				clKernWrite(transKernel, 6) << "inputA[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j];" << std::endl;
+			}
             break;
         case CLFFT_HERMITIAN_INTERLEAVED:
         case CLFFT_HERMITIAN_PLANAR:
             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
         case CLFFT_COMPLEX_PLANAR:
+			
             clKernWrite(transKernel, 6) << "if (pos == 0){" << std::endl;
             clKernWrite(transKernel, 9) << "Ls[j].x = inputA_R[is*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
             clKernWrite(transKernel, 9) << "Ls[j].y = inputA_I[is*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
             clKernWrite(transKernel, 9) << "Ld[j].x = inputA_R[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
             clKernWrite(transKernel, 9) << "Ld[j].y = inputA_I[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
-            clKernWrite(transKernel, 9) << "inputA_R[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j].x;" << std::endl;
-            clKernWrite(transKernel, 9) << "inputA_I[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j].y;" << std::endl;
             clKernWrite(transKernel, 6) << "}" << std::endl;
 
             clKernWrite(transKernel, 6) << "else if (pos == 1){" << std::endl;
             clKernWrite(transKernel, 9) << "Ld[j].x = inputA_R[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
             clKernWrite(transKernel, 9) << "Ld[j].y = inputA_I[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
-            clKernWrite(transKernel, 9) << "inputA_R[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j].x;" << std::endl;
-            clKernWrite(transKernel, 9) << "inputA_I[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j].y;" << std::endl;
             clKernWrite(transKernel, 6) << "}" << std::endl;
 
-            clKernWrite(transKernel, 6) << "else{" << std::endl;
-            clKernWrite(transKernel, 9) << "inputA_R[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j].x;" << std::endl;
-            clKernWrite(transKernel, 9) << "inputA_I[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j].y;" << std::endl;
-            clKernWrite(transKernel, 6) << "}" << std::endl;
+			if (params.fft_hasPostCallback)
+			{
+				clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R, inputA_I, (iOffset + id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), post_userdata, Ls[j].x, Ls[j].y";
+				if (params.fft_postCallback.localMemSize > 0)
+				{
+					clKernWrite( transKernel, 0 ) << ", localmem";
+				}
+				clKernWrite( transKernel, 0 ) << ");" << std::endl;
+			}
+			else
+			{
+				clKernWrite(transKernel, 6) << "inputA_R[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j].x;" << std::endl;
+				clKernWrite(transKernel, 6) << "inputA_I[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j].y;" << std::endl;
+			}
             break;
         default:
             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
@@ -547,20 +610,22 @@ static clfftStatus genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Sig
 
             clKernWrite(transKernel, 3) << "__local " << dtInput << " tmp_tot_mem[" << (num_elements_loaded * 2) << "];" << std::endl;
             clKernWrite(transKernel, 3) << tmpBuffType <<" " << dtInput << " *te = tmp_tot_mem;" << std::endl;
+
             clKernWrite(transKernel, 3) << tmpBuffType <<" " << dtInput << " *to = (tmp_tot_mem + " << num_elements_loaded << ");" << std::endl;
-            //Do not advance offset when precallback is set as the starting address of global buffer is needed
-            if (!params.fft_hasPreCallback)
-            {
+			 
+			//Do not advance offset when postcallback is set as the starting address of global buffer is needed
+            if (!params.fft_hasPostCallback)
                 clKernWrite(transKernel, 3) << "inputA += iOffset;" << std::endl;  // Set A ptr to the start of each slice
-            }
             break;
         case CLFFT_COMPLEX_PLANAR:
            
             clKernWrite(transKernel, 3) << "__local " << dtComplex << " tmp_tot_mem[" << (num_elements_loaded * 2) << "];" << std::endl;
             clKernWrite(transKernel, 3) << tmpBuffType << " " << dtComplex << " *te = tmp_tot_mem;" << std::endl;
+
             clKernWrite(transKernel, 3) << tmpBuffType << " " << dtComplex << " *to = (tmp_tot_mem + " << num_elements_loaded << ");" << std::endl;
-            //Do not advance offset when precallback is set as the starting address of global buffer is needed
-            if (!params.fft_hasPreCallback)
+
+			//Do not advance offset when postcallback is set as the starting address of global buffer is needed
+            if (!params.fft_hasPostCallback)
             {
                 clKernWrite(transKernel, 3) << "inputA_R += iOffset;" << std::endl;  // Set A ptr to the start of each slice 
                 clKernWrite(transKernel, 3) << "inputA_I += iOffset;" << std::endl;  // Set A ptr to the start of each slice 
@@ -605,12 +670,21 @@ static clfftStatus genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Sig
         {
         case CLFFT_COMPLEX_INTERLEAVED:
         case CLFFT_REAL:
-            clKernWrite(transKernel, 6) << "swap(inputA, tmp_swap_ptr[swap_inx], tmp_swap_ptr[1 - swap_inx], swap_table[loop][0], swap_table[loop][1], swap_table[loop][2], end_indx, work_id);" << std::endl;
+            clKernWrite(transKernel, 6) << "swap(inputA, tmp_swap_ptr[swap_inx], tmp_swap_ptr[1 - swap_inx], swap_table[loop][0], swap_table[loop][1], swap_table[loop][2], end_indx, work_id";
             break;
         case CLFFT_COMPLEX_PLANAR:
-            clKernWrite(transKernel, 6) << "swap(inputA_R, inputA_I, tmp_swap_ptr[swap_inx], tmp_swap_ptr[1 - swap_inx], swap_table[loop][0], swap_table[loop][1], swap_table[loop][2], end_indx, work_id);" << std::endl;
+            clKernWrite(transKernel, 6) << "swap(inputA_R, inputA_I, tmp_swap_ptr[swap_inx], tmp_swap_ptr[1 - swap_inx], swap_table[loop][0], swap_table[loop][1], swap_table[loop][2], end_indx, work_id";
             break;
         }
+		if (params.fft_hasPostCallback)
+		{
+			clKernWrite(transKernel, 0) << ", iOffset, post_userdata";
+			if (params.fft_postCallback.localMemSize > 0)
+			{
+				clKernWrite(transKernel, 0) << ", localmem";
+			}
+		}
+		clKernWrite(transKernel, 0) << ");" << std::endl;
 
         clKernWrite(transKernel, 3) << "}" << std::endl;
 
@@ -896,13 +970,13 @@ static clfftStatus genTransposeKernel(const FFTGeneratedTransposeNonSquareAction
                 {
                     if (params.fft_preCallback.localMemSize > 0)
                     {
-                        clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
-                        clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+                        clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
+                        clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
                     }
                     else
                     {
-                        clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
-                        clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata);" << std::endl;
+                        clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
+                        clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
                     }
                 }
                 else
@@ -919,13 +993,13 @@ static clfftStatus genTransposeKernel(const FFTGeneratedTransposeNonSquareAction
                 {
                     if (params.fft_preCallback.localMemSize > 0)
                     {
-                        clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
-                        clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+                        clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
+                        clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
                     }
                     else
                     {
-                        clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
-                        clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata);" << std::endl;
+                        clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
+                        clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
                     }
                 }
                 else
@@ -1010,13 +1084,13 @@ static clfftStatus genTransposeKernel(const FFTGeneratedTransposeNonSquareAction
                 {
                     if (params.fft_preCallback.localMemSize > 0)
                     {
-                        clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
-                        clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+                        clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
+                        clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
                     }
                     else
                     {
-                        clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
-                        clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata);" << std::endl;
+                        clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
+                        clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
                     }
                 }
                 else
@@ -1032,13 +1106,13 @@ static clfftStatus genTransposeKernel(const FFTGeneratedTransposeNonSquareAction
                 {
                     if (params.fft_preCallback.localMemSize > 0)
                     {
-                        clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
-                        clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+                        clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
+                        clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
                     }
                     else
                     {
-                        clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
-                        clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata);" << std::endl;
+                        clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
+                        clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
                     }
                 }
                 else
@@ -1081,15 +1155,15 @@ static clfftStatus genTransposeKernel(const FFTGeneratedTransposeNonSquareAction
                 {
                     if (params.fft_preCallback.localMemSize > 0)
                     {
-                        clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
+                        clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
                         clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") " << std::endl;
-                        clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+                        clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
                     }
                     else
                     {
-                        clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
+                        clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
                         clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") " << std::endl;
-                        clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata);" << std::endl;
+                        clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
                     }
                 }
                 else
@@ -1107,15 +1181,15 @@ static clfftStatus genTransposeKernel(const FFTGeneratedTransposeNonSquareAction
                 {
                     if (params.fft_preCallback.localMemSize > 0)
                     {
-                        clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem); }" << std::endl;
+                        clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem); }" << std::endl;
                         clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
-                        clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem); }" << std::endl;
+                        clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem); }" << std::endl;
                     }
                     else
                     {
-                        clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata); }" << std::endl;
+                        clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata); }" << std::endl;
                         clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
-                        clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata); }" << std::endl;
+                        clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata); }" << std::endl;
                     }
                 }
                 else
@@ -1307,6 +1381,12 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::initParams()
         this->signature.fft_hasPreCallback = true;
         this->signature.fft_preCallback = this->plan->preCallback;
     }
+	if (this->plan->hasPostCallback)
+	{
+		this->signature.fft_hasPostCallback = true;
+		this->signature.fft_postCallback = this->plan->postCallbackParam;
+	}
+	this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
 
     return CLFFT_SUCCESS;
 }
@@ -1325,11 +1405,32 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
     std::string programCode;
     if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE)
     {
-        OPENCL_V(genTransposeKernel(this->signature, programCode, lwSize, reShapeFactor), _T("GenerateTransposeKernel() failed!"));
+		//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by transpose kernel
+		if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+		{
+			assert(!this->signature.fft_hasPostCallback);
+
+			bool validLDSSize = false;
+			size_t requestedCallbackLDS = 0;
+
+			requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
+			
+			validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
+		
+			if(!validLDSSize)
+			{
+				fprintf(stderr, "Requested local memory size not available\n");
+				return CLFFT_INVALID_ARG_VALUE;
+			}
+		}
+        OPENCL_V(genTransposeKernel(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
     }
     else
     {
-        OPENCL_V(genSwapKernel(this->signature, programCode, lwSize, reShapeFactor), _T("GenerateTransposeKernel() failed!"));
+		//No pre-callback possible in swap kernel
+		assert(!this->signature.fft_hasPreCallback);
+
+        OPENCL_V(genSwapKernel(this->signature, programCode, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
     }
 
     cl_int status = CL_SUCCESS;
diff --git a/src/library/generator.transpose.square.cpp b/src/library/generator.transpose.square.cpp
index 5bbc508..1878073 100644
--- a/src/library/generator.transpose.square.cpp
+++ b/src/library/generator.transpose.square.cpp
@@ -233,16 +233,30 @@ static clfftStatus genTransposePrototype( const FFTGeneratedTransposeSquareActio
 
 	if (params.fft_hasPreCallback)
 	{
+		assert(!params.fft_hasPostCallback);
+
 		if (params.fft_preCallback.localMemSize > 0)
 		{
-			clKernWrite( transKernel, 0 ) << ", __global void* userdata, __local void* localmem";
+			clKernWrite( transKernel, 0 ) << ", __global void* pre_userdata, __local void* localmem";
 		}
 		else
 		{
-			clKernWrite( transKernel, 0 ) << ", __global void* userdata";
+			clKernWrite( transKernel, 0 ) << ", __global void* pre_userdata";
 		}
 	}
+	if (params.fft_hasPostCallback)
+	{
+		assert(!params.fft_hasPreCallback);
 
+		if (params.fft_postCallback.localMemSize > 0)
+		{
+			clKernWrite( transKernel, 0 ) << ", __global void* post_userdata, __local void* localmem";
+		}
+		else
+		{
+			clKernWrite( transKernel, 0 ) << ", __global void* post_userdata";
+		}
+	}
 
     // Close the method signature
     clKernWrite( transKernel, 0 ) << " )\n{" << std::endl;
@@ -331,6 +345,13 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 			clKernWrite( transKernel, 0 ) << params.fft_preCallback.funcstring << std::endl;
 			clKernWrite( transKernel, 0 ) << std::endl;
 		}
+		//If post-callback is set for the plan
+		if (params.fft_hasPostCallback)
+		{
+			//Insert callback function code at the beginning 
+			clKernWrite( transKernel, 0 ) << params.fft_postCallback.funcstring << std::endl;
+			clKernWrite( transKernel, 0 ) << std::endl;
+		}
 
 		std::string funcName;
 		if (params.fft_3StepTwiddle) // TODO
@@ -520,13 +541,13 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 						{
 							if (params.fft_preCallback.localMemSize > 0)
 							{
-								clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
-								clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+								clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
+								clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
 							}
 							else
 							{
-								clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
-								clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata);" << std::endl;
+								clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
+								clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
 							}
 						}
 						else
@@ -543,13 +564,13 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 					{
 						if (params.fft_preCallback.localMemSize > 0)
 						{
-							clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
-							clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+							clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
+							clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
 						}
 						else
 						{
-							clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
-							clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata);" << std::endl;
+							clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
+							clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
 						}
 					}
 					else
@@ -595,17 +616,53 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 			switch (params.fft_outputLayout)
 			{
 				case CLFFT_COMPLEX_INTERLEAVED:
+					if (params.fft_hasPostCallback)
+					{
+						clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 0) << ", localmem";
+						}
+						clKernWrite(transKernel, 0) << ");" << std::endl;
+
+						clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index]";
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 0) << ", localmem";
+						}
+						clKernWrite(transKernel, 0) << ");" << std::endl;
+					}
+					else
+					{
 					clKernWrite(transKernel, 6) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
 					clKernWrite(transKernel, 6) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index];" << std::endl;
-
+					}
 					break;
 				case CLFFT_COMPLEX_PLANAR:
+					if (params.fft_hasPostCallback)
+					{
+						clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 0) << ", localmem";
+						}
+						clKernWrite(transKernel, 0) << ");" << std::endl;
 
+						clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 0) << ", localmem";
+						}
+						clKernWrite(transKernel, 0) << ");" << std::endl;
+					}
+					else
+					{
 					clKernWrite(transKernel, 6) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
 					clKernWrite(transKernel, 6) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
 
 					clKernWrite(transKernel, 6) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].x;" << std::endl;
 					clKernWrite(transKernel, 6) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].y;" << std::endl;
+					}
 					break;
 				case CLFFT_HERMITIAN_INTERLEAVED:
 				case CLFFT_HERMITIAN_PLANAR:
@@ -636,13 +693,13 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 					{
 						if (params.fft_preCallback.localMemSize > 0)
 						{
-							clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
-							clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] <<" + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+							clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
+							clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] <<" + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
 						}
 						else
 						{
-							clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
-							clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] <<" + lidx + starting_index_yx, userdata);" << std::endl;
+							clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
+							clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] <<" + lidx + starting_index_yx, pre_userdata);" << std::endl;
 						}
 					}
 					else
@@ -658,13 +715,13 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 					{
 						if (params.fft_preCallback.localMemSize > 0)
 						{
-							clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*"<<16/reShapeFactor<<")*"<<params.fft_N[0]<<" + idx, userdata, localmem);" << std::endl;
-							clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*"<<16/reShapeFactor<<")*"<<params.fft_N[0]<<" + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+							clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*"<<16/reShapeFactor<<")*"<<params.fft_N[0]<<" + idx, pre_userdata, localmem);" << std::endl;
+							clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*"<<16/reShapeFactor<<")*"<<params.fft_N[0]<<" + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
 						}
 						else
 						{
-							clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*"<<16/reShapeFactor<<")*"<<params.fft_N[0]<<" + idx, userdata);" << std::endl;
-							clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*"<<16/reShapeFactor<<")*"<<params.fft_N[0]<<" + lidx + starting_index_yx, userdata);" << std::endl;
+							clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*"<<16/reShapeFactor<<")*"<<params.fft_N[0]<<" + idx, pre_userdata);" << std::endl;
+							clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*"<<16/reShapeFactor<<")*"<<params.fft_N[0]<<" + lidx + starting_index_yx, pre_userdata);" << std::endl;
 						}
 					}
 					else
@@ -708,15 +765,15 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 					{
 						if (params.fft_preCallback.localMemSize > 0)
 						{
-							clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
+							clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
 							clKernWrite(transKernel, 9) << "if ((t_gy_p *" <<16*reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16*reShapeFactor << " + lidy + loop*" << 16/reShapeFactor << ")<" << params.fft_N[0] << ") " << std::endl;
-							clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+							clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
 						}
 						else
 						{
-							clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
+							clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
 							clKernWrite(transKernel, 9) << "if ((t_gy_p *" <<16*reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16*reShapeFactor << " + lidy + loop*" << 16/reShapeFactor << ")<" << params.fft_N[0] << ") " << std::endl;
-							clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata);" << std::endl;
+							clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
 						}
 					}
 					else
@@ -734,15 +791,15 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 					{
 						if (params.fft_preCallback.localMemSize > 0)
 						{
-							clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem); }" << std::endl;
+							clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem); }" << std::endl;
 							clKernWrite(transKernel, 9) << "if ((t_gy_p *" <<16*reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16*reShapeFactor << " + lidy + loop*" << 16/reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
-							clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem); }" << std::endl;
+							clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem); }" << std::endl;
 						}
 						else
 						{
-							clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata); }" << std::endl;
+							clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata); }" << std::endl;
 							clKernWrite(transKernel, 9) << "if ((t_gy_p *" <<16*reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16*reShapeFactor << " + lidy + loop*" << 16/reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
-							clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata); }" << std::endl;
+							clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16/reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata); }" << std::endl;
 						}
 					}
 					else
@@ -788,18 +845,52 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 			switch (params.fft_outputLayout)
 			{
 				case CLFFT_COMPLEX_INTERLEAVED:
+					if (params.fft_hasPostCallback)
+					{
+						clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 0) << ", localmem"; 
+						}
+						clKernWrite(transKernel, 0) << ");" << std::endl;
+
+						clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 0) << ", localmem"; 
+						}
+						clKernWrite(transKernel, 0) << ");" << std::endl;
+					}
+					else
+					{
 					clKernWrite(transKernel, 9) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
 					clKernWrite(transKernel, 9) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index]; " << std::endl;
-
+					}
 					break;
 				case CLFFT_COMPLEX_PLANAR:
+					if (params.fft_hasPostCallback)
+					{
+						clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 0) << ", localmem"; 
+						}
+						clKernWrite(transKernel, 0) << ");" << std::endl;
+
+						clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 0) << ", localmem"; 
+						}
+						clKernWrite(transKernel, 0) << ");" << std::endl;
+					}
+					else
+					{
 					clKernWrite(transKernel, 9) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
 					clKernWrite(transKernel, 9) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
 					clKernWrite(transKernel, 9) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x; " << std::endl;
 					clKernWrite(transKernel, 9) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; " << std::endl;
-
-
-
+					}
 					break;
 				case CLFFT_HERMITIAN_INTERLEAVED:
 				case CLFFT_HERMITIAN_PLANAR:
@@ -824,19 +915,60 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 			{
 				case CLFFT_COMPLEX_INTERLEAVED:
 					clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << " && idx<" << params.fft_N[0] << ")" << std::endl;
+					if (params.fft_hasPostCallback)
+					{
+						clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 0) << ", localmem"; 
+						}
+						clKernWrite(transKernel, 0) << ");" << std::endl;
+
+						clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ")" << std::endl;
+
+						clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 0) << ", localmem"; 
+						}
+						clKernWrite(transKernel, 0) << ");" << std::endl;
+					}
+					else
+					{
 					clKernWrite(transKernel, 12) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index]; " << std::endl;
 					clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ")" << std::endl;
 					clKernWrite(transKernel, 12) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index];" << std::endl;
-
+					}
 					break;
 				case CLFFT_COMPLEX_PLANAR:
 					clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << " && idx<" << params.fft_N[0] << ") {" << std::endl;
+					
+					if (params.fft_hasPostCallback)
+					{
+						clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 0) << ", localmem"; 
+						}
+						clKernWrite(transKernel, 0) << "); }" << std::endl;
+
+						clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
+
+						clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
+						if (params.fft_postCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 0) << ", localmem"; 
+						}
+						clKernWrite(transKernel, 0) << "); }" << std::endl;
+					}
+					else
+					{
 					clKernWrite(transKernel, 12) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x; " << std::endl;
 					clKernWrite(transKernel, 12) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y; }" << std::endl;
 					clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
 					clKernWrite(transKernel, 12) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x;" << std::endl;
 					clKernWrite(transKernel, 12) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; }" << std::endl;
-
+					}
 
 					break;
 				case CLFFT_HERMITIAN_INTERLEAVED:
@@ -935,6 +1067,12 @@ clfftStatus FFTGeneratedTransposeSquareAction::initParams ()
 		this->signature.fft_hasPreCallback = true;
 		this->signature.fft_preCallback = this->plan->preCallback;
 	}
+	if (this->plan->hasPostCallback)
+	{
+		this->signature.fft_hasPostCallback = true;
+		this->signature.fft_postCallback = this->plan->postCallbackParam;
+	}
+	this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
 
     return CLFFT_SUCCESS;
 }
@@ -948,7 +1086,28 @@ static const size_t reShapeFactor = 2;
 //	Feed this generator the FFTPlan, and it returns the generated program as a string
 clfftStatus FFTGeneratedTransposeSquareAction::generateKernel ( FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
 {
+	//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
+	if ((this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0) || 
+		(this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0))
+	{
+		assert(!(this->signature.fft_hasPreCallback && this->signature.fft_hasPostCallback));
+
+		bool validLDSSize = false;
+		size_t requestedCallbackLDS = 0;
+
+		if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+			requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
+		else if (this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0)
+			requestedCallbackLDS = this->signature.fft_postCallback.localMemSize;
+		
+		validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
 	
+		if(!validLDSSize)
+		{
+			fprintf(stderr, "Requested local memory size not available\n");
+			return CLFFT_INVALID_ARG_VALUE;
+		}
+	}
 
     std::string programCode;
     OPENCL_V( genTransposeKernel( this->signature, programCode, lwSize, reShapeFactor ), _T( "GenerateTransposeKernel() failed!" ) );
diff --git a/src/library/mainpage.h b/src/library/mainpage.h
index aa60a07..186a366 100644
--- a/src/library/mainpage.h
+++ b/src/library/mainpage.h
@@ -448,16 +448,15 @@ FFT features of this library.
 
 @section Callbacks  clFFT Callbacks
 
-The callback feature of clFFT has the ability to invoke user provided OpenCL inline functions from within FFT kernel
-to custom process the input or output data. The inline OpenCL function is passed as a string to the library
-which is incorporated into the generated FFT kernel. This helps to avoid additional kernel launches
-to carry out the pre/post processing tasks.
+The callback feature of clFFT has the ability to invoke user provided OpenCL™ inline functions 
+to pre-process or post-process data, from within the FFT kernel. The inline OpenCL callback function 
+is passed as a string to the library. It is then incorporated into the generated FFT kernel. This 
+eliminates the need for an additional kernel launch to carry out the pre/post processing tasks, thus 
+improving overall performance.
 
 There are 2 types of callback; Pre-callback and Post-callback. Pre-callback invokes user callback function to
-perform custom  pre-processing of the input data before FFT is executed,. Post-callback invokes user callback function to
-perform custom post-processing of the output data after FFT is executed.
-
-The current release of clFFT includes Pre-callback feature. Post-callback will be supported in future release.
+perform custom pre-processing of the input data before the FFT is executed. Post-callback invokes user callback function to
+perform custom post-processing of the output data after the FFT is executed.
 
 @subsection CallbackWorkflow Callback Workflow
 
@@ -482,10 +481,11 @@ The workflow of FFT execution using callback feature of clFFT is as follows
 			<li> Name of the callback function
 			<li> Callback function as character array. The character array can also include any custom datatype declaration used by callback function
 			<li> Size of local memory requested by callback, if any, in bytes
-			<li> Type of callback. This is an enumerator. The current supported value for this is ‘PRECALLBACK’
+			<li> Type of callback; ‘PRECALLBACK’ or ‘POSTCALLBACK’. This is an enumerator
 			<li> Supplementary user data, if any, used by callback function
-			<li> Number of user data buffers
+			<li> Number of user data buffers. The library currently supports only 1 user data buffer per callback registration
 		</ul>
+		Registering a callback of the same type more than once overwrites the previously registered callback function of that type.
 	<li> Invoke Bake Plan step
 	<li> Library inserts the callback code into the main FFT kernel during bake plan and compiles it. If there are any
 	compilation errors caused by syntax or incompatible callback function prototype, the library reports failure.
@@ -523,42 +523,80 @@ Parameters
 	filter data or any scalar value. The userdata can be of any custom data type/structure, in which case,
 	you have to declare the custom data type and include it along with the callback function string. </li>
 	<li> \c localmem : Pointer to local memory. This memory is allocated by library based on the size you specify
-	and is subjected to local memory availability. </li>
+	and is subject to local memory availability. </li>
 </ul>
 
 For Planar C2C, the return type of callback is a vector (float2/double2) whose elements contain the result for Real
 and Imaginary as computed in the callback
 
-@subsection SamplePrecallbackCode Sample Pre-Callback Code
+@subsubsection PostcallbackProtyotype Post-callback Prototypes
+
+ FFT Type                               | Function Prototype
+----------------------------------------| ------------------
+C2C/R2C – Interleaved Single Precision  | Without LDS <br />void  <postcallback_func> ( __global void *output, uint outoffset, __global void *userdata, float2 fftoutput) <br /> With LDS <br />void  <postcallback_func> ( __global void *output, uint outoffset, __global void *userdata, float2 fftoutput, __local void *localmem)
+C2C/R2C – Interleaved Double Precision  | Without LDS <br />void  <postcallback_func> ( __global void *output, uint outoffset, __global void *userdata, double2 fftoutput) <br /> With LDS <br />void  <postcallback_func> ( __global void *output, uint outoffset, __global void *userdata, double2 fftoutput, __local void *localmem)
+C2C/R2C – Planar Single Precision		| Without LDS <br />void  <postcallback_func> ( __global void *outputRe, __global void *outputIm, uint outoffset, __global void *userdata, float fftoutputRe, float fftoutputIm) <br /> With LDS <br />void  <postcallback_func> ( __global void *outputRe, __global void *outputIm, uint outoffset, __global void *userdata, float fftoutputRe, float fftoutputIm, __local void *localmem)
+C2C/R2C – Planar Double Precision		| Without LDS <br />void  <postcallback_func> ( __global void *outputRe, __global void *outputIm, uint outoffset, __global void *userdata, double fftoutputRe, double fftoutputIm) <br /> With LDS <br />void  <postcallback_func> ( __global void *outputRe, __global void *outputIm, uint outoffset, __global void *userdata, double fftoutputRe, double fftoutputIm, __local void *localmem)
+C2R Single Precision					| Without LDS <br />void  <postcallback_func> ( __global void *output, uint outoffset, __global void *userdata, float fftoutput) <br /> With LDS <br />void  <postcallback_func> ( __global void *output, uint outoffset, __global void *userdata, float fftoutput, __local void *localmem)
+C2R Double Precision					| Without LDS <br />void  <postcallback_func> ( __global void *output, uint outoffset, __global void *userdata, double fftoutput) <br /> With LDS <br />void  <postcallback_func> ( __global void *output, uint outoffset, __global void *userdata, double fftoutput, __local void *localmem)
+
+
+Parameters
+<ul>
+	<li> \c output  : The base pointer of the output buffer for C2R and Interleaved R2C/C2C transforms
+	<li> \c outputRe : The base pointer of the “Real” output buffer for Planar R2C/C2C transforms
+	<li> \c outputIm : The base pointer of the “Imaginary” part output buffer for Planar R2C/C2C transforms
+	<li> \c outoffset : Index of the current element of the output buffer, counted from the start of the buffer
+	<li> \c userdata : Buffer containing optional caller specified data. The userdata pointer is useful
+	for passing any supplementary data to the callback function. For example, buffer having convolution
+	filter data or any scalar value. The userdata can be of any custom data type/structure, in which case,
+	you have to declare the custom data type and include it along with the callback function string. </li>
+	<li> \c localmem : Pointer to local memory. This memory is allocated by library based on the size you specify
+	and is subject to local memory availability. </li>
+</ul>
+
+@subsection SampleCallbackCode Sample Callback Code
 
 @code
 //**************************************************************************
-//* Step 1 : Store the callback function in a string.
+//* Step 1 : Store the pre and post callback function in a string.
 //**************************************************************************
-const char* precallbackstr = "float2 mulval(__global void* in,        \n
+const char* precallbackstr = "float2 pre_mulval(__global void* input, \n
                                   uint inoffset,                      \n
                                   __global void* userdata,            \n
                                   __local void* localmem)             \n
 				{                                                             \n
-				int scalar = *((__global int*)userdata + offset);             \n
-				float2 ret = *((__global float2*)(float2) + offset) * scalar; \n
+				float scalar = *((__global float*)userdata + inoffset);      \n
+				float2 ret = *((__global float2*)input + inoffset) * scalar; \n
 				return ret;                                                   \n
 				}                                                             \n";
 
+const char* postcallbackstr = "void post_mulval(__global void* output, \n
+                                  uint outoffset,                      \n
+                                  __global void* userdata,             \n
+								  float2 fftoutput,                    \n
+                                  __local void* localmem)              \n
+				{                                                      \n
+				float scalar = *((__global float*)userdata + outoffset);      \n
+				*((__global float2*)output + outoffset) = fftoutput * scalar; \n
+				}                                                             \n";
 
 //**************************************************************************
 //* Step 2 : Initialize arguments if any required by the callback.
 //**************************************************************************
-int h_userdata[N] = {  };
-cl_mem userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * N,  (void*)h_userdata, NULL);
+float h_preuserdata[N] = {  };
+cl_mem preuserdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * N,  (void*)h_preuserdata, NULL);
 
+float h_postuserdata[N] = {  };
+cl_mem postuserdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * N,  (void*)h_postuserdata, NULL);
 
 //**************************************************************************
 //* Step 3 : Register the callback.
 //**************************************************************************
 
-status = clfftSetPlanCallback(plan_handle, "mulval", precallbackstr, 0, PRECALLBACK, &userdata, 1);
+status = clfftSetPlanCallback(plan_handle, "pre_mulval", precallbackstr, 0, PRECALLBACK, &preuserdata, 1);
 
+status = clfftSetPlanCallback(plan_handle, "post_mulval", postcallbackstr, 0, POSTCALLBACK, &postuserdata, 1);
 
 //**************************************************************************
 //* Step 4 : Bake plan and enqueue transform.
@@ -569,13 +607,14 @@ status = clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent,
 			&input_buffers[ 0 ], buffersOut, clMedBuffer );
 @endcode
 
-@subsection PreCallbackConsiderations Pre-callback Considerations
+@subsection CallbackNotes Important Notes on Callback
 
 <ol>
 	<li> The caller is responsible to provide a callback function in string form that matches the function prototype based on the type of callback, type of transform(real/complex) and whether LDS is used
 	<li> clFFT considers the value returned by pre-callback function as the new value of the input at the index corresponding to the *inoffset* argument
-	<li> Pre-callback function can request for local memory for its own use. If the requested amount of local memory is available on the device, clFFT passes a pointer to the local memory when it invokes the callback function
-	<li> clFFT may invoke FFT kernels several times depending on the input parameters. However the pre-callback function provided by caller is invoked only once for each point in the input
+	<li> A callback function can request local memory for its own use. If the requested amount of local memory is available on the device, clFFT passes a pointer to the local memory when it invokes the callback function
+	<li> clFFT may invoke FFT kernels several times depending on the input parameters. However, the pre-callback function provided by the caller is invoked only once for each point in the input. Similarly, the post-callback function is invoked only once for each point in the output.
+	<li> If clFFT is implementing a given FFT in multiple phases, it calls the pre-callback function only from the first phase kernel. Similarly it calls the post-callback function only from the last phase kernel
 </ol>
 
  */
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 309df31..e33a52a 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -504,7 +504,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					// Enable block compute under these conditions
 					if( (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1) && !rc
 						&& (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) && (fftPlan->length.size() <= 1)
-						&& (!clfftGetRequestLibNoMemAlloc()) )
+						&& (!clfftGetRequestLibNoMemAlloc() || (fftPlan->placeness == CLFFT_OUTOFPLACE)) )
 					{
 						fftPlan->blockCompute = true;
 
@@ -874,6 +874,13 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						trans3Plan->outStride.push_back(fftPlan->outStride[index]);
 					}
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPostCallback)
+					{
+						trans3Plan->hasPostCallback = true;
+						trans3Plan->postCallbackParam = fftPlan->postCallbackParam;
+						trans3Plan->postcallUserData = fftPlan->postcallUserData;
+					}
 
 					OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
 						_T( "BakePlan large1d trans3 plan failed" ) );
@@ -1116,6 +1123,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					trans3Plan->realSpecial	  = true;
 					trans3Plan->transOutHorizontal = true;
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPostCallback)
+					{
+						trans3Plan->hasPostCallback = true;
+						trans3Plan->postCallbackParam = fftPlan->postCallbackParam;
+						trans3Plan->postcallUserData = fftPlan->postcallUserData;
+					}
+
 					OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
 						_T( "BakePlan large1d trans3 plan failed" ) );
 
@@ -1289,6 +1304,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 							copyPlan->outStride.push_back(fftPlan->outStride[index]);
 						}
 
+						//Set callback data if set on top level plan
+						if (fftPlan->hasPostCallback)
+						{
+							copyPlan->hasPostCallback = true;
+							copyPlan->postCallbackParam = fftPlan->postCallbackParam;
+							copyPlan->postcallUserData = fftPlan->postcallUserData;
+						}
+
 						OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL), _T("BakePlan large1d RC copy plan failed"));
 					}
 
@@ -1463,6 +1486,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						col2Plan->outStride.push_back(fftPlan->outStride[index]);
 					}
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPostCallback)
+					{
+						col2Plan->hasPostCallback = true;
+						col2Plan->postCallbackParam = fftPlan->postCallbackParam;
+						col2Plan->postcallUserData = fftPlan->postcallUserData;
+					}
+
 					OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
 				}
 				else
@@ -1840,6 +1871,13 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 							}
 						}
 
+						//Set callback data if set on top level plan
+						if (fftPlan->hasPostCallback && integratedTranposes)
+						{
+							col2Plan->hasPostCallback = true;
+							col2Plan->postCallbackParam = fftPlan->postCallbackParam;
+							col2Plan->postcallUserData = fftPlan->postcallUserData;
+						}
 
 						OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
 
@@ -1878,6 +1916,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 								trans3Plan->outStride.push_back(fftPlan->outStride[index]);
 							}
 
+							//Set callback data if set on top level plan
+							if (fftPlan->hasPostCallback)
+							{
+								trans3Plan->hasPostCallback = true;
+								trans3Plan->postCallbackParam = fftPlan->postCallbackParam;
+								trans3Plan->postcallUserData = fftPlan->postcallUserData;
+							}
+
 							OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
 								_T( "BakePlan large1d trans plan failed" ) );
 						}
@@ -1901,9 +1947,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					fftPlan->action = new FFTGeneratedTransposeSquareAction(plHandle, fftPlan, *commQueueFFT, err);
                 else if (fftPlan->gen == Transpose_NONSQUARE)
                 {
-					if(fftPlan->nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE)
-						fftPlan->action = new FFTGeneratedTransposeNonSquareAction(plHandle, fftPlan, *commQueueFFT, err);
-					else if (fftPlan->nonSquareKernelType == NON_SQUARE_TRANS_SWAP)
+					if(fftPlan->nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE || fftPlan->nonSquareKernelType == NON_SQUARE_TRANS_SWAP)
 						fftPlan->action = new FFTGeneratedTransposeNonSquareAction(plHandle, fftPlan, *commQueueFFT, err);
 					else
 					{
@@ -1943,6 +1987,13 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 							trans1Plan->outStride.push_back(fftPlan->outStride[index]);
 						}
 
+						if (fftPlan->hasPreCallback)
+						{
+							trans1Plan->hasPreCallback = true;
+							trans1Plan->preCallback = fftPlan->preCallback;
+							trans1Plan->precallUserData = fftPlan->precallUserData;
+						}
+
 
 						OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL),
 							_T("BakePlan transpose_nsq_stage1 plan failed"));
@@ -1980,6 +2031,12 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 							trans2Plan->outStride.push_back(fftPlan->outStride[index]);
 						}
 
+						if (fftPlan->hasPostCallback)
+						{
+							trans2Plan->hasPostCallback = true;
+							trans2Plan->postCallbackParam = fftPlan->postCallbackParam;
+							trans2Plan->postcallUserData = fftPlan->postcallUserData;
+						}
 
 						OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL),
 							_T("BakePlan transpose_nsq_stage2 plan failed"));
@@ -2241,6 +2298,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				transPlanY->batchsize       = fftPlan->batchsize;
 				transPlanY->transflag       = true;
 
+				//Set callback data if set on top level plan
+				if (fftPlan->hasPostCallback)
+				{
+					transPlanY->hasPostCallback = true;
+					transPlanY->postCallbackParam = fftPlan->postCallbackParam;
+					transPlanY->postcallUserData = fftPlan->postcallUserData;
+				}
+
 				OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
 					_T( "BakePlan for planTY failed" ) );
 
@@ -2508,6 +2573,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 
 					}
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPostCallback)
+					{
+						trans2Plan->hasPostCallback = true;
+						trans2Plan->postCallbackParam = fftPlan->postCallbackParam;
+						trans2Plan->postcallUserData = fftPlan->postcallUserData;
+					}
+
 					OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
 						_T( "BakePlan for planTY failed" ) );
 
@@ -2571,6 +2644,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						colPlan->inStride.push_back(rowPlan->outStride[index]);
 					}
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPostCallback)
+					{
+						colPlan->hasPostCallback = true;
+						colPlan->postCallbackParam = fftPlan->postCallbackParam;
+						colPlan->postcallUserData = fftPlan->postcallUserData;
+					}
+
 					OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
 				}
 
@@ -2819,6 +2900,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 
 					rowPlan->batchsize    = fftPlan->batchsize;
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPostCallback)
+					{
+						rowPlan->hasPostCallback = true;
+						rowPlan->postCallbackParam = fftPlan->postCallbackParam;
+						rowPlan->postcallUserData = fftPlan->postcallUserData;
+					}
+
 					OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
 				}
 				else
@@ -2983,6 +3072,13 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 
 					rowPlan->batchsize    = fftPlan->batchsize;
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPostCallback)
+					{
+						rowPlan->hasPostCallback = true;
+						rowPlan->postCallbackParam = fftPlan->postCallbackParam;
+						rowPlan->postcallUserData = fftPlan->postcallUserData;
+					}
 
 					OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
 				}
@@ -3110,6 +3206,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					colPlan->outStride.push_back(fftPlan->outStride[2]);
 				}
 
+				//Set callback data if set on top level plan
+				if (fftPlan->hasPostCallback)
+				{
+					colPlan->hasPostCallback = true;
+					colPlan->postCallbackParam = fftPlan->postCallbackParam;
+					colPlan->postcallUserData = fftPlan->postcallUserData;
+				}
+
 				OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
 			}
 
@@ -3373,6 +3477,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						trans2Plan->outStride.push_back(fftPlan->outStride[index]);
 					}
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPostCallback)
+					{
+						trans2Plan->hasPostCallback = true;
+						trans2Plan->postCallbackParam = fftPlan->postCallbackParam;
+						trans2Plan->postcallUserData = fftPlan->postcallUserData;
+					}
+
 					OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
 						_T( "BakePlan for planTY failed" ) );
 
@@ -3441,6 +3553,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						colPlan->outStride.push_back(fftPlan->outStride[index]);
 					}
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPostCallback)
+					{
+						colPlan->hasPostCallback = true;
+						colPlan->postCallbackParam = fftPlan->postCallbackParam;
+						colPlan->postcallUserData = fftPlan->postcallUserData;
+					}
+
 					OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->1D planZ failed" ) );
 				}
 			}
@@ -3693,6 +3813,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 
 					rowPlan->batchsize    = fftPlan->batchsize;
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPostCallback)
+					{
+						rowPlan->hasPostCallback = true;
+						rowPlan->postCallbackParam = fftPlan->postCallbackParam;
+						rowPlan->postcallUserData = fftPlan->postcallUserData;
+					}
+
 					OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
 				}
 				else
@@ -3858,6 +3986,13 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 
 					xyPlan->batchsize    = fftPlan->batchsize;
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPostCallback)
+					{
+						xyPlan->hasPostCallback = true;
+						xyPlan->postCallbackParam = fftPlan->postCallbackParam;
+						xyPlan->postcallUserData = fftPlan->postcallUserData;
+					}
 
 					OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->2D planX failed" ) );
 				}
@@ -3959,6 +4094,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				colPlan->iDist    = fftPlan->oDist;
 				colPlan->oDist    = fftPlan->oDist;
 
+				//Set callback data if set on top level plan
+				if (fftPlan->hasPostCallback)
+				{
+					colPlan->hasPostCallback = true;
+					colPlan->postCallbackParam = fftPlan->postCallbackParam;
+					colPlan->postcallUserData = fftPlan->postcallUserData;
+				}
+
 				OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->1D planZ failed" ) );
 			}
 
diff --git a/src/library/plan.h b/src/library/plan.h
index 843a2ba..73caa8d 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -158,6 +158,11 @@ struct FFTKernelGenKeyParams {
 	bool fft_hasPreCallback;
 	clfftCallbackParam fft_preCallback;
 
+	bool fft_hasPostCallback;
+	clfftCallbackParam fft_postCallback;
+
+	cl_ulong   limit_LocalMemSize;
+
 	// Default constructor
 	FFTKernelGenKeyParams()
 	{
@@ -194,6 +199,8 @@ struct FFTKernelGenKeyParams {
 		blockLDS = 0;
         nonSquareKernelType = NON_SQUARE_TRANS_PARENT;
 		fft_hasPreCallback = false;
+		fft_hasPostCallback = false;
+		limit_LocalMemSize = 0;
 	}
 };
 
@@ -460,9 +467,13 @@ public:
 	BlockComputeType blockComputeType;
 
 	bool hasPreCallback;
+	bool hasPostCallback;
 
 	clfftCallbackParam preCallback;
+	clfftCallbackParam postCallbackParam;
+
 	cl_mem precallUserData;
+	cl_mem postcallUserData;
 
     clfftPlanHandle plHandle;
 
@@ -517,6 +528,7 @@ public:
     ,   nonSquareKernelType(NON_SQUARE_TRANS_PARENT)
     ,   plHandle(0)
 	,   hasPreCallback(false)
+	,   hasPostCallback(false)
 	{
 	};
 
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index e986800..00473ab 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -28,11 +28,12 @@ set( clFFT.Test.Source
 	 accuracy_test_pow7.cpp
 	 accuracy_test_mixed_radices.cpp
 	 accuracy_test_random.cpp
-	 accuracy_test_mixed_precallback.cpp
+	 accuracy_test_mixed_callback.cpp
 	 accuracy_test_pow2_precallback.cpp
 	 accuracy_test_pow3_precallback.cpp
 	 accuracy_test_pow5_precallback.cpp
 	 accuracy_test_pow7_precallback.cpp
+	 accuracy_test_postcallback.cpp
 	 gtest_main.cpp
 	 ${PROJECT_SOURCE_DIR}/client/openCL.misc.cpp
 	 c-compliance.c
diff --git a/src/tests/accuracy_test_common.h b/src/tests/accuracy_test_common.h
index 6312780..e15af87 100644
--- a/src/tests/accuracy_test_common.h
+++ b/src/tests/accuracy_test_common.h
@@ -204,6 +204,201 @@ void precallback_complex_to_complex( data_pattern pattern, direction::direction_
 	EXPECT_EQ( true, test_fft.result() == reference.result() );
 }
 
+/*****************************************************/
+/*****************************************************/
+// complex to complex transform with postcallback: the same problem is run through the clfft wrapper (test_fft) and the fftw wrapper (reference); test_fft.set_output_postcallback() is called before the transforms, reference.set_output_postcallback() after them, and the two results are expected to compare equal
+// dimension is inferred from lengths.size(); lengths must not be empty because &lengths[0] is taken
+// tightly packed is inferred from strides.empty() (empty stride vectors are passed to clfft as NULL); throws std::runtime_error for a pattern or direction not handled below
+template< class T, class cl_T, class fftw_T >
+void postcallback_complex_to_complex( data_pattern pattern, direction::direction_t direction,
+	std::vector<size_t> lengths, size_t batch,
+	std::vector<size_t> input_strides, std::vector<size_t> output_strides,
+	size_t input_distance, size_t output_distance,
+	layout::buffer_layout_t in_layout, layout::buffer_layout_t out_layout,
+	placeness::placeness_t placeness, T scale = 1.0f, bool hasUserDatatype = false )
+{
+	clfft<T, cl_T> test_fft( static_cast<clfftDim>(lengths.size()), &lengths[0],
+		input_strides.empty() ? NULL : &input_strides[0],
+		output_strides.empty() ? NULL : &output_strides[0],
+		batch, input_distance, output_distance,
+		cl_layout(in_layout), cl_layout(out_layout),
+		cl_placeness(placeness) );
+
+	fftw<T, fftw_T> reference( lengths.size(), &lengths[0], batch, c2c ); // only lengths, batch and the c2c kind are handed to the reference; strides, distances and layouts are not
+
+	//initialize input: both sides are filled with the same pattern so their input buffers can be compared below
+	if( pattern == sawtooth )
+	{
+		test_fft.set_input_to_sawtooth( 1.0f );
+		reference.set_data_to_sawtooth( 1.0f );
+	}
+	else if( pattern == value )
+	{
+		test_fft.set_input_to_value( 2.0f, 2.5f );
+		reference.set_all_data_to_value( 2.0f, 2.5f );
+	}
+	else if( pattern == impulse )
+	{
+		test_fft.set_input_to_impulse();
+		reference.set_data_to_impulse();
+	}
+	else if( pattern == erratic )
+	{
+		test_fft.set_input_to_random();
+		reference.set_data_to_random();
+	}
+	else
+	{
+		throw std::runtime_error( "invalid pattern type in complex_to_complex()" ); // message names complex_to_complex(), not this function
+	}
+
+	// if we're starting with unequal data, we're destined for failure
+	EXPECT_EQ( true, test_fft.input_buffer() == reference.input_buffer() );
+
+	//set postcallback values (on the clfft side only; the reference is adjusted after its transform)
+	if (hasUserDatatype)
+	{
+		//test_fft.set_input_precallback_userdatatype(); // NOTE(review): call is disabled, so with hasUserDatatype == true no callback is set on test_fft here while the reference below still gets set_output_postcallback() -- confirm this branch is intentionally unused
+	}
+	else
+	{
+		test_fft.set_output_postcallback();
+	}
+	
+	if( direction == direction::forward )
+	{
+		test_fft.set_forward_transform();
+		test_fft.forward_scale( scale );
+
+		reference.set_forward_transform();
+		reference.forward_scale( scale );
+	}
+	else if( direction == direction::backward )
+	{
+		test_fft.set_backward_transform();
+		test_fft.backward_scale( scale );
+
+		reference.set_backward_transform();
+		reference.backward_scale( scale );
+	}
+	else
+		throw std::runtime_error( "invalid direction in complex_to_complex()" ); // message names complex_to_complex(), not this function
+
+	reference.transform();
+	test_fft.transform();
+
+	reference.set_output_postcallback(); // called after reference.transform(); presumably reproduces on the host what the device postcallback does -- confirm in fftw_transform.h
+
+	EXPECT_EQ( true, test_fft.result() == reference.result() );
+}
+
+/*****************************************************/
+/*****************************************************/
+// complex to complex transform with pre and post callback: both a precallback and a postcallback are set on the clfft wrapper (test_fft); the fftw wrapper (reference) gets the matching precallback call before its transform and the matching postcallback call after it, then both results are expected to compare equal
+// dimension is inferred from lengths.size(); lengths must not be empty because &lengths[0] is taken
+// tightly packed is inferred from strides.empty() (empty stride vectors are passed to clfft as NULL); withLDS selects the variants that take a local memory size; throws std::runtime_error for a pattern or direction not handled below
+template< class T, class cl_T, class fftw_T >
+void pre_and_post_callback_complex_to_complex( data_pattern pattern, direction::direction_t direction,
+	std::vector<size_t> lengths, size_t batch,
+	std::vector<size_t> input_strides, std::vector<size_t> output_strides,
+	size_t input_distance, size_t output_distance,
+	layout::buffer_layout_t in_layout, layout::buffer_layout_t out_layout,
+	placeness::placeness_t placeness, T scale = 1.0f, bool withLDS = false)
+{
+	clfft<T, cl_T> test_fft( static_cast<clfftDim>(lengths.size()), &lengths[0],
+		input_strides.empty() ? NULL : &input_strides[0],
+		output_strides.empty() ? NULL : &output_strides[0],
+		batch, input_distance, output_distance,
+		cl_layout(in_layout), cl_layout(out_layout),
+		cl_placeness(placeness) );
+
+	fftw<T, fftw_T> reference( lengths.size(), &lengths[0], batch, c2c ); // only lengths, batch and the c2c kind are handed to the reference
+
+	//initialize input: both sides are filled with the same pattern so their input buffers can be compared below
+	if( pattern == sawtooth )
+	{
+		test_fft.set_input_to_sawtooth( 1.0f );
+		reference.set_data_to_sawtooth( 1.0f );
+	}
+	else if( pattern == value )
+	{
+		test_fft.set_input_to_value( 2.0f, 2.5f );
+		reference.set_all_data_to_value( 2.0f, 2.5f );
+	}
+	else if( pattern == impulse )
+	{
+		test_fft.set_input_to_impulse();
+		reference.set_data_to_impulse();
+	}
+	else if( pattern == erratic )
+	{
+		test_fft.set_input_to_random();
+		reference.set_data_to_random();
+	}
+	else
+	{
+		throw std::runtime_error( "invalid pattern type in complex_to_complex()" ); // message names complex_to_complex(), not this function
+	}
+
+	// if we're starting with unequal data, we're destined for failure
+	EXPECT_EQ( true, test_fft.input_buffer() == reference.input_buffer() );
+
+	//set callback values (the input comparison above happens before any precallback is applied to the reference)
+	if (withLDS)
+	{
+		unsigned int localMemSize = 64 * sizeof(T); // byte size of 64 elements of T, passed to both callbacks below
+		test_fft.set_input_precallback(localMemSize);
+		reference.set_input_precallback_special(); // presumably the host-side counterpart of the LDS precallback -- confirm in fftw_transform.h
+
+		test_fft.set_output_postcallback(localMemSize);
+	}
+	else
+	{
+		test_fft.set_input_precallback();
+		reference.set_input_precallback();
+
+		//set postcallback values (clfft side only; the reference is updated after its transform)
+		test_fft.set_output_postcallback();
+	}
+
+	if( direction == direction::forward )
+	{
+		test_fft.set_forward_transform();
+		test_fft.forward_scale( scale );
+
+		reference.set_forward_transform();
+		reference.forward_scale( scale );
+	}
+	else if( direction == direction::backward )
+	{
+		test_fft.set_backward_transform();
+		test_fft.backward_scale( scale );
+
+		reference.set_backward_transform();
+		reference.backward_scale( scale );
+	}
+	else
+		throw std::runtime_error( "invalid direction in complex_to_complex()" ); // message names complex_to_complex(), not this function
+
+	reference.transform();
+	test_fft.transform();
+
+	//update reference for postcallback, using the variant that matches the withLDS choice made above
+	if (withLDS)
+	{
+		reference.set_output_postcallback_special();
+	}
+	else
+	{
+		reference.set_output_postcallback();
+	}
+
+	EXPECT_EQ( true, test_fft.result() == reference.result() );
+}
+
+/*****************************************************/
+/*****************************************************/
+// complex to complex transform with precallback function that uses LDS
 template< class T, class cl_T, class fftw_T >
 void precallback_complex_to_complex_lds( data_pattern pattern, direction::direction_t direction,
 	std::vector<size_t> lengths, size_t batch,
@@ -284,6 +479,87 @@ void precallback_complex_to_complex_lds( data_pattern pattern, direction::direct
 
 /*****************************************************/
 /*****************************************************/
+// complex to complex transform with postcallback function that uses LDS: test_fft.set_output_postcallback(localMemSize) is called before the transforms and reference.set_output_postcallback_special() after them, then both results are expected to compare equal; throws std::runtime_error for a pattern or direction not handled below
+template< class T, class cl_T, class fftw_T >
+void postcallback_complex_to_complex_lds( data_pattern pattern, direction::direction_t direction,
+	std::vector<size_t> lengths, size_t batch,
+	std::vector<size_t> input_strides, std::vector<size_t> output_strides,
+	size_t input_distance, size_t output_distance,
+	layout::buffer_layout_t in_layout, layout::buffer_layout_t out_layout,
+	placeness::placeness_t placeness, T scale = 1.0f )
+{
+	clfft<T, cl_T> test_fft( static_cast<clfftDim>(lengths.size()), &lengths[0],
+		input_strides.empty() ? NULL : &input_strides[0],
+		output_strides.empty() ? NULL : &output_strides[0],
+		batch, input_distance, output_distance,
+		cl_layout(in_layout), cl_layout(out_layout),
+		cl_placeness(placeness) );
+
+	fftw<T, fftw_T> reference( lengths.size(), &lengths[0], batch, c2c ); // only lengths, batch and the c2c kind are handed to the reference
+
+	//initialize input: both sides are filled with the same pattern so their input buffers can be compared below
+	if( pattern == sawtooth )
+	{
+		test_fft.set_input_to_sawtooth( 1.0f );
+		reference.set_data_to_sawtooth( 1.0f );
+	}
+	else if( pattern == value )
+	{
+		test_fft.set_input_to_value( 2.0f, 2.5f );
+		reference.set_all_data_to_value( 2.0f, 2.5f );
+	}
+	else if( pattern == impulse )
+	{
+		test_fft.set_input_to_impulse();
+		reference.set_data_to_impulse();
+	}
+	else if( pattern == erratic )
+	{
+		test_fft.set_input_to_random();
+		reference.set_data_to_random();
+	}
+	else
+	{
+		throw std::runtime_error( "invalid pattern type in complex_to_complex()" ); // message names complex_to_complex(), not this function
+	}
+
+	// if we're starting with unequal data, we're destined for failure
+	EXPECT_EQ( true, test_fft.input_buffer() == reference.input_buffer() );
+
+	//set postcallback values (clfft side only; the reference is adjusted after its transform)
+	//Test assumes 64 length data (the size below is hard-coded and is not derived from lengths)
+	unsigned int localMemSize = 64 * sizeof(T); // byte size of 64 elements of T
+	test_fft.set_output_postcallback(localMemSize);
+		
+	if( direction == direction::forward )
+	{
+		test_fft.set_forward_transform();
+		test_fft.forward_scale( scale );
+
+		reference.set_forward_transform();
+		reference.forward_scale( scale );
+	}
+	else if( direction == direction::backward )
+	{
+		test_fft.set_backward_transform();
+		test_fft.backward_scale( scale );
+
+		reference.set_backward_transform();
+		reference.backward_scale( scale );
+	}
+	else
+		throw std::runtime_error( "invalid direction in complex_to_complex()" ); // message names complex_to_complex(), not this function
+
+	reference.transform();
+	test_fft.transform();
+
+	reference.set_output_postcallback_special(); // called after reference.transform(); presumably the host-side counterpart of the LDS postcallback -- confirm in fftw_transform.h
+
+	EXPECT_EQ( true, test_fft.result() == reference.result() );
+}
+
+/*****************************************************/
+/*****************************************************/
 // dimension is inferred from lengths.size()
 // tightly packed is inferred from strides.empty()
 // input layout is always real
@@ -409,6 +685,71 @@ void precallback_real_to_complex( data_pattern pattern,
 /*****************************************************/
 // dimension is inferred from lengths.size()
 // tightly packed is inferred from strides.empty()
+// input layout is always real (layout::real is passed to clfft); real to complex transform with postcallback: test_fft.set_output_postcallback() is called before the transforms, reference.set_output_postcallback() after them, and both results are expected to compare equal; throws std::runtime_error for a pattern not handled below
+template< class T, class cl_T, class fftw_T >
+void postcallback_real_to_complex( data_pattern pattern,
+	std::vector<size_t> lengths, size_t batch,
+	std::vector<size_t> input_strides, std::vector<size_t> output_strides,
+	size_t input_distance, size_t output_distance,
+	layout::buffer_layout_t out_layout,
+	placeness::placeness_t placeness,
+	T scale = 1.0f )
+{
+	clfft<T, cl_T> test_fft( static_cast<clfftDim>(lengths.size()), &lengths[0],
+		input_strides.empty() ? NULL : &input_strides[0],
+		output_strides.empty() ? NULL : &output_strides[0],
+		batch, input_distance, output_distance,
+		cl_layout(layout::real), cl_layout(out_layout),
+		cl_placeness(placeness) );
+
+	fftw<T, fftw_T> reference( lengths.size(), &lengths[0], batch, r2c ); // only lengths, batch and the r2c kind are handed to the reference
+
+	if( pattern == sawtooth ) // both sides are filled with the same pattern so their input buffers can be compared below
+	{
+		test_fft.set_input_to_sawtooth( 1.0f );
+		reference.set_data_to_sawtooth( 1.0f );
+	}
+	else if( pattern == value )
+	{
+		test_fft.set_input_to_value( 2.0f );
+		reference.set_all_data_to_value( 2.0f );
+	}
+	else if( pattern == impulse )
+	{
+		test_fft.set_input_to_impulse();
+		reference.set_data_to_impulse();
+	}
+	else if( pattern == erratic )
+	{
+		test_fft.set_input_to_random();
+		reference.set_data_to_random();
+	}
+	else
+	{
+		throw std::runtime_error( "invalid pattern type in real_to_complex()" ); // message names real_to_complex(), not this function
+	}
+
+	// if we're starting with unequal data, we're destined for failure
+	EXPECT_EQ( true, test_fft.input_buffer() == reference.input_buffer() );
+
+	test_fft.forward_scale( scale ); // no direction parameter here: only the forward scale is set on both sides
+	reference.forward_scale( scale );
+
+	//set postcallback values (clfft side only; the reference is adjusted after its transform)
+	test_fft.set_output_postcallback();
+	
+	test_fft.transform();
+	reference.transform();
+
+	reference.set_output_postcallback(); // called after reference.transform(); presumably reproduces on the host what the device postcallback does -- confirm in fftw_transform.h
+
+	EXPECT_EQ( true, test_fft.result() == reference.result() );
+}
+
+/*****************************************************/
+/*****************************************************/
+// dimension is inferred from lengths.size()
+// tightly packed is inferred from strides.empty()
 // output layout is always real
 template< class T, class cl_T, class fftw_T >
 void complex_to_real( data_pattern pattern,
@@ -532,6 +873,68 @@ void precallback_complex_to_real( data_pattern pattern,
 	EXPECT_EQ( true, test_fft.result() == reference.result() );
 }
 
+template< class T, class cl_T, class fftw_T > // complex to real transform with postcallback; output layout is always real (layout::real is passed to clfft); throws std::runtime_error for a pattern not handled below
+void postcallback_complex_to_real( data_pattern pattern,
+	std::vector<size_t> lengths, size_t batch,
+	std::vector<size_t> input_strides, std::vector<size_t> output_strides,
+	size_t input_distance, size_t output_distance,
+	layout::buffer_layout_t in_layout,
+	placeness::placeness_t placeness,
+	T scale = 1.0f )
+{
+	fftw<T, fftw_T> data_maker( lengths.size(), &lengths[0], batch, r2c ); // r2c helper whose transformed output becomes the input of both transforms under test
+
+	if( pattern == sawtooth )
+	{
+		data_maker.set_data_to_sawtooth(1.0f);
+	}
+	else if( pattern == value )
+	{
+		data_maker.set_all_data_to_value(2.0f);
+	}
+	else if( pattern == impulse )
+	{
+		data_maker.set_data_to_impulse();
+	}
+	else if( pattern == erratic )
+	{
+		data_maker.set_data_to_random();
+	}
+	else
+	{
+		throw std::runtime_error( "invalid pattern type in complex_to_real()" ); // message names complex_to_real(), not this function
+	}
+
+	data_maker.transform(); // produce the complex data that is fed to test_fft and reference below
+
+	clfft<T, cl_T> test_fft( static_cast<clfftDim>(lengths.size()), &lengths[0],
+		input_strides.empty() ? NULL : &input_strides[0],
+		output_strides.empty() ? NULL : &output_strides[0],
+		batch, input_distance, output_distance,
+		cl_layout(in_layout), cl_layout(layout::real),
+		cl_placeness(placeness) );
+	test_fft.set_input_to_buffer( data_maker.result() );
+
+	fftw<T, fftw_T> reference( lengths.size(), &lengths[0], batch, c2r ); // only lengths, batch and the c2r kind are handed to the reference
+	reference.set_input_to_buffer(data_maker.result());
+
+	// if we're starting with unequal data, we're destined for failure
+	EXPECT_EQ( true, test_fft.input_buffer() == reference.input_buffer() );
+
+	test_fft.backward_scale( scale ); // no direction parameter here: only the backward scale is set on both sides
+	reference.backward_scale( scale );
+
+	//set postcallback values (clfft side only; the reference is adjusted after its transform)
+	test_fft.set_output_postcallback();
+
+	test_fft.transform();
+	reference.transform();
+
+	reference.set_output_postcallback(); // called after reference.transform(); presumably reproduces on the host what the device postcallback does -- confirm in fftw_transform.h
+
+	EXPECT_EQ( true, test_fft.result() == reference.result() );
+}
+
 /*****************************************************/
 /*****************************************************/
 // dimension is inferred from lengths.size()
diff --git a/src/tests/accuracy_test_mixed_precallback.cpp b/src/tests/accuracy_test_mixed_callback.cpp
similarity index 50%
rename from src/tests/accuracy_test_mixed_precallback.cpp
rename to src/tests/accuracy_test_mixed_callback.cpp
index 0ecbd8e..511b09a 100644
--- a/src/tests/accuracy_test_mixed_precallback.cpp
+++ b/src/tests/accuracy_test_mixed_callback.cpp
@@ -27,10 +27,10 @@
 
 /*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
 /*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
-class accuracy_test_precallback_single : public ::testing::Test {
+class accuracy_test_callback_single : public ::testing::Test {
 protected:
-	accuracy_test_precallback_single(){}
-	virtual ~accuracy_test_precallback_single(){}
+	accuracy_test_callback_single(){}
+	virtual ~accuracy_test_callback_single(){}
 	virtual void SetUp(){}
 	virtual void TearDown(){
 	}
@@ -53,13 +53,21 @@ class mixed_radix_precallback : public ::testing::TestWithParam<size_t> {
 		virtual void TearDown(){}
 };
 
-class Supported_Fft_Sizes_precallback
+class mixed_radix_postcallback : public ::testing::TestWithParam<size_t> { // value-parameterized gtest fixture for the postcallback mixed-radix cases; the size_t parameter is read as the problem size via GetParam() in the TEST_P bodies
+	protected:
+		mixed_radix_postcallback(){}
+		virtual ~mixed_radix_postcallback(){}
+		virtual void SetUp(){} // no per-test setup
+		virtual void TearDown(){} // no per-test teardown
+};
+
+class Supported_Fft_Sizes_Callback
 {
 public:
 	std::vector<size_t> sizes;
 	const size_t max_mixed_radices_to_test;
 
-	Supported_Fft_Sizes_precallback()
+	Supported_Fft_Sizes_Callback()
 	: max_mixed_radices_to_test( 4096 )
 	{
 		size_t i=0, j=0, k=0, l=0;
@@ -100,15 +108,21 @@ public:
 			sumi *= 7;
 		}
 	}
-} supported_sizes_precallback;
+} supported_sizes_callback;
 
 INSTANTIATE_TEST_CASE_P(
 	mixed_radices_precallback,
 	mixed_radix_precallback,
-	::testing::ValuesIn( supported_sizes_precallback.sizes )
+	::testing::ValuesIn( supported_sizes_callback.sizes )
 );
 
-namespace precallback_mixed
+INSTANTIATE_TEST_CASE_P(
+	mixed_radices_postcallback,
+	mixed_radix_postcallback,
+	::testing::ValuesIn( supported_sizes_callback.sizes )
+);
+
+namespace callback_mixed
 {
 
 /**********************************************************************************************
@@ -117,7 +131,7 @@ namespace precallback_mixed
 #pragma region Complex_To_Complex
 
 template< typename T, typename cl_T, typename fftw_T >
-void mixed_radix_complex_to_complex( size_t problem_size )
+void mixed_radix_complex_to_complex_precallback( size_t problem_size )
 {
 	try
 	{
@@ -151,19 +165,63 @@ void mixed_radix_complex_to_complex( size_t problem_size )
 TEST_P( mixed_radix_precallback, single_precision_complex_to_complex_auto_generated ) {
 	size_t problem_size = GetParam();
 	RecordProperty("problem_size", (int)problem_size);
-	mixed_radix_complex_to_complex<float, cl_float, fftwf_complex>(problem_size);
+	mixed_radix_complex_to_complex_precallback<float, cl_float, fftwf_complex>(problem_size);
 }
 
 TEST_P( mixed_radix_precallback, double_precision_complex_to_complex_auto_generated ) {
 	size_t problem_size = GetParam();
 	RecordProperty("problem_size", (int)problem_size);
-	mixed_radix_complex_to_complex<double, cl_double, fftw_complex>(problem_size);
+	mixed_radix_complex_to_complex_precallback<double, cl_double, fftw_complex>(problem_size);
+}
+
+template< typename T, typename cl_T, typename fftw_T > // runs postcallback_complex_to_complex for one 1D problem of the given size with fixed settings (batch 1, planar in and out, in place, forward, sawtooth)
+void mixed_radix_complex_to_complex_postcallback( size_t problem_size )
+{
+	try
+	{
+		if(verbose) std::cout << "Now testing problem size " << problem_size << std::endl;
+
+		std::vector<size_t> lengths;
+		lengths.push_back( problem_size ); // single entry, so the transform is one dimensional
+		size_t batch = 1;
+
+		std::vector<size_t> input_strides; // left empty, so NULL strides are passed on to clfft
+		std::vector<size_t> output_strides;
+
+		size_t input_distance = 0;
+		size_t output_distance = 0;
+
+		layout::buffer_layout_t in_layout = layout::complex_planar;
+		layout::buffer_layout_t out_layout = layout::complex_planar;
+
+		placeness::placeness_t placeness = placeness::in_place;
+
+		direction::direction_t direction = direction::forward;
+
+		data_pattern pattern = sawtooth;
+		postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness ); // scale and hasUserDatatype keep their defaults
+	}
+	catch( const std::exception& err ) {
+		handle_exception(err); // any std::exception is handed to handle_exception instead of propagating
+	}
+}
+
+TEST_P( mixed_radix_postcallback, single_precision_complex_to_complex_auto_generated ) { // float instantiation, one run per value supplied to the fixture
+	size_t problem_size = GetParam();
+	RecordProperty("problem_size", (int)problem_size); // size is narrowed to int for the recorded property
+	mixed_radix_complex_to_complex_postcallback<float, cl_float, fftwf_complex>(problem_size);
+}
+
+TEST_P( mixed_radix_postcallback, double_precision_complex_to_complex_auto_generated ) { // double instantiation, one run per value supplied to the fixture
+	size_t problem_size = GetParam();
+	RecordProperty("problem_size", (int)problem_size); // size is narrowed to int for the recorded property
+	mixed_radix_complex_to_complex_postcallback<double, cl_double, fftw_complex>(problem_size);
+}
 
 // *****************************************************
 // *****************************************************
 template< class T, class cl_T, class fftw_T >
-void pow2_normal_1D_forward_in_place_complex_to_complex_userdatatype()
+void precall_normal_1D_forward_in_place_complex_to_complex_userdatatype()
 {
 	std::vector<size_t> lengths;
 	lengths.push_back( normal2 );
@@ -181,15 +239,15 @@ void pow2_normal_1D_forward_in_place_complex_to_complex_userdatatype()
 	precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness, 1.0f, true );
 }
 
-TEST_F(accuracy_test_precallback_single, pow2_normal_1D_forward_in_place_complex_to_complex_userdatatype)
+TEST_F(accuracy_test_callback_single, precall_normal_1D_forward_in_place_complex_to_complex_userdatatype)
 {
-	try { pow2_normal_1D_forward_in_place_complex_to_complex_userdatatype< float, cl_float, fftwf_complex >(); }
+	try { precall_normal_1D_forward_in_place_complex_to_complex_userdatatype< float, cl_float, fftwf_complex >(); }
 	catch( const std::exception& err ) { handle_exception(err);	}
 }
 
 //Precallback with LDS
 template< class T, class cl_T, class fftw_T >
-void lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved()
+void precall_lds_1D_forward_64_in_place_complex_to_complex()
 {
 	std::vector<size_t> lengths;
 	lengths.push_back( 64 );
@@ -207,9 +265,60 @@ void lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved()
 	precallback_complex_to_complex_lds<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
 }
 
-TEST_F(accuracy_test_precallback_single, lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved)
+TEST_F(accuracy_test_callback_single, precall_lds_1D_forward_64_in_place_complex_to_complex)
+{
+	try { precall_lds_1D_forward_64_in_place_complex_to_complex< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+//Postcallback with LDS: runs postcallback_complex_to_complex_lds on a single 1D length-64 problem (batch 1, interleaved in and out, in place, forward, impulse input)
+template< class T, class cl_T, class fftw_T >
+void postcall_lds_1D_forward_64_in_place_complex_to_complex()
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( 64 ); // matches the 64-element local memory size hard-coded in postcallback_complex_to_complex_lds
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // left empty, so NULL strides are passed on to clfft
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = impulse;
+	postcallback_complex_to_complex_lds<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness ); // scale keeps its default
+}
+
+TEST_F(accuracy_test_callback_single, postcall_lds_1D_forward_64_in_place_complex_to_complex) // single precision only
+{
+	try { postcall_lds_1D_forward_64_in_place_complex_to_complex< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	} // any std::exception is handed to handle_exception instead of propagating
+}
+
+template< class T, class cl_T, class fftw_T > // runs pre_and_post_callback_complex_to_complex with withLDS set on a single 1D length-64 problem (batch 1, interleaved in and out, in place, forward, impulse input)
+void pre_and_post_callback_lds_1D_forward_64_in_place_complex_to_complex()
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( 64 ); // matches the 64-element local memory size hard-coded in the withLDS branch of the callee
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // left empty, so NULL strides are passed on to clfft
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = impulse;
+	pre_and_post_callback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness, 1.0f, true ); // trailing arguments: scale 1.0f, withLDS true
+}
+
+TEST_F(accuracy_test_callback_single, pre_and_post_callback_lds_1D_forward_64_in_place_complex_to_complex)
 {
-	try { lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	try { pre_and_post_callback_lds_1D_forward_64_in_place_complex_to_complex< float, cl_float, fftwf_complex >(); }
 	catch( const std::exception& err ) { handle_exception(err);	}
 }
 
@@ -221,7 +330,7 @@ TEST_F(accuracy_test_precallback_single, lds_1D_forward_64_in_place_complex_inte
 #pragma region Complex_To_Real
 
 template< typename T, typename cl_T, typename fftw_T >
-void mixed_radix_hermitian_to_real( size_t problem_size )
+void mixed_radix_hermitian_to_real_precallback( size_t problem_size )
 {
 	try
 	{
@@ -252,13 +361,54 @@ void mixed_radix_hermitian_to_real( size_t problem_size )
 TEST_P( mixed_radix_precallback, single_precision_hermitian_to_real_auto_generated ) {
 	size_t problem_size = GetParam();
 	RecordProperty("problem_size", (int)problem_size);
-	mixed_radix_hermitian_to_real<float, cl_float, fftwf_complex>(problem_size);
+	mixed_radix_hermitian_to_real_precallback<float, cl_float, fftwf_complex>(problem_size);
 }
 
 TEST_P( mixed_radix_precallback, double_precision_hermitian_to_real_auto_generated ) {
 	size_t problem_size = GetParam();
 	RecordProperty("problem_size", (int)problem_size);
-	mixed_radix_hermitian_to_real<double, cl_double, fftw_complex>(problem_size);
+	mixed_radix_hermitian_to_real_precallback<double, cl_double, fftw_complex>(problem_size);
+}
+
+template< typename T, typename cl_T, typename fftw_T > // runs postcallback_complex_to_real for one 1D problem of the given size with fixed settings (batch 1, hermitian interleaved input, in place, sawtooth)
+void mixed_radix_hermitian_to_real_postcallback( size_t problem_size )
+{
+	try
+	{
+		if(verbose) std::cout << "Now testing problem size " << problem_size << std::endl;
+
+		std::vector<size_t> lengths;
+		lengths.push_back( problem_size ); // single entry, so the transform is one dimensional
+		size_t batch = 1;
+
+		std::vector<size_t> input_strides; // left empty, so NULL strides are passed on to clfft
+		std::vector<size_t> output_strides;
+
+		size_t input_distance = 0;
+		size_t output_distance = 0;
+
+		layout::buffer_layout_t layout = layout::hermitian_interleaved; // passed as the in_layout argument of postcallback_complex_to_real
+
+		placeness::placeness_t placeness = placeness::in_place;
+
+		data_pattern pattern = sawtooth;
+		postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // scale keeps its default
+	}
+	catch( const std::exception& err ) {
+		handle_exception(err); // any std::exception is handed to handle_exception instead of propagating
+	}
+}
+
+TEST_P( mixed_radix_postcallback, single_precision_hermitian_to_real_auto_generated ) { // float instantiation, one run per value supplied to the fixture
+	size_t problem_size = GetParam();
+	RecordProperty("problem_size", (int)problem_size); // size is narrowed to int for the recorded property
+	mixed_radix_hermitian_to_real_postcallback<float, cl_float, fftwf_complex>(problem_size);
+}
+
+TEST_P( mixed_radix_postcallback, double_precision_hermitian_to_real_auto_generated ) { // double instantiation, one run per value supplied to the fixture
+	size_t problem_size = GetParam();
+	RecordProperty("problem_size", (int)problem_size); // size is narrowed to int for the recorded property
+	mixed_radix_hermitian_to_real_postcallback<double, cl_double, fftw_complex>(problem_size);
+}
 
 #pragma endregion
@@ -269,7 +419,7 @@ TEST_P( mixed_radix_precallback, double_precision_hermitian_to_real_auto_generat
 #pragma region Real_To_Complex
 
 template< typename T, typename cl_T, typename fftw_T >
-void mixed_radix_real_to_hermitian( size_t problem_size )
+void mixed_radix_real_to_hermitian_precallback( size_t problem_size )
 {
 	try
 	{
@@ -300,13 +450,54 @@ void mixed_radix_real_to_hermitian( size_t problem_size )
 TEST_P( mixed_radix_precallback, single_precision_real_to_hermitian_auto_generated ) {
 	size_t problem_size = GetParam();
 	RecordProperty("problem_size", (int)problem_size);
-	mixed_radix_real_to_hermitian<float, cl_float, fftwf_complex>(problem_size);
+	mixed_radix_real_to_hermitian_precallback<float, cl_float, fftwf_complex>(problem_size);
 }
 
 TEST_P( mixed_radix_precallback, double_precision_real_to_hermitian_auto_generated ) {
 	size_t problem_size = GetParam();
 	RecordProperty("problem_size", (int)problem_size);
-	mixed_radix_real_to_hermitian<double, cl_double, fftw_complex>(problem_size);
+	mixed_radix_real_to_hermitian_precallback<double, cl_double, fftw_complex>(problem_size);
+}
+
+template< typename T, typename cl_T, typename fftw_T >
+void mixed_radix_real_to_hermitian_postcallback( size_t problem_size )
+{	// Builds a 1D, batch-1, in-place, hermitian_interleaved sawtooth case of length problem_size and hands it to postcallback_real_to_complex.
+	try
+	{
+		if(verbose) std::cout << "Now testing problem size " << problem_size << std::endl;
+
+		std::vector<size_t> lengths;
+		lengths.push_back( problem_size );
+		size_t batch = 1;
+
+		std::vector<size_t> input_strides;	// left empty; NOTE(review): presumably lets the helper choose default strides -- confirm
+		std::vector<size_t> output_strides;
+
+		size_t input_distance = 0;	// NOTE(review): 0 presumably means "use the default distance" -- confirm against the helper
+		size_t output_distance = 0;
+
+		layout::buffer_layout_t layout = layout::hermitian_interleaved;
+
+		placeness::placeness_t placeness = placeness::in_place;
+
+		data_pattern pattern = sawtooth;
+		postcallback_real_to_complex<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
+	}
+	catch( const std::exception& err ) {	// any std::exception raised above is passed to handle_exception
+		handle_exception(err);
+	}
+}
+
+TEST_P( mixed_radix_postcallback, single_precision_real_to_hermitian_auto_generated ) {
+	size_t problem_size = GetParam();
+	RecordProperty("problem_size", (int)problem_size);
+	mixed_radix_real_to_hermitian_postcallback<float, cl_float, fftwf_complex>(problem_size);
+}
+
+TEST_P( mixed_radix_postcallback, double_precision_real_to_hermitian_auto_generated ) {
+	size_t problem_size = GetParam();
+	RecordProperty("problem_size", (int)problem_size);
+	mixed_radix_real_to_hermitian_postcallback<double, cl_double, fftw_complex>(problem_size);
 }
 
 #pragma endregion
diff --git a/src/tests/accuracy_test_postcallback.cpp b/src/tests/accuracy_test_postcallback.cpp
new file mode 100644
index 0000000..85c6c67
--- /dev/null
+++ b/src/tests/accuracy_test_postcallback.cpp
@@ -0,0 +1,2738 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <gtest/gtest.h>
+#include<math.h>
+
+#include "test_constants.h"
+#include "fftw_transform.h"
+#include "cl_transform.h"
+#include "typedefs.h"
+#include "accuracy_test_common.h"
+#include <stdexcept>
+#include <vector>
+
+/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
+/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
+class accuracy_test_postcallback_single : public ::testing::Test {	// fixture named by the TEST_F cases that instantiate the helpers with float / cl_float / fftwf_complex
+protected:
+	accuracy_test_postcallback_single(){}
+	virtual ~accuracy_test_postcallback_single(){}
+	virtual void SetUp(){}	// no per-test setup
+	virtual void TearDown(){	// no per-test teardown
+	}
+};
+
+/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
+/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
+class accuracy_test_postcallback_double : public ::testing::Test {	// fixture named by the TEST_F cases that instantiate the helpers with double / cl_double / fftw_complex
+protected:
+	accuracy_test_postcallback_double(){}
+	virtual ~accuracy_test_postcallback_double(){}
+	virtual void SetUp(){}	// no per-test setup
+	virtual void TearDown(){	// no per-test teardown
+	}
+};
+
+namespace postcallback
+{
+
+#pragma region Complex_To_Complex
+
+// *****************************************************
+// *****************************************************
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{	// Case: 1D length normal2, batch 1, forward, in-place, complex_interleaved -> complex_interleaved, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( normal2 );	// size constant defined outside this chunk
+	size_t batch = 1;
+	std::vector<size_t> input_strides;	// left empty; NOTE(review): presumably selects default strides in the helper -- confirm
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_1D_backward_in_place_complex_interleaved_to_complex_interleaved()
+{	// Case: 1D length normal2, batch 1, backward, in-place, complex_interleaved -> complex_interleaved, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( normal2 );	// size constant defined outside this chunk
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::backward;	// the only setting that differs from the forward in-place interleaved case
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_backward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_1D_backward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_backward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_1D_backward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+// *****************************************************
+// *****************************************************
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_1D_forward_out_of_place_complex_interleaved_to_complex_interleaved()
+{	// Case: 1D length normal2, batch 1, forward, out-of-place, complex_interleaved -> complex_interleaved, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( normal2 );	// size constant defined outside this chunk
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::out_of_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_forward_out_of_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_1D_forward_out_of_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_forward_out_of_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_1D_forward_out_of_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+// *****************************************************
+// *****************************************************
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_1D_forward_out_of_place_complex_planar_to_complex_planar()
+{	// Case: 1D length normal2, batch 1, forward, out-of-place, complex_planar -> complex_planar, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( normal2 );	// size constant defined outside this chunk
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_planar;
+	layout::buffer_layout_t out_layout = layout::complex_planar;
+	placeness::placeness_t placeness = placeness::out_of_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_forward_out_of_place_complex_planar_to_complex_planar)
+{
+	try { pow2_normal_1D_forward_out_of_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_forward_out_of_place_complex_planar_to_complex_planar)
+{
+	try { pow2_normal_1D_forward_out_of_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+// *****************************************************
+// *****************************************************
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_1D_forward_in_place_complex_planar_to_complex_planar()
+{	// Case: 1D length normal2, batch 1, forward, in-place, complex_planar -> complex_planar, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( normal2 );	// size constant defined outside this chunk
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_planar;
+	layout::buffer_layout_t out_layout = layout::complex_planar;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow2_normal_1D_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow2_normal_1D_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_1D_verysmall_forward_in_place_complex_interleaved_to_complex_interleaved()
+{	// Case: 1D length 2, batch 1, forward, in-place, complex_interleaved -> complex_interleaved, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( 2 );	// literal length 2 rather than one of the named size constants used by the neighbouring cases
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_verysmall_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_1D_verysmall_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_verysmall_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_1D_verysmall_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_1D_non_unit_stride_and_distance_complex_to_complex()
+{	// Case: 1D length normal2, batch 2, forward, out-of-place, complex_planar -> complex_interleaved, with explicit strides and distances.
+	std::vector<size_t> lengths;
+	lengths.push_back( normal2 );	// size constant defined outside this chunk
+	size_t batch = 2;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	input_strides.push_back( 42 );	// explicit stride of 42 for the single dimension
+	output_strides.push_back( 42 );
+	size_t input_distance = lengths[lengths.size()-1] * input_strides[input_strides.size()-1] + 14;	// last length * last stride, plus a constant 14
+	size_t output_distance = lengths[lengths.size()-1] * output_strides[output_strides.size()-1] + 14;	// same formula on the output side
+	layout::buffer_layout_t in_layout = layout::complex_planar;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::out_of_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_non_unit_stride_and_distance_complex_to_complex)
+{
+	try { pow2_normal_1D_non_unit_stride_and_distance_complex_to_complex< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_non_unit_stride_and_distance_complex_to_complex)
+{
+	try { pow2_normal_1D_non_unit_stride_and_distance_complex_to_complex< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void mixed_normal_1D_val_forward_in_place_complex_interleaved_to_complex_interleaved()
+{	// Case: 1D length 100, batch 1, forward, in-place, complex_interleaved -> complex_interleaved, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( 100 );	// 100 = 2^2 * 5^2, i.e. a length that is not a power of a single prime
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, mixed_normal_1D_val_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { mixed_normal_1D_val_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, mixed_normal_1D_val_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { mixed_normal_1D_val_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+// *****************************************************
+// *****************************************************
+template< class T, class cl_T, class fftw_T >
+void pow3_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{	// Case: 1D length normal3, batch 1, forward, in-place, complex_interleaved -> complex_interleaved, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( normal3 );	// size constant defined outside this chunk
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow3_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow3_normal_1D_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow3_small_1D_forward_in_place_complex_planar_to_complex_planar()
+{	// Case: 1D length small3, batch 1, forward, in-place, complex_planar -> complex_planar, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( small3 );	// size constant defined outside this chunk
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_planar;
+	layout::buffer_layout_t out_layout = layout::complex_planar;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_small_1D_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow3_small_1D_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_small_1D_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow3_small_1D_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+// *****************************************************
+// *****************************************************
+template< class T, class cl_T, class fftw_T >
+void pow5_normal_1D_forward_out_of_place_complex_interleaved_to_complex_planar()
+{	// Case: 1D length normal5, batch 1, forward, out-of-place, complex_interleaved -> complex_planar, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( normal5 );	// size constant defined outside this chunk
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_planar;	// output layout differs from the input layout in this case
+	placeness::placeness_t placeness = placeness::out_of_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_normal_1D_forward_out_of_place_complex_interleaved_to_complex_planar)
+{
+	try { pow5_normal_1D_forward_out_of_place_complex_interleaved_to_complex_planar< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_normal_1D_forward_out_of_place_complex_interleaved_to_complex_planar)
+{
+	try { pow5_normal_1D_forward_out_of_place_complex_interleaved_to_complex_planar< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{	// Case: 1D length large2, batch 1, forward, in-place, complex_interleaved -> complex_interleaved, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( large2 );	// size constant defined outside this chunk
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_backward_in_place_complex_interleaved_to_complex_interleaved()
+{	// Case: 1D length large2, batch 1, backward, in-place, complex_interleaved -> complex_interleaved, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( large2 );	// size constant defined outside this chunk
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::backward;	// the only setting that differs from the forward large2 interleaved case
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_backward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_large_1D_backward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_backward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_large_1D_backward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_withEndTranpose_forward_in_place_complex_interleaved_to_complex_interleaved()	// NOTE(review): "Tranpose" looks like a typo for "Transpose"; kept because the matching TEST_F names use the same spelling
+{	// Case: 1D length 131072, batch 1, forward, in-place, complex_interleaved -> complex_interleaved, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( 131072 );	// 131072 = 2^17
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_withEndTranpose_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_large_1D_withEndTranpose_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_withEndTranpose_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_large_1D_withEndTranpose_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_forward_in_place_complex_planar_to_complex_planar()
+{	// Case: 1D length large2, batch 1, forward, in-place, complex_planar -> complex_planar, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( large2 );	// size constant defined outside this chunk
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_planar;
+	layout::buffer_layout_t out_layout = layout::complex_planar;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow2_large_1D_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow2_large_1D_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_withEndTranpose_forward_out_of_place_complex_planar_to_complex_planar()	// NOTE(review): "Tranpose" looks like a typo for "Transpose"; kept because the matching TEST_F names use the same spelling
+{	// Case: 1D length 131072, batch 1, forward, out-of-place, complex_planar -> complex_planar, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( 131072 );	// 131072 = 2^17
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_planar;
+	layout::buffer_layout_t out_layout = layout::complex_planar;
+	placeness::placeness_t placeness = placeness::out_of_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_withEndTranpose_forward_out_of_place_complex_planar_to_complex_planar)
+{
+	try { pow2_large_1D_withEndTranpose_forward_out_of_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_withEndTranpose_forward_out_of_place_complex_planar_to_complex_planar)
+{
+	try { pow2_large_1D_withEndTranpose_forward_out_of_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_16M_forward_in_place_complex_interleaved_to_complex_interleaved()
+{	// Case: 1D length 16777216, batch 1, forward, in-place, complex_interleaved -> complex_interleaved, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( 16777216 );	// 16777216 = 2^24 (the "16M" in the function name)
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_16M_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_large_1D_16M_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_16M_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_large_1D_16M_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_1M_forward_in_place_complex_planar_to_complex_planar()
+{	// Case: 1D length 1048576, batch 1, forward, in-place, complex_planar -> complex_planar, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( 1048576 );	// 1048576 = 2^20 (the "1M" in the function name)
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_planar;
+	layout::buffer_layout_t out_layout = layout::complex_planar;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_1M_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow2_large_1D_1M_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_1M_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow2_large_1D_1M_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow3_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{	// Case: 1D length large3, batch 1, forward, in-place, complex_interleaved -> complex_interleaved, sawtooth input.
+	std::vector<size_t> lengths;
+	lengths.push_back( large3 );	// size constant defined outside this chunk; this is what distinguishes the case from the normal3 variant
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	// Single-precision run of the large3-length helper, matching this test's name and its double-precision twin.
+	try { pow3_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); } catch( const std::exception& err ) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow3_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+// *****************************************************
+// *****************************************************
+template< class T, class cl_T, class fftw_T >
+void pow5_small_1D_backward_in_place_complex_interleaved_to_complex_interleaved()
+{
+	// 1D, length small5, batch 1, interleaved -> interleaved, in place, backward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( small5 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::backward;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t src_layout = layout::complex_interleaved;
+	layout::buffer_layout_t dst_layout = layout::complex_interleaved;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_small_1D_backward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow5_small_1D_backward_in_place_complex_interleaved_to_complex_interleaved<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_small_1D_backward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow5_small_1D_backward_in_place_complex_interleaved_to_complex_interleaved<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow5_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{
+	// 1D, length large5, batch 1, interleaved -> interleaved, in place, forward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( large5 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::forward;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t src_layout = layout::complex_interleaved;
+	layout::buffer_layout_t dst_layout = layout::complex_interleaved;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow5_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow5_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow5_large_1D_forward_in_place_complex_planar_to_complex_planar()
+{
+	// 1D, length large5, batch 1, planar -> planar, in place, forward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( large5 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::forward;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t src_layout = layout::complex_planar;
+	layout::buffer_layout_t dst_layout = layout::complex_planar;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_large_1D_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow5_large_1D_forward_in_place_complex_planar_to_complex_planar<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_large_1D_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow5_large_1D_forward_in_place_complex_planar_to_complex_planar<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow7_normal_1D_backward_out_of_place_complex_planar_to_complex_planar()
+{
+	// 1D, length normal7, batch 1, planar -> planar, out of place, backward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( normal7 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::backward;
+	placeness::placeness_t place = placeness::out_of_place;
+	layout::buffer_layout_t src_layout = layout::complex_planar;
+	layout::buffer_layout_t dst_layout = layout::complex_planar;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow7_normal_1D_backward_out_of_place_complex_planar_to_complex_planar)
+{
+	try { pow7_normal_1D_backward_out_of_place_complex_planar_to_complex_planar<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow7_normal_1D_backward_out_of_place_complex_planar_to_complex_planar)
+{
+	try { pow7_normal_1D_backward_out_of_place_complex_planar_to_complex_planar<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow7_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{
+	// 1D, length large7, batch 1, interleaved -> interleaved, in place, forward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( large7 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::forward;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t src_layout = layout::complex_interleaved;
+	layout::buffer_layout_t dst_layout = layout::complex_interleaved;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow7_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow7_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow7_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow7_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_2D_array_complex_to_complex_with_odd_batch_size()
+{
+	// 2D, lengths normal2 x small2, batch 5, planar -> planar, in place, forward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( normal2 );
+	dims.push_back( small2 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::forward;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t src_layout = layout::complex_planar;
+	layout::buffer_layout_t dst_layout = layout::complex_planar;
+	size_t batch_count = 5;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_2D_array_complex_to_complex_with_odd_batch_size)
+{
+	try { pow2_normal_2D_array_complex_to_complex_with_odd_batch_size<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_2D_array_complex_to_complex_with_odd_batch_size)
+{
+	try { pow2_normal_2D_array_complex_to_complex_with_odd_batch_size<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{
+	// 2D, lengths normal2 x normal2, batch 1, interleaved -> interleaved, in place, forward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( normal2 );
+	dims.push_back( normal2 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::forward;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t src_layout = layout::complex_interleaved;
+	layout::buffer_layout_t dst_layout = layout::complex_interleaved;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_small_2D_forward_out_of_place_complex_interleaved_to_complex_planar()
+{
+	// 2D, lengths small2 x 8, batch 1, interleaved -> planar, out of place, forward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( small2 );
+	dims.push_back( 8 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::forward;
+	placeness::placeness_t place = placeness::out_of_place;
+	layout::buffer_layout_t src_layout = layout::complex_interleaved;
+	layout::buffer_layout_t dst_layout = layout::complex_planar;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_small_2D_forward_out_of_place_complex_interleaved_to_complex_planar)
+{
+	try { pow2_small_2D_forward_out_of_place_complex_interleaved_to_complex_planar<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_small_2D_forward_out_of_place_complex_interleaved_to_complex_planar)
+{
+	try { pow2_small_2D_forward_out_of_place_complex_interleaved_to_complex_planar<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_2D_backward_in_place_complex_planar_to_complex_planar()
+{
+	// 2D, lengths MaxLength2D<T>(2) x normal2, batch 1, planar -> planar, in place, backward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( MaxLength2D<T>(2) );
+	dims.push_back( normal2 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::backward;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t src_layout = layout::complex_planar;
+	layout::buffer_layout_t dst_layout = layout::complex_planar;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_2D_backward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow2_large_2D_backward_in_place_complex_planar_to_complex_planar<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_2D_backward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow2_large_2D_backward_in_place_complex_planar_to_complex_planar<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow3_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{
+	// 2D, lengths normal3 x normal3, batch 1, interleaved -> interleaved, in place, forward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( normal3 );
+	dims.push_back( normal3 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::forward;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t src_layout = layout::complex_interleaved;
+	layout::buffer_layout_t dst_layout = layout::complex_interleaved;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow3_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow3_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow5_large_2D_backward_out_of_place_complex_planar_to_complex_planar()
+{
+	// 2D, lengths MaxLength2D<T>(5) x normal5, batch 1, planar -> planar, out of place, backward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( MaxLength2D<T>(5) );
+	dims.push_back( normal5 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::backward;
+	placeness::placeness_t place = placeness::out_of_place;
+	layout::buffer_layout_t src_layout = layout::complex_planar;
+	layout::buffer_layout_t dst_layout = layout::complex_planar;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_large_2D_backward_out_of_place_complex_planar_to_complex_planar)
+{
+	try { pow5_large_2D_backward_out_of_place_complex_planar_to_complex_planar<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_large_2D_backward_out_of_place_complex_planar_to_complex_planar)
+{
+	try { pow5_large_2D_backward_out_of_place_complex_planar_to_complex_planar<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow7_small_2D_backward_out_of_place_complex_interleaved_to_complex_planar()
+{
+	// 2D, lengths small7 x small7, batch 1, interleaved -> planar, out of place, backward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( small7 );
+	dims.push_back( small7 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::backward;
+	placeness::placeness_t place = placeness::out_of_place;
+	layout::buffer_layout_t src_layout = layout::complex_interleaved;
+	layout::buffer_layout_t dst_layout = layout::complex_planar;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow7_small_2D_backward_out_of_place_complex_interleaved_to_complex_planar)
+{
+	try { pow7_small_2D_backward_out_of_place_complex_interleaved_to_complex_planar<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow7_small_2D_backward_out_of_place_complex_interleaved_to_complex_planar)
+{
+	try { pow7_small_2D_backward_out_of_place_complex_interleaved_to_complex_planar<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_3D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{
+	// 3D, lengths normal2 x small2 x small2, batch 1, interleaved -> interleaved, in place, forward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( normal2 );
+	dims.push_back( small2 );
+	dims.push_back( small2 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::forward;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t src_layout = layout::complex_interleaved;
+	layout::buffer_layout_t dst_layout = layout::complex_interleaved;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_3D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_3D_forward_in_place_complex_interleaved_to_complex_interleaved<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_3D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_3D_forward_in_place_complex_interleaved_to_complex_interleaved<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow3_normal_3D_forward_in_place_complex_planar_to_complex_planar()
+{
+	// 3D, lengths normal3 x small3 x small3, batch 1, planar -> planar, in place, forward, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( normal3 );
+	dims.push_back( small3 );
+	dims.push_back( small3 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::forward;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t src_layout = layout::complex_planar;
+	layout::buffer_layout_t dst_layout = layout::complex_planar;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_normal_3D_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow3_normal_3D_forward_in_place_complex_planar_to_complex_planar<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_normal_3D_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { pow3_normal_3D_forward_in_place_complex_planar_to_complex_planar<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pre_and_post_callback_normal_complex_to_complex()
+{
+	// Combined pre/post-callback driver: 1D, length normal2, batch 1, interleaved -> interleaved, in place, forward, sawtooth.
+	std::vector<size_t> dims;
+	dims.push_back( normal2 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::forward;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t src_layout = layout::complex_interleaved;
+	layout::buffer_layout_t dst_layout = layout::complex_interleaved;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	pre_and_post_callback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pre_and_post_callback_normal_complex_to_complex)
+{
+	try { pre_and_post_callback_normal_complex_to_complex<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pre_and_post_callback_normal_complex_to_complex)
+{
+	try { pre_and_post_callback_normal_complex_to_complex<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pre_and_post_callback_singlepass_complex_to_complex()
+{
+	// Combined pre/post-callback driver: 1D, length 2, batch 1, interleaved -> interleaved, in place, forward, sawtooth.
+	std::vector<size_t> dims;
+	dims.push_back( 2 );
+	data_pattern signal = sawtooth;
+	direction::direction_t dir = direction::forward;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t src_layout = layout::complex_interleaved;
+	layout::buffer_layout_t dst_layout = layout::complex_interleaved;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	pre_and_post_callback_complex_to_complex<T, cl_T, fftw_T>( signal, dir, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, src_layout, dst_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pre_and_post_callback_singlepass_complex_to_complex)
+{
+	try { pre_and_post_callback_singlepass_complex_to_complex<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pre_and_post_callback_singlepass_complex_to_complex)
+{
+	try { pre_and_post_callback_singlepass_complex_to_complex<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+#pragma endregion
+
+#pragma region Real_To_Complex
+
+// *****************************************************
+// *****************************************************
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_1D_in_place_real_to_hermitian_interleaved()
+{
+	// Real-to-complex, 1D, length normal2, batch 1, hermitian_interleaved, in place, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( normal2 );
+	data_pattern signal = sawtooth;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t herm_layout = layout::hermitian_interleaved;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( signal, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, herm_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow2_normal_1D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow2_normal_1D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_1D_out_of_place_real_to_hermitian_planar()
+{
+	// Real-to-complex, 1D, length normal2, batch 1, hermitian_planar, out of place, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( normal2 );
+	data_pattern signal = sawtooth;
+	placeness::placeness_t place = placeness::out_of_place;
+	layout::buffer_layout_t herm_layout = layout::hermitian_planar;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( signal, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, herm_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_out_of_place_real_to_hermitian_planar)
+{
+	try { pow2_normal_1D_out_of_place_real_to_hermitian_planar<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_out_of_place_real_to_hermitian_planar)
+{
+	try { pow2_normal_1D_out_of_place_real_to_hermitian_planar<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_small_1D_in_place_real_to_hermitian_interleaved()
+{
+	// Real-to-complex, 1D, length small2, batch 1, hermitian_interleaved, in place, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( small2 );
+	data_pattern signal = sawtooth;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t herm_layout = layout::hermitian_interleaved;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( signal, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, herm_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_small_1D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow2_small_1D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_small_1D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow2_small_1D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_1D_array_real_to_hermitian()
+{
+	// Real-to-complex, 1D, length normal2, batch 8, hermitian_interleaved, in place, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( normal2 );
+	data_pattern signal = sawtooth;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t herm_layout = layout::hermitian_interleaved;
+	size_t batch_count = 8;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( signal, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, herm_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_array_real_to_hermitian)
+{
+	try { pow2_normal_1D_array_real_to_hermitian<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_array_real_to_hermitian)
+{
+	try { pow2_normal_1D_array_real_to_hermitian<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex()
+{
+	// Real-to-complex, 1D, length 4, batch 2, hermitian_interleaved, out of place, impulse input.
+	std::vector<size_t> dims;
+	dims.push_back( 4 );
+	size_t batch_count = 2;
+
+	// Input side: stride 16; distance is last length * last stride + 128.
+	std::vector<size_t> src_strides;
+	src_strides.push_back( 16 );
+	size_t src_distance = dims.back() * src_strides.back() + 128;
+
+	// Output side: stride 2; distance is last length * last stride + 2.
+	std::vector<size_t> dst_strides;
+	dst_strides.push_back( 2 );
+	size_t dst_distance = dims.back() * dst_strides.back() + 2;
+
+	data_pattern signal = impulse;
+	placeness::placeness_t place = placeness::out_of_place;
+	layout::buffer_layout_t herm_layout = layout::hermitian_interleaved;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( signal, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, herm_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex)
+{
+	try { pow2_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex)
+{
+	try { pow2_very_small_1D_out_of_place_different_input_output_non_unit_stride_and_distance_real_to_complex<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow3_normal_1D_out_of_place_real_to_hermitian_interleaved()
+{
+	// Real-to-complex, 1D, length normal3, batch 1, hermitian_interleaved, out of place, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( normal3 );
+	data_pattern signal = sawtooth;
+	placeness::placeness_t place = placeness::out_of_place;
+	layout::buffer_layout_t herm_layout = layout::hermitian_interleaved;
+	size_t batch_count = 1;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( signal, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, herm_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_normal_1D_out_of_place_real_to_hermitian_interleaved)
+{
+	try { pow3_normal_1D_out_of_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// single-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_normal_1D_out_of_place_real_to_hermitian_interleaved)
+{
+	try { pow3_normal_1D_out_of_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double-precision instantiation
+	catch( const std::exception& ex ) { handle_exception( ex ); }	// hand std::exception failures to handle_exception
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow5_normal_1D_array_real_to_hermitian_with_odd_batch_size()
+{
+	// Real-to-complex, 1D, length normal5, batch 5, hermitian_interleaved, in place, sawtooth; empty strides, zero distances.
+	std::vector<size_t> dims;
+	dims.push_back( normal5 );
+	data_pattern signal = sawtooth;
+	placeness::placeness_t place = placeness::in_place;
+	layout::buffer_layout_t herm_layout = layout::hermitian_interleaved;
+	size_t batch_count = 5;
+	std::vector<size_t> src_strides;
+	std::vector<size_t> dst_strides;
+	size_t src_distance = 0;
+	size_t dst_distance = 0;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( signal, dims, batch_count, src_strides, dst_strides, src_distance, dst_distance, herm_layout, place );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_normal_1D_array_real_to_hermitian_with_odd_batch_size)
+{
+	try { pow5_normal_1D_array_real_to_hermitian_with_odd_batch_size< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_normal_1D_array_real_to_hermitian_with_odd_batch_size)
+{
+	try { pow5_normal_1D_array_real_to_hermitian_with_odd_batch_size< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow5_normal_1D_out_of_place_real_to_hermitian_planar()
+{
+	// Configuration: single length normal5, batch 1, out_of_place placeness,
+	// hermitian_planar layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, normal5 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_planar;
+	placeness::placeness_t mode = placeness::out_of_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_normal_1D_out_of_place_real_to_hermitian_planar)
+{
+	try { pow5_normal_1D_out_of_place_real_to_hermitian_planar<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_normal_1D_out_of_place_real_to_hermitian_planar)
+{
+	try { pow5_normal_1D_out_of_place_real_to_hermitian_planar<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow7_small_1D_in_place_real_to_hermitian_interleaved()
+{
+	// Configuration: single length small7, batch 1, in_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, small7 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::in_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow7_small_1D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow7_small_1D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow7_small_1D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow7_small_1D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow2_large_1D_in_place_real_to_hermitian_interleaved()
+{
+	// Configuration: single length large2, batch 1, in_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, large2 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::in_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow2_large_1D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow2_large_1D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow2_large_1D_4M_in_place_real_to_hermitian_interleaved()
+{
+	// Configuration: single length 4194304, batch 1, in_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, 4194304 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::in_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_4M_in_place_real_to_hermitian_interleaved)
+{
+	try { pow2_large_1D_4M_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_4M_in_place_real_to_hermitian_interleaved)
+{
+	try { pow2_large_1D_4M_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow2_large_1D_4M_out_of_place_real_to_hermitian_planar()
+{
+	// Configuration: single length 4194304, batch 1, out_of_place placeness,
+	// hermitian_planar layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, 4194304 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_planar;
+	placeness::placeness_t mode = placeness::out_of_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_4M_out_of_place_real_to_hermitian_planar)
+{
+	try { pow2_large_1D_4M_out_of_place_real_to_hermitian_planar<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_4M_out_of_place_real_to_hermitian_planar)
+{
+	try { pow2_large_1D_4M_out_of_place_real_to_hermitian_planar<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow3_large_1D_out_of_place_real_to_hermitian_planar()
+{
+	// Configuration: single length large3, batch 1, out_of_place placeness,
+	// hermitian_planar layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, large3 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_planar;
+	placeness::placeness_t mode = placeness::out_of_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_large_1D_out_of_place_real_to_hermitian_planar)
+{
+	try { pow3_large_1D_out_of_place_real_to_hermitian_planar<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_large_1D_out_of_place_real_to_hermitian_planar)
+{
+	try { pow3_large_1D_out_of_place_real_to_hermitian_planar<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow3_large_1D_in_place_real_to_hermitian_interleaved()
+{
+	// Configuration: single length large3, batch 1, in_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, large3 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::in_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_large_1D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow3_large_1D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_large_1D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow3_large_1D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow3_large_1D_out_of_place_real_to_hermitian_interleaved()
+{
+	// Configuration: single length large3, batch 1, out_of_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, large3 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::out_of_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_large_1D_out_of_place_real_to_hermitian_interleaved)
+{
+	try { pow3_large_1D_out_of_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_large_1D_out_of_place_real_to_hermitian_interleaved)
+{
+	try { pow3_large_1D_out_of_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow5_large_1D_out_of_place_real_to_hermitian_planar()
+{
+	// Configuration: single length large5, batch 1, out_of_place placeness,
+	// hermitian_planar layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, large5 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_planar;
+	placeness::placeness_t mode = placeness::out_of_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_large_1D_out_of_place_real_to_hermitian_planar)
+{
+	try { pow5_large_1D_out_of_place_real_to_hermitian_planar<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_large_1D_out_of_place_real_to_hermitian_planar)
+{
+	try { pow5_large_1D_out_of_place_real_to_hermitian_planar<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow7_large_1D_in_place_real_to_hermitian_interleaved()
+{
+	// Configuration: single length large7, batch 1, in_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, large7 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::in_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow7_large_1D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow7_large_1D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow7_large_1D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow7_large_1D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow2_normal_2D_in_place_real_to_hermitian_interleaved()
+{
+	// Configuration: lengths normal2 x normal2, batch 1, in_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 2, normal2 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::in_place;
+
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_2D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow2_normal_2D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_2D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow2_normal_2D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow2_normal_2D_out_of_place_real_to_hermitian_planar()
+{
+	// Configuration: lengths normal2 x normal2, batch 1, out_of_place placeness,
+	// hermitian_planar layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 2, normal2 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_planar;
+	placeness::placeness_t mode = placeness::out_of_place;
+
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_2D_out_of_place_real_to_hermitian_planar)
+{
+	try { pow2_normal_2D_out_of_place_real_to_hermitian_planar<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_2D_out_of_place_real_to_hermitian_planar)
+{
+	try { pow2_normal_2D_out_of_place_real_to_hermitian_planar<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow2_small_2D_array_real_to_hermitian()
+{
+	// Configuration: lengths small2 x small2, batch 8, in_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 2, small2 );
+	size_t batch_count = 8;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::in_place;
+
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_small_2D_array_real_to_hermitian)
+{
+	try { pow2_small_2D_array_real_to_hermitian<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_small_2D_array_real_to_hermitian)
+{
+	try { pow2_small_2D_array_real_to_hermitian<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow2_large_2D_out_of_place_real_to_hermitian_interleaved()
+{
+	// Configuration: lengths MaxLength2D<T>(2) x normal2, batch 1, out_of_place,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, MaxLength2D<T>(2) );
+	dims.push_back( normal2 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::out_of_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_2D_out_of_place_real_to_hermitian_interleaved)
+{
+	try { pow2_large_2D_out_of_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_2D_out_of_place_real_to_hermitian_interleaved)
+{
+	try { pow2_large_2D_out_of_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow2_small_2D_non_unit_stride_and_distance_real_to_hermitian()
+{
+	// Configuration: lengths 4 x 4, batch 2, out_of_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern, explicit strides/distances.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 2, 4 );
+	size_t batch_count = 2;
+
+	// Input strides: 5, then dims[0] * 5 + 1.
+	std::vector<size_t> in_strides( 1, 5 );
+	in_strides.push_back( dims.front() * in_strides.front() + 1 );
+
+	// Output strides: 2, then dims[0] * 2 + 2.
+	std::vector<size_t> out_strides( 1, 2 );
+	out_strides.push_back( dims.front() * out_strides.front() + 2 );
+
+	// Distances: last length times last stride, plus 30 (input) or 42 (output).
+	size_t in_distance = dims.back() * in_strides.back() + 30;
+	size_t out_distance = dims.back() * out_strides.back() + 42;
+
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::out_of_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_small_2D_non_unit_stride_and_distance_real_to_hermitian)
+{
+	try { pow2_small_2D_non_unit_stride_and_distance_real_to_hermitian<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_small_2D_non_unit_stride_and_distance_real_to_hermitian)
+{
+	try { pow2_small_2D_non_unit_stride_and_distance_real_to_hermitian<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow3_normal_2D_in_place_real_to_hermitian_interleaved()
+{
+	// Configuration: lengths normal3 x normal3, batch 1, in_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 2, normal3 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::in_place;
+
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_normal_2D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow3_normal_2D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_normal_2D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow3_normal_2D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow5_large_2D_out_of_place_real_to_hermitian_interleaved()
+{
+	// Configuration: lengths MaxLength2D<T>(5) x normal5, batch 1, out_of_place,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, MaxLength2D<T>(5) );
+	dims.push_back( normal5 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::out_of_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_large_2D_out_of_place_real_to_hermitian_interleaved)
+{
+	try { pow5_large_2D_out_of_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_large_2D_out_of_place_real_to_hermitian_interleaved)
+{
+	try { pow5_large_2D_out_of_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow7_small_2D_non_unit_stride_and_distance_real_to_hermitian()
+{
+	// Configuration: lengths small7 x small7, batch 2, out_of_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern, explicit strides/distances.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 2, small7 );
+	size_t batch_count = 2;
+
+	// Input strides: 5, then dims[0] * 5 + 1.
+	std::vector<size_t> in_strides( 1, 5 );
+	in_strides.push_back( dims.front() * in_strides.front() + 1 );
+
+	// Output strides: 2, then dims[0] * 2 + 2.
+	std::vector<size_t> out_strides( 1, 2 );
+	out_strides.push_back( dims.front() * out_strides.front() + 2 );
+
+	// Distances: last length times last stride, plus 30 (input) or 42 (output).
+	size_t in_distance = dims.back() * in_strides.back() + 30;
+	size_t out_distance = dims.back() * out_strides.back() + 42;
+
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::out_of_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow7_small_2D_non_unit_stride_and_distance_real_to_hermitian)
+{
+	try { pow7_small_2D_non_unit_stride_and_distance_real_to_hermitian<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow7_small_2D_non_unit_stride_and_distance_real_to_hermitian)
+{
+	try { pow7_small_2D_non_unit_stride_and_distance_real_to_hermitian<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow2_normal_3D_in_place_real_to_hermitian_interleaved()
+{
+	// Configuration: lengths small2 x normal2 x small2, batch 1, in_place,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, small2 );
+	dims.push_back( normal2 );
+	dims.push_back( small2 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::in_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_3D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow2_normal_3D_in_place_real_to_hermitian_interleaved<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_3D_in_place_real_to_hermitian_interleaved)
+{
+	try { pow2_normal_3D_in_place_real_to_hermitian_interleaved<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow2_small_3D_non_unit_stride_and_distance_real_to_hermitian()
+{
+	// Configuration: lengths 4 x 4 x 4, batch 2, out_of_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern, explicit strides/distances.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 3, 4 );
+	size_t batch_count = 2;
+
+	// Input strides: 5, then each previous length * previous stride + 1.
+	std::vector<size_t> in_strides( 1, 5 );
+	in_strides.push_back( dims[0] * in_strides[0] + 1 );
+	in_strides.push_back( dims[1] * in_strides[1] + 1 );
+
+	// Output strides: 2, then each previous length * previous stride + 2.
+	std::vector<size_t> out_strides( 1, 2 );
+	out_strides.push_back( dims[0] * out_strides[0] + 2 );
+	out_strides.push_back( dims[1] * out_strides[1] + 2 );
+
+	// Distances: last length times last stride, plus 30 (input) or 42 (output).
+	size_t in_distance = dims.back() * in_strides.back() + 30;
+	size_t out_distance = dims.back() * out_strides.back() + 42;
+
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::out_of_place;
+
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_small_3D_non_unit_stride_and_distance_real_to_hermitian)
+{
+	try { pow2_small_3D_non_unit_stride_and_distance_real_to_hermitian<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_small_3D_non_unit_stride_and_distance_real_to_hermitian)
+{
+	try { pow2_small_3D_non_unit_stride_and_distance_real_to_hermitian<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow3_large_3D_out_of_place_real_to_hermitian_planar()
+{
+	// Configuration: lengths large3 x 3 x 3, batch 1, out_of_place placeness,
+	// hermitian_planar layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, large3 );
+	dims.push_back( 3 );
+	dims.push_back( 3 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_planar;
+	placeness::placeness_t mode = placeness::out_of_place;
+	postcallback_real_to_complex<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_large_3D_out_of_place_real_to_hermitian_planar)
+{
+	try { pow3_large_3D_out_of_place_real_to_hermitian_planar<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_large_3D_out_of_place_real_to_hermitian_planar)
+{
+	try { pow3_large_3D_out_of_place_real_to_hermitian_planar<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+#pragma endregion
+
+#pragma region Complex_To_Real
+
+template <class T, class cl_T, class fftw_T>
+void pow2_normal_1D_in_place_hermitian_interleaved_to_real()
+{
+	// Configuration: single length normal2, batch 1, in_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, normal2 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::in_place;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_in_place_hermitian_interleaved_to_real)
+{
+	try { pow2_normal_1D_in_place_hermitian_interleaved_to_real<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_in_place_hermitian_interleaved_to_real)
+{
+	try { pow2_normal_1D_in_place_hermitian_interleaved_to_real<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow2_normal_1D_out_of_place_hermitian_interleaved_to_real()
+{
+	// Configuration: single length normal2, batch 1, out_of_place placeness,
+	// hermitian_interleaved layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, normal2 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_interleaved;
+	placeness::placeness_t mode = placeness::out_of_place;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_out_of_place_hermitian_interleaved_to_real)
+{
+	try { pow2_normal_1D_out_of_place_hermitian_interleaved_to_real<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_out_of_place_hermitian_interleaved_to_real)
+{
+	try { pow2_normal_1D_out_of_place_hermitian_interleaved_to_real<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template <class T, class cl_T, class fftw_T>
+void pow2_normal_1D_out_of_place_hermitian_planar_to_real()
+{
+	// Configuration: single length normal2, batch 1, out_of_place placeness,
+	// hermitian_planar layout, sawtooth pattern; strides empty, distances 0.
+	data_pattern fill = sawtooth;
+	std::vector<size_t> dims( 1, normal2 );
+	size_t batch_count = 1;
+	std::vector<size_t> in_strides;
+	std::vector<size_t> out_strides;
+	size_t in_distance = 0;
+	size_t out_distance = 0;
+	layout::buffer_layout_t data_layout = layout::hermitian_planar;
+	placeness::placeness_t mode = placeness::out_of_place;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( fill, dims, batch_count, in_strides, out_strides, in_distance, out_distance, data_layout, mode );
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_out_of_place_hermitian_planar_to_real)
+{
+	try { pow2_normal_1D_out_of_place_hermitian_planar_to_real<float, cl_float, fftwf_complex>(); }	// float instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_out_of_place_hermitian_planar_to_real)
+{
+	try { pow2_normal_1D_out_of_place_hermitian_planar_to_real<double, cl_double, fftw_complex>(); }	// double instantiation
+	catch (const std::exception& ex) { handle_exception(ex); }
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_1D_user_defined_scale_hermitian_to_real() // Case setup: 1D length normal2, batch 1, hermitian_planar layout, out_of_place, plus an extra trailing argument 42.0f.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( normal2 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_planar;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness, 42.0f ); // NOTE(review): 42.0f is presumably the user-defined scale named in the test - confirm against the callee's signature
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_1D_user_defined_scale_hermitian_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_normal_1D_user_defined_scale_hermitian_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_1D_user_defined_scale_hermitian_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_normal_1D_user_defined_scale_hermitian_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_small_1D_non_unit_stride_hermitian_to_real() // Case setup: 1D length small2, batch 1, stride 3 on both sides, hermitian_interleaved layout, out_of_place.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( small2 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	input_strides.push_back( 3 ); // the non-unit stride under test, same value for input and output
+	output_strides.push_back( 3 );
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_interleaved;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_small_1D_non_unit_stride_hermitian_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_small_1D_non_unit_stride_hermitian_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_small_1D_non_unit_stride_hermitian_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_small_1D_non_unit_stride_hermitian_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow3_normal_1D_out_of_place_hermitian_interleaved_to_real() // Case setup: 1D length normal3, batch 1, hermitian_interleaved layout, out_of_place; handed to postcallback_complex_to_real.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( normal3 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_interleaved;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_normal_1D_out_of_place_hermitian_interleaved_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow3_normal_1D_out_of_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_normal_1D_out_of_place_hermitian_interleaved_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow3_normal_1D_out_of_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow5_small_1D_out_of_place_hermitian_planar_to_real() // Case setup: 1D length small5, batch 1, hermitian_planar layout, out_of_place; handed to postcallback_complex_to_real.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( small5 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_planar;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_small_1D_out_of_place_hermitian_planar_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow5_small_1D_out_of_place_hermitian_planar_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_small_1D_out_of_place_hermitian_planar_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow5_small_1D_out_of_place_hermitian_planar_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow7_normal_1D_in_place_hermitian_interleaved_to_real() // Case setup: 1D length normal7, batch 1, hermitian_interleaved layout, in_place; handed to postcallback_complex_to_real.
+{
+    std::vector<size_t> lengths;
+    lengths.push_back(normal7);
+    size_t batch = 1;
+    std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+    std::vector<size_t> output_strides;
+    size_t input_distance = 0;
+    size_t output_distance = 0;
+    layout::buffer_layout_t layout = layout::hermitian_interleaved;
+    placeness::placeness_t placeness = placeness::in_place;
+
+    data_pattern pattern = sawtooth;
+    postcallback_complex_to_real<T, cl_T, fftw_T>(pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow7_normal_1D_in_place_hermitian_interleaved_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow7_normal_1D_in_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow7_normal_1D_in_place_hermitian_interleaved_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow7_normal_1D_in_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_in_place_hermitian_interleaved_to_real() // Case setup: 1D length large2, batch 1, hermitian_interleaved layout, in_place; handed to postcallback_complex_to_real.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( large2 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_in_place_hermitian_interleaved_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_large_1D_in_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_in_place_hermitian_interleaved_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_large_1D_in_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_out_of_place_hermitian_interleaved_to_real() // Case setup: 1D length large2, batch 1, hermitian_interleaved layout, out_of_place; handed to postcallback_complex_to_real.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( large2 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_interleaved;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_out_of_place_hermitian_interleaved_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_large_1D_out_of_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_out_of_place_hermitian_interleaved_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_large_1D_out_of_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_out_of_place_hermitian_planar_to_real() // Case setup: 1D length large2, batch 1, hermitian_planar layout, out_of_place; handed to postcallback_complex_to_real.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( large2 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_planar;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_large_1D_out_of_place_hermitian_planar_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_large_1D_out_of_place_hermitian_planar_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_large_1D_out_of_place_hermitian_planar_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_large_1D_out_of_place_hermitian_planar_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow3_large_1D_in_place_hermitian_interleaved_to_real() // Case setup: 1D length large3, batch 1, hermitian_interleaved layout, in_place; handed to postcallback_complex_to_real.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( large3 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_large_1D_in_place_hermitian_interleaved_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow3_large_1D_in_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_large_1D_in_place_hermitian_interleaved_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow3_large_1D_in_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow5_large_1D_out_of_place_hermitian_planar_to_real() // Case setup: 1D length large5, batch 1, hermitian_planar layout, out_of_place; handed to postcallback_complex_to_real.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( large5 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_planar;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_large_1D_out_of_place_hermitian_planar_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow5_large_1D_out_of_place_hermitian_planar_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_large_1D_out_of_place_hermitian_planar_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow5_large_1D_out_of_place_hermitian_planar_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow7_large_1D_out_of_place_hermitian_interleaved_to_real() // Case setup: 1D length large7, batch 1, hermitian_interleaved layout, out_of_place; handed to postcallback_complex_to_real.
+{
+    std::vector<size_t> lengths;
+    lengths.push_back(large7);
+    size_t batch = 1;
+    std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+    std::vector<size_t> output_strides;
+    size_t input_distance = 0;
+    size_t output_distance = 0;
+    layout::buffer_layout_t layout = layout::hermitian_interleaved;
+    placeness::placeness_t placeness = placeness::out_of_place;
+
+    data_pattern pattern = sawtooth;
+    postcallback_complex_to_real<T, cl_T, fftw_T>(pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow7_large_1D_out_of_place_hermitian_interleaved_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow7_large_1D_out_of_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow7_large_1D_out_of_place_hermitian_interleaved_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow7_large_1D_out_of_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_2D_in_place_hermitian_interleaved_to_real() // Case setup: 2D lengths normal2 x normal2, batch 1, hermitian_interleaved layout, in_place; handed to postcallback_complex_to_real.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( normal2 );
+	lengths.push_back( normal2 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_2D_in_place_hermitian_interleaved_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_normal_2D_in_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_2D_in_place_hermitian_interleaved_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_normal_2D_in_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_small_2D_non_unit_stride_and_distance_hermitian_to_real() // Case setup: 2D lengths 4 x 4, batch 2, explicit non-unit strides and distances, hermitian_interleaved layout, out_of_place.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( 4 );
+	lengths.push_back( 4 );
+	size_t batch = 2;
+
+	std::vector<size_t> input_strides;
+	input_strides.push_back( 12 );
+	input_strides.push_back( lengths[0] * input_strides[0] + 9 ); // 4 * 12 + 9 = 57
+
+	std::vector<size_t> output_strides;
+	output_strides.push_back( 7 );
+	output_strides.push_back( lengths[0] * output_strides[0] + 32 ); // 4 * 7 + 32 = 60
+
+	size_t input_distance = lengths[lengths.size()-1] * input_strides[input_strides.size()-1] + 50; // last length * last stride + 50 = 4 * 57 + 50 = 278
+	size_t output_distance = lengths[lengths.size()-1] * output_strides[output_strides.size()-1] + 60; // 4 * 60 + 60 = 300
+
+	layout::buffer_layout_t layout = layout::hermitian_interleaved;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_small_2D_non_unit_stride_and_distance_hermitian_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_small_2D_non_unit_stride_and_distance_hermitian_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_small_2D_non_unit_stride_and_distance_hermitian_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_small_2D_non_unit_stride_and_distance_hermitian_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow3_normal_2D_out_of_place_hermitian_planar_to_real() // Case setup: 2D lengths normal3 x normal3, batch 1, hermitian_planar layout, out_of_place; handed to postcallback_complex_to_real.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( normal3 );
+	lengths.push_back( normal3 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_planar;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_normal_2D_out_of_place_hermitian_planar_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow3_normal_2D_out_of_place_hermitian_planar_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_normal_2D_out_of_place_hermitian_planar_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow3_normal_2D_out_of_place_hermitian_planar_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow5_large_2D_out_of_place_hermitian_interleaved_to_real() // Case setup: 2D lengths MaxLength2D<T>(5) x normal5, batch 1, hermitian_interleaved layout, out_of_place.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( MaxLength2D<T>(5) ); // first length depends on the element type T via MaxLength2D (defined outside this view)
+	lengths.push_back( normal5 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_interleaved;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_large_2D_out_of_place_hermitian_interleaved_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow5_large_2D_out_of_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_large_2D_out_of_place_hermitian_interleaved_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow5_large_2D_out_of_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow7_small_2D_in_place_hermitian_interleaved_to_real() // Case setup: 2D lengths small7 x small7, batch 1, hermitian_interleaved layout, in_place; handed to postcallback_complex_to_real.
+{
+    std::vector<size_t> lengths;
+    lengths.push_back(small7);
+    lengths.push_back(small7);
+    size_t batch = 1;
+    std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+    std::vector<size_t> output_strides;
+    size_t input_distance = 0;
+    size_t output_distance = 0;
+    layout::buffer_layout_t layout = layout::hermitian_interleaved;
+    placeness::placeness_t placeness = placeness::in_place;
+
+    data_pattern pattern = sawtooth;
+    postcallback_complex_to_real<T, cl_T, fftw_T>(pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow7_small_2D_in_place_hermitian_interleaved_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow7_small_2D_in_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow7_small_2D_in_place_hermitian_interleaved_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow7_small_2D_in_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_3D_in_place_hermitian_interleaved_to_real() // Case setup: 3D lengths normal2 x small2 x small2, batch 1, hermitian_interleaved layout, in_place.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( normal2 );
+	lengths.push_back( small2 );
+	lengths.push_back( small2 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_normal_3D_in_place_hermitian_interleaved_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_normal_3D_in_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_normal_3D_in_place_hermitian_interleaved_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_normal_3D_in_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_small_3D_non_unit_stride_and_distance_hermitian_to_real() // Case setup: 3D lengths 4 x 4 x 4, batch 2, explicit non-unit strides and distances, hermitian_interleaved layout, out_of_place.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( 4 );
+	lengths.push_back( 4 );
+	lengths.push_back( 4 );
+	size_t batch = 2;
+
+	std::vector<size_t> input_strides;
+	input_strides.push_back( 5 );
+	input_strides.push_back( lengths[0] * input_strides[0] + 1 ); // 4 * 5 + 1 = 21
+	input_strides.push_back( lengths[1] * input_strides[1] + 1 ); // 4 * 21 + 1 = 85
+
+	std::vector<size_t> output_strides;
+	output_strides.push_back( 2 );
+	output_strides.push_back( lengths[0] * output_strides[0] + 2 ); // 4 * 2 + 2 = 10
+	output_strides.push_back( lengths[1] * output_strides[1] + 2 ); // 4 * 10 + 2 = 42
+
+	size_t input_distance = lengths[lengths.size()-1] * input_strides[input_strides.size()-1] + 30; // last length * last stride + 30 = 4 * 85 + 30 = 370
+	size_t output_distance = lengths[lengths.size()-1] * output_strides[output_strides.size()-1] + 42; // 4 * 42 + 42 = 210
+
+	layout::buffer_layout_t layout = layout::hermitian_interleaved;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow2_small_3D_non_unit_stride_and_distance_hermitian_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_small_3D_non_unit_stride_and_distance_hermitian_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow2_small_3D_non_unit_stride_and_distance_hermitian_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow2_small_3D_non_unit_stride_and_distance_hermitian_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow3_normal_3D_out_of_place_hermitian_planar_to_real() // Case setup: 3D lengths small3 x normal3 x small3, batch 1, hermitian_planar layout, out_of_place.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( small3 );
+	lengths.push_back( normal3 );
+	lengths.push_back( small3 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_planar;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow3_normal_3D_out_of_place_hermitian_planar_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow3_normal_3D_out_of_place_hermitian_planar_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow3_normal_3D_out_of_place_hermitian_planar_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow3_normal_3D_out_of_place_hermitian_planar_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow5_large_3D_out_of_place_hermitian_interleaved_to_real() // Case setup: 3D lengths large5 x 5 x 5, batch 1, hermitian_interleaved layout, out_of_place.
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( large5 );
+	lengths.push_back( 5 );
+	lengths.push_back( 5 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides; // strides are left empty and distances are 0 in this case
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_interleaved;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	postcallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness ); // all checking happens inside this call; nothing is returned here
+}
+
+TEST_F(accuracy_test_postcallback_single, pow5_large_3D_out_of_place_hermitian_interleaved_to_real) // float / cl_float / fftwf_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow5_large_3D_out_of_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_postcallback_double, pow5_large_3D_out_of_place_hermitian_interleaved_to_real) // double / cl_double / fftw_complex instantiation; any std::exception is passed to handle_exception
+{
+	try { pow5_large_3D_out_of_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+#pragma endregion
+} //namespace
diff --git a/src/tests/cl_transform.h b/src/tests/cl_transform.h
index 48ffe07..7bc30c1 100644
--- a/src/tests/cl_transform.h
+++ b/src/tests/cl_transform.h
@@ -628,21 +628,21 @@ public:
 		if (localMemSize > 0)
 		{
 			//Test for LDS in precallback function
-			precallbackstr = STRINGIFY(MULVAL_LDS);
+			precallbackstr = STRINGIFY(PRE_MULVAL_LDS);
 		}
 		else
 		{
 			if (input.is_interleaved() )
 			{
-				precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL) : STRINGIFY(MULVAL_DP);
+				precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(PRE_MULVAL) : STRINGIFY(PRE_MULVAL_DP);
 			}
 			else if (input.is_planar())
 			{
-				precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_PLANAR) : STRINGIFY(MULVAL_PLANAR_DP);
+				precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(PRE_MULVAL_PLANAR) : STRINGIFY(PRE_MULVAL_PLANAR_DP);
 			}
 			else if (input.is_real())
 			{
-				precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_REAL) : STRINGIFY(MULVAL_REAL_DP);
+				precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(PRE_MULVAL_REAL) : STRINGIFY(PRE_MULVAL_REAL_DP);
 			}
 		}
 
@@ -665,14 +665,14 @@ public:
 		OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
 
 		//Register the callback
-		OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval", precallbackstr, localMemSize, PRECALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed");
+		OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval_pre", precallbackstr, localMemSize, PRECALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed");
 	}
 
 		/*****************************************************/
 	void set_input_precallback_userdatatype() {
 		cl_int status = 0;
 
-		char* precallbackstr = STRINGIFY(MULVAL_UDT);
+		char* precallbackstr = STRINGIFY(PRE_MULVAL_UDT);
 
 		size_t totalPts = input.total_number_of_points_including_data_and_intervening();
 
@@ -704,7 +704,58 @@ public:
 		OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
 
 		//Register the callback
-		OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval", precallbackstr, 0, PRECALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed");
+		OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval_pre", precallbackstr, 0, PRECALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed");
+	}
+
+		/*****************************************************/
+	void set_output_postcallback(unsigned int localMemSize = 0) { // Registers the "mulval_post" post-callback and a user-data buffer on *plan_handle; localMemSize > 0 selects the LDS kernel string.
+		cl_int status = 0;
+		clfftPrecision precision;
+		OPENCL_V_THROW( clfftGetPlanPrecision( *plan_handle, &precision ), "clfftGetPlanPrecision failed" ); // precision picks the kernel string below, so a failed query must not be ignored
+
+		const char* postcallbackstr = NULL; // stays NULL if no layout branch below matches, instead of being read uninitialized
+		
+		if (localMemSize > 0)
+		{
+			//Test for LDS in postcallback function
+			postcallbackstr = STRINGIFY(POST_MULVAL_LDS);
+		}
+		else
+		{
+			if (output.is_interleaved() )
+			{
+				postcallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(POST_MULVAL) : STRINGIFY(POST_MULVAL_DP);
+			}
+			else if (output.is_planar())
+			{
+				postcallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(POST_MULVAL_PLANAR) : STRINGIFY(POST_MULVAL_PLANAR_DP);
+			}
+			else if (output.is_real())
+			{
+				postcallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(POST_MULVAL_REAL) : STRINGIFY(POST_MULVAL_REAL_DP);
+			}
+		}
+
+		//post-callback user data
+		buffer<T> userdata( 	static_cast<size_t>(dimension), // real-layout buffer built from the output buffer's lengths, strides, batch size and distance
+					output.lengths(),
+					output.strides(),
+					output.batch_size(),
+					output.distance(),
+					layout::real,
+					_placeness
+					);
+
+		userdata.set_all_to_random_data(lengths[0], 10);
+		
+		// make the new buffer
+		const size_t bufferSizeBytes = userdata.size_in_bytes( );
+
+		cl_mem userdataBuff = clCreateBuffer( context.get( ), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bufferSizeBytes, userdata.real_ptr(), &status); // NOTE(review): userdataBuff is never released in this function - confirm who owns it after registration
+		OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
+
+		//Register the post-callback
+		OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval_post", postcallbackstr, localMemSize, POSTCALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed"); // NOTE(review): a NULL postcallbackstr is presumably rejected here with an error status - confirm against the clFFT API
 	}
 
 	/*****************************************************/
diff --git a/src/tests/fftw_transform.h b/src/tests/fftw_transform.h
index 5eaebbf..2a80f86 100644
--- a/src/tests/fftw_transform.h
+++ b/src/tests/fftw_transform.h
@@ -440,6 +440,23 @@ public:
 		input = other_buffer;
 	}
 
+	void set_output_postcallback() // Builds a real-layout userdata buffer shaped like output, fills it, and applies output *= userdata.
+	{
+		//postcallback user data
+		buffer<T> userdata( 	output.number_of_dimensions(), // takes dimensions, lengths, strides, batch size and distance from output
+					output.lengths(),
+					output.strides(),
+					output.batch_size(),
+					output.distance(),
+					layout::real ,
+					CLFFT_INPLACE
+					);
+		
+		userdata.set_all_to_random_data(_lengths[0], 10); // NOTE(review): presumably must generate the same data as the OpenCL-side post-callback setup - confirm
+		
+		output *= userdata; // NOTE(review): presumably an element-wise multiply; operator*= is defined outside this view
+	}
+
 	void set_input_precallback()
 	{
 		//precallback user data
@@ -474,6 +491,23 @@ public:
 		input.multiply_3pt_average(userdata);
 	}
 
+	void set_output_postcallback_special() // Same userdata construction as set_output_postcallback, but applied through output.multiply_3pt_average(userdata).
+	{
+		//postcallback user data
+		buffer<T> userdata( 	output.number_of_dimensions(), // takes dimensions, lengths, strides, batch size and distance from output
+					output.lengths(),
+					output.strides(),
+					output.batch_size(),
+					output.distance(),
+					layout::real ,
+					CLFFT_INPLACE
+					);
+		
+		userdata.set_all_to_random_data(_lengths[0], 10);
+		
+		output.multiply_3pt_average(userdata); // NOTE(review): presumably the reference for the LDS (3-point average) callback variant - confirm against multiply_3pt_average
+	}
+
 	/*****************************************************/
 	void clear_data_buffer()
 	{
diff --git a/src/tests/test_constants.h b/src/tests/test_constants.h
index 10981d3..4b0d9ca 100644
--- a/src/tests/test_constants.h
+++ b/src/tests/test_constants.h
@@ -23,19 +23,20 @@
 #include <string>
 #include <stdexcept>
 
-#define MULVAL float2 mulval(__global void* in, uint offset, __global void* userdata)\n \
+//Pre-callback function strings
+#define PRE_MULVAL float2 mulval_pre(__global void* in, uint offset, __global void* userdata)\n \
 				{ \n \
 				float scalar = *((__global float*)userdata + offset); \n \
 				float2 ret = *((__global float2*)in + offset) * scalar; \n \
 				return ret; \n \
 				}
 
-#define MULVAL_UDT typedef struct USER_DATA  \
+#define PRE_MULVAL_UDT typedef struct USER_DATA  \
 					   {  \
 						float scalar1;  \
 						float scalar2;  \
 						} USER_DATA; \n \
-					float2 mulval(__global void* in, uint offset, __global void* userdata)\n \
+					float2 mulval_pre(__global void* in, uint offset, __global void* userdata)\n \
 					{ \n \
 					__global USER_DATA *data = ((__global USER_DATA *)userdata + offset); \n \
 					float scalar = data->scalar1 * data->scalar2; \n \
@@ -43,14 +44,14 @@
 					return ret; \n \
 					}
 
-#define MULVAL_DP double2 mulval(__global void* in, uint offset, __global void* userdata)\n \
+#define PRE_MULVAL_DP double2 mulval_pre(__global void* in, uint offset, __global void* userdata)\n \
 				{ \n \
 				double scalar = *((__global double*)userdata + offset); \n \
 				double2 ret = *((__global double2*)in + offset) * scalar; \n \
 				return ret; \n \
 				}
 
-#define MULVAL_PLANAR float2 mulval(__global void* inRe, __global void* inIm, uint offset, __global void* userdata)\n \
+#define PRE_MULVAL_PLANAR float2 mulval_pre(__global void* inRe, __global void* inIm, uint offset, __global void* userdata)\n \
 				{ \n \
 				float scalar = *((__global float*)userdata + offset); \n \
 				float2 ret; \n \
@@ -59,7 +60,7 @@
 				return ret; \n \
 				}
 
-#define MULVAL_PLANAR_DP double2 mulval(__global void* inRe, __global void* inIm, uint offset, __global void* userdata)\n \
+#define PRE_MULVAL_PLANAR_DP double2 mulval_pre(__global void* inRe, __global void* inIm, uint offset, __global void* userdata)\n \
 				{ \n \
 				double scalar = *((__global double*)userdata + offset); \n \
 				double2 ret; \n \
@@ -68,14 +69,14 @@
 				return ret; \n \
 				}
 
-#define MULVAL_REAL float mulval(__global void* in, uint offset, __global void* userdata)\n \
+#define PRE_MULVAL_REAL float mulval_pre(__global void* in, uint offset, __global void* userdata)\n \
 				{ \n \
 				float scalar = *((__global float*)userdata + offset); \n \
 				float ret = *((__global float*)in + offset) * scalar; \n \
 				return ret; \n \
 				}
 
-#define MULVAL_REAL_DP double mulval(__global void* in, uint offset, __global void* userdata)\n \
+#define PRE_MULVAL_REAL_DP double mulval_pre(__global void* in, uint offset, __global void* userdata)\n \
 				{ \n \
 				double scalar = *((__global double*)userdata + offset); \n \
 				double ret = *((__global double*)in + offset) * scalar; \n \
@@ -83,7 +84,7 @@
 				}
 
 //Precallback test for LDS - works when 1 WI works on one input element
-#define MULVAL_LDS float2 mulval(__global void* in, uint offset, __global void* userdata, __local void* localmem)\n \
+#define PRE_MULVAL_LDS float2 mulval_pre(__global void* in, uint offset, __global void* userdata, __local void* localmem)\n \
 				{ \n \
 				uint lid = get_local_id(0); \n \
 				__local float* lds = (__local float*)localmem + lid; \n \
@@ -91,11 +92,72 @@
 				barrier(CLK_LOCAL_MEM_FENCE); \n \
 				float prev = offset <= 0 ? 0 : *(lds - 1); \n \
 				float next = offset >= get_global_size(0) ? 0 : *(lds + 1); \n \
-				float avg = (prev + *lds + next)/3.0;\n \
+				float avg = (prev + *lds + next)/3.0f;\n \
 				float2 ret = *((__global float2*)in + offset) * avg; \n \
 				return ret; \n \
 				}
 
+//Post-callback function strings
+#define POST_MULVAL void mulval_post(__global void *output, uint outoffset, __global void *userdata, float2 fftoutput )\n \
+				{ \n \
+				float scalar = *((__global float*)userdata + outoffset); \n \
+				*((__global float2*)output + outoffset) = fftoutput * scalar; \n \
+				}
+
+#define POST_MULVAL_DP void mulval_post(__global void *output, uint outoffset, __global void *userdata, double2 fftoutput )\n \
+				{ \n \
+				double scalar = *((__global double*)userdata + outoffset); \n \
+				*((__global double2*)output + outoffset) = fftoutput * scalar; \n \
+				}
+
+#define POST_MULVAL_PLANAR void mulval_post(__global void *outputRe, __global void *outputIm, size_t outoffset, __global void *userdata, float fftoutputRe, float fftoutputIm )\n \
+				{ \n \
+				float scalar = *((__global float*)userdata + outoffset); \n \
+				*((__global float*)outputRe + outoffset) = fftoutputRe * scalar; \n \
+				*((__global float*)outputIm + outoffset) = fftoutputIm * scalar; \n \
+				}
+
+#define POST_MULVAL_PLANAR_DP void mulval_post(__global void *outputRe, __global void *outputIm, size_t outoffset, __global void *userdata, double fftoutputRe, double fftoutputIm )\n \
+				{ \n \
+				double scalar = *((__global double*)userdata + outoffset); \n \
+				*((__global double*)outputRe + outoffset) = fftoutputRe * scalar; \n \
+				*((__global double*)outputIm + outoffset) = fftoutputIm * scalar; \n \
+				}
+
+//Post-callback test for LDS - valid only when each work-item (WI) processes exactly one element.
+//Assumes a 1D FFT of length 64.
+#define POST_MULVAL_LDS void mulval_post(__global void *output, uint outoffset, __global void *userdata, float2 fftoutput, __local void* localmem)\n \
+				{ \n \
+				uint lid = get_local_id(0); \n \
+				__local float* lds; \n \
+				if (outoffset < 16) \n \
+				{ \n \
+				lds  = (__local float*)localmem + lid*4; \n \
+				lds[0] = *((__global float*)userdata + lid*4); \n \
+				lds[1] = *((__global float*)userdata + lid*4 + 1); \n \
+				lds[2] = *((__global float*)userdata + lid*4 + 2); \n \
+				lds[3] = *((__global float*)userdata + lid*4 + 3); \n \
+				} \n \
+				barrier(CLK_LOCAL_MEM_FENCE); \n \
+				lds  = (__local float*)localmem + outoffset; \n \
+				float prev = outoffset <= 0 ? 0 : *(lds - 1); \n \
+				float next = outoffset >= (get_global_size(0) - 1) ? 0 : *(lds + 1); \n \
+				float avg = (prev + *lds + next)/3.0f; \n \
+				*((__global float2*)output + outoffset) = fftoutput * avg; \n \
+				}
+
+#define POST_MULVAL_REAL void mulval_post(__global void *output, uint outoffset, __global void *userdata, float fftoutput )\n \
+				{ \n \
+				float scalar = *((__global float*)userdata + outoffset); \n \
+				*((__global float*)output + outoffset) = fftoutput * scalar; \n \
+				}
+
+#define POST_MULVAL_REAL_DP void mulval_post(__global void *output, uint outoffset, __global void *userdata, double fftoutput )\n \
+				{ \n \
+				double scalar = *((__global double*)userdata + outoffset); \n \
+				*((__global double*)output + outoffset) = fftoutput * scalar; \n \
+				}
+
 typedef struct USER_DATA
 				{
 				float scalar1;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list