[clfft] 43/128: Precallback - Large 1D C2R, callback LDS updates and GTests

Thu Oct 22 14:54:36 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit 16b6cf294198e418ddbfc845fa5b4405d4ab3429
Author: Pradeep <pradeep.rao at amd.com>
Date:   Mon Aug 24 20:47:11 2015 +0530

    Precallback - Large 1D C2R, callback LDS updates and GTests
---
 src/library/generator.copy.cpp          | 118 ++++++++++++++++++++------
 src/library/generator.stockham.cpp      |  53 ++++++++++--
 src/library/generator.transpose.gcn.cpp |  14 +++
 src/library/plan.cpp                    |   8 ++
 src/tests/accuracy_test_common.h        | 143 +++++++++++++++++++++++++++++++
 src/tests/accuracy_test_precallback.cpp | 145 ++++++++++++++++++++++++++++++++
 src/tests/buffer.h                      |  79 ++++++++++++++++-
 src/tests/cl_transform.h                |  28 +++---
 src/tests/fftw_transform.h              |  17 ++++
 src/tests/test_constants.h              |  14 +++
 10 files changed, 569 insertions(+), 50 deletions(-)

diff --git a/src/library/generator.copy.cpp b/src/library/generator.copy.cpp
index 45e9728..6f4eda0 100644
--- a/src/library/generator.copy.cpp
+++ b/src/library/generator.copy.cpp
@@ -161,6 +161,21 @@ namespace CopyGenerator
 
 			std::string sfx = FloatSuffix<PR>();
 
+			//If pre-callback is set for the plan
+			if (params.fft_hasPreCallback && h2c)
+			{
+				//If user defined struct defined for callback function add it to opencl source string
+				if (params.fft_preCallback.userdatastruct != NULL)
+				{
+					str += params.fft_preCallback.userdatastruct;
+					str += "\n";
+				}
+
+				//Insert callback function code at the beginning 
+				str += params.fft_preCallback.funcstring;
+				str += "\n\n";
+			}
+
 			// Copy kernel begin
 			str += "__kernel void ";
 
@@ -187,14 +202,24 @@ namespace CopyGenerator
 
 			if(outIlvd)
 			{
-				str += "__global "; str += r2Type; str += " * restrict gbOut)\n";
+				str += "__global "; str += r2Type; str += " * restrict gbOut";
 			}
 			else
 			{
 				str += "__global "; str += rType; str += " * restrict gbOutRe, ";
-				str += "__global "; str += rType; str += " * restrict gbOutIm)\n";
+				str += "__global "; str += rType; str += " * restrict gbOutIm";
 			}
 
+			if (params.fft_hasPreCallback && h2c)
+			{
+				str += ", __global void* userdata";
+				if (params.fft_preCallback.localMemSize > 0)
+				{
+					str += ", __local void* localmem";
+				}
+			}
+
+			str += ")\n";
 
 			str += "{\n";
 
@@ -214,15 +239,18 @@ namespace CopyGenerator
 			str += "uint iOffset;\n\t";
 			str += "uint oOffset;\n\t";
 
-			// input
-			if(inIlvd)
+			if (!(params.fft_hasPreCallback && h2c))
 			{
-				str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
-			}
-			else
-			{
-				str += "__global "; str += rType; str += " *lwbInRe;\n\t";
-				str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+				// input
+				if(inIlvd)
+				{
+					str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
+				}
+				else
+				{
+					str += "__global "; str += rType; str += " *lwbInRe;\n\t";
+					str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+				}
 			}
 
 			// output
@@ -246,9 +274,7 @@ namespace CopyGenerator
 					str += "__global "; str += rType; str += " *lwbOutIm2;\n\n";
 				}
 			}
-
-
-
+			
 			// Setup registers
 			str += "\t"; str += RegBaseType<PR>(2); str += " R;\n\n";
 
@@ -286,15 +312,18 @@ namespace CopyGenerator
 
 			str += "\n\t";
 
-			// inputs
-			if(inIlvd)
+			if (!(params.fft_hasPreCallback && h2c))
 			{
-				str += "lwbIn = gbIn + iOffset"; str += inF; str += ";\n\t";
-			}
-			else
-			{
-				str += "lwbInRe = gbInRe + iOffset"; str += inF; str += ";\n\t";
-				str += "lwbInIm = gbInIm + iOffset"; str += inF; str += ";\n\t";
+				// inputs
+				if(inIlvd)
+				{
+					str += "lwbIn = gbIn + iOffset"; str += inF; str += ";\n\t";
+				}
+				else
+				{
+					str += "lwbInRe = gbInRe + iOffset"; str += inF; str += ";\n\t";
+					str += "lwbInIm = gbInIm + iOffset"; str += inF; str += ";\n\t";
+				}
 			}
 
 			// outputs
@@ -335,7 +364,7 @@ namespace CopyGenerator
 					str += "R.x = lwbInRe[me + t*64];\n\t\t";
 					str += "R.y = lwbInIm[me + t*64];\n\t\t";
 				}
-
+				
 				if(outIlvd)
 				{
 					str += "lwbOut[me + t*64] = R;\n";
@@ -352,7 +381,7 @@ namespace CopyGenerator
 			{
 				str += "if(meg < "; str += SztToStr(Nt); str += ")\n\t{\n\t";
 				if(c2h)
-				{
+				{	
 					if(inIlvd)
 					{
 						str += "R = lwbIn[0];\n\t";
@@ -362,7 +391,7 @@ namespace CopyGenerator
 						str += "R.x = lwbInRe[0];\n\t";
 						str += "R.y = lwbInIm[0];\n\t";
 					}
-
+				
 					if(outIlvd)
 					{
 						str += "lwbOut[0] = R;\n\n";
@@ -375,14 +404,33 @@ namespace CopyGenerator
 				}
 				else
 				{
-					if(inIlvd)
+					if (params.fft_hasPreCallback)
 					{
-						str += "R = lwbIn[0];\n\t";
+						if(inIlvd)
+						{
+							str += "R = "; str += params.fft_preCallback.funcname; str += "( gbIn, (iOffset"; str += inF; str += "), userdata"; 
+						}
+						else
+						{
+							str += "R = "; str += params.fft_preCallback.funcname; str += "( gbInRe, gbInIm, (iOffset"; str += inF; str += "), userdata";
+						}
+						if (params.fft_preCallback.localMemSize > 0)
+						{
+							str += ", localmem";
+						}
+						str += ");\n\t\t";
 					}
 					else
 					{
-						str += "R.x = lwbInRe[0];\n\t";
-						str += "R.y = lwbInIm[0];\n\t";
+						if(inIlvd)
+						{
+							str += "R = lwbIn[0];\n\t";
+						}
+						else
+						{
+							str += "R.x = lwbInRe[0];\n\t";
+							str += "R.y = lwbInIm[0];\n\t";
+						}
 					}
 
 					if(outIlvd)
@@ -443,6 +491,20 @@ clfftStatus FFTGeneratedCopyAction::initParams ()
     this->signature.fft_fwdScale  = this->plan->forwardScale;
     this->signature.fft_backScale = this->plan->backwardScale;
 
+	//Set callback if specified
+	if (this->plan->hasPreCallback)
+	{
+		this->signature.fft_hasPreCallback = true;
+		this->signature.fft_preCallback = this->plan->preCallback;
+
+		//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
+		if (this->plan->preCallback.localMemSize > this->plan->envelope.limit_LocalMemSize)
+		{
+			fprintf(stderr, "Requested local memory size not available\n");
+			return CLFFT_INVALID_ARG_VALUE;
+		}
+	}
+
     return CLFFT_SUCCESS;
 }
 
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 91e9e81..56b6d48 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -40,7 +40,7 @@ FFTGeneratedStockhamAction::FFTGeneratedStockhamAction(clfftPlanHandle plHandle,
         fprintf(stderr, "FFTGeneratedStockhamAction::initParams() failed!\n");
         return;
     }
-
+	
     FFTRepo &fftRepo = FFTRepo::getInstance();
 
     err = this->generateKernel(fftRepo, queue);
@@ -51,7 +51,7 @@ FFTGeneratedStockhamAction::FFTGeneratedStockhamAction(clfftPlanHandle plHandle,
         return;
     }
 
-    err = compileKernels( queue, plHandle, plan);
+	err = compileKernels( queue, plHandle, plan);
 
     if (err != CLFFT_SUCCESS)
     {
@@ -3759,13 +3759,6 @@ clfftStatus FFTGeneratedStockhamAction::initParams ()
     this->signature.fft_inputLayout  = this->plan->inputLayout;
 	this->signature.fft_MaxWorkGroupSize = this->plan->envelope.limit_WorkGroupSize;
 
-	//Set callback if specified
-	if (this->plan->hasPreCallback)
-	{
-		this->signature.fft_hasPreCallback = true;
-		this->signature.fft_preCallback = this->plan->preCallback;
-	}
-
     ARG_CHECK(this->plan->length.size()    > 0);
 	ARG_CHECK(this->plan->inStride.size()  > 0);
     ARG_CHECK(this->plan->outStride.size() > 0);
@@ -3859,6 +3852,12 @@ clfftStatus FFTGeneratedStockhamAction::initParams ()
 	this->signature.fft_R = (nt * this->signature.fft_N[0])/wgs;
 	this->signature.fft_SIMD = wgs;
 
+	//Set callback if specified
+	if (this->plan->hasPreCallback)
+	{
+		this->signature.fft_hasPreCallback = true;
+		this->signature.fft_preCallback = this->plan->preCallback;
+	}
 
     if (this->plan->large1D != 0) {
         ARG_CHECK (this->signature.fft_N[0] != 0)
@@ -3963,6 +3962,42 @@ clfftStatus FFTGeneratedStockhamAction::generateKernel(FFTRepo& fftRepo, const c
 		} break;
 	}
 
+	//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
+	if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+	{
+		bool validLDSSize = false;
+		if (this->plan->blockCompute)
+		{
+			validLDSSize = ((this->signature.blockLDS * this->plan->ElementSize()) +  this->signature.fft_preCallback.localMemSize) < this->plan->envelope.limit_LocalMemSize;
+		}
+		else
+		{
+			size_t length = this->signature.fft_N[0];
+			size_t workGroupSize = this->signature.fft_SIMD;
+			size_t numTrans = (workGroupSize * this->signature.fft_R) / length;
+
+			//TODO - Need to abstract this out. Repeating the same compute as in GenerateKernel. 
+			// Set half lds only for power-of-2 problem sizes & interleaved data
+			bool halfLds = ( (this->signature.fft_inputLayout == CLFFT_COMPLEX_INTERLEAVED) &&
+						(this->signature.fft_outputLayout == CLFFT_COMPLEX_INTERLEAVED) ) ? true : false;
+			halfLds = halfLds ? ((length & (length-1)) ? false : true) : false;
+
+			// Set half lds for real transforms
+			halfLds = ( (this->signature.fft_inputLayout == CLFFT_REAL) &&
+						(this->signature.fft_outputLayout == CLFFT_REAL) ) ? true : halfLds;
+
+			size_t ldsSize = halfLds ? length*numTrans : 2*length*numTrans;
+			size_t elementSize = ((this->signature.fft_precision == CLFFT_DOUBLE) || (this->signature.fft_precision == CLFFT_DOUBLE_FAST)) ? sizeof(double) : sizeof(float);
+
+			validLDSSize = ((ldsSize * elementSize) + this->signature.fft_preCallback.localMemSize) < this->plan->envelope.limit_LocalMemSize;
+		}
+		if(!validLDSSize)
+		{
+			fprintf(stderr, "Requested local memory size not available\n");
+			return CLFFT_INVALID_ARG_VALUE;
+		}
+	}
+
 #ifdef KERNEL_INTERJECT
 	ReadKernelFromFile(programCode);
 #endif
diff --git a/src/library/generator.transpose.gcn.cpp b/src/library/generator.transpose.gcn.cpp
index 41bf2d2..5359ec4 100644
--- a/src/library/generator.transpose.gcn.cpp
+++ b/src/library/generator.transpose.gcn.cpp
@@ -1052,6 +1052,20 @@ clfftStatus FFTGeneratedTransposeGCNAction::generateKernel ( FFTRepo& fftRepo, c
 	blockSize.x = lwSize.x * reShapeFactor;
 	blockSize.y = lwSize.y / reShapeFactor * loopCount;
 
+	//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
+	if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+	{
+		bool validLDSSize = false;
+		size_t length = blockSize.x * blockSize.y;
+		
+		validLDSSize = ((length * this->plan->ElementSize()) + this->signature.fft_preCallback.localMemSize) < this->plan->envelope.limit_LocalMemSize;
+		
+		if(!validLDSSize)
+		{
+			fprintf(stderr, "Requested local memory size not available\n");
+			return CLFFT_INVALID_ARG_VALUE;
+		}
+	}
 
     std::string programCode;
     OPENCL_V( genTransposeKernel( this->signature, programCode, lwSize, reShapeFactor, loopCount, blockSize, outRowPadding ), _T( "GenerateTransposeKernel() failed!" ) );
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index b5911fb..d07e304 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -1258,6 +1258,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						copyPlan->inStride.push_back(fftPlan->inStride[index]);
 					}
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPreCallback)
+					{
+						copyPlan->hasPreCallback = true;
+						copyPlan->preCallback = fftPlan->preCallback;
+						copyPlan->precallUserData = fftPlan->precallUserData;
+					}
+
 					OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
 
 					// column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
diff --git a/src/tests/accuracy_test_common.h b/src/tests/accuracy_test_common.h
index a6256e3..d1f23e3 100644
--- a/src/tests/accuracy_test_common.h
+++ b/src/tests/accuracy_test_common.h
@@ -205,6 +205,84 @@ void precallback_complex_to_complex( data_pattern pattern, direction::direction_
 	EXPECT_EQ( true, test_fft.result() == reference.result() );
 }
 
+template< class T, class cl_T, class fftw_T >
+void precallback_complex_to_complex_lds( data_pattern pattern, direction::direction_t direction,
+	std::vector<size_t> lengths, size_t batch,
+	std::vector<size_t> input_strides, std::vector<size_t> output_strides,
+	size_t input_distance, size_t output_distance,
+	layout::buffer_layout_t in_layout, layout::buffer_layout_t out_layout,
+	placeness::placeness_t placeness, 
+	T scale = 1.0f )
+{
+	clfft<T, cl_T> test_fft( static_cast<clfftDim>(lengths.size()), &lengths[0],
+		input_strides.empty() ? NULL : &input_strides[0],
+		output_strides.empty() ? NULL : &output_strides[0],
+		batch, input_distance, output_distance,
+		cl_layout(in_layout), cl_layout(out_layout),
+		cl_placeness(placeness) );
+
+	fftw<T, fftw_T> reference( lengths.size(), &lengths[0], batch, c2c );
+
+	//initialize input
+	if( pattern == sawtooth )
+	{
+		test_fft.set_input_to_sawtooth( 1.0f );
+		reference.set_data_to_sawtooth( 1.0f );
+	}
+	else if( pattern == value )
+	{
+		test_fft.set_input_to_value( 2.0f, 2.5f );
+		reference.set_all_data_to_value( 2.0f, 2.5f );
+	}
+	else if( pattern == impulse )
+	{
+		test_fft.set_input_to_impulse();
+		reference.set_data_to_impulse();
+	}
+	else if( pattern == erratic )
+	{
+		test_fft.set_input_to_random();
+		reference.set_data_to_random();
+	}
+	else
+	{
+		throw std::runtime_error( "invalid pattern type in complex_to_complex()" );
+	}
+
+	// if we're starting with unequal data, we're destined for failure
+	EXPECT_EQ( true, test_fft.input_buffer() == reference.input_buffer() );
+
+	//set precallback values
+	//Test assumes 64 length data
+	unsigned int localMemSize = 64 * sizeof(T);
+	test_fft.set_input_precallback(localMemSize);
+	reference.set_input_precallback_special();
+
+	if( direction == direction::forward )
+	{
+		test_fft.set_forward_transform();
+		test_fft.forward_scale( scale );
+
+		reference.set_forward_transform();
+		reference.forward_scale( scale );
+	}
+	else if( direction == direction::backward )
+	{
+		test_fft.set_backward_transform();
+		test_fft.backward_scale( scale );
+
+		reference.set_backward_transform();
+		reference.backward_scale( scale );
+	}
+	else
+		throw std::runtime_error( "invalid direction in complex_to_complex()" );
+
+	reference.transform();
+	test_fft.transform();
+
+	EXPECT_EQ( true, test_fft.result() == reference.result() );
+}
+
 /*****************************************************/
 /*****************************************************/
 // dimension is inferred from lengths.size()
@@ -394,6 +472,71 @@ void complex_to_real( data_pattern pattern,
 /*****************************************************/
 // dimension is inferred from lengths.size()
 // tightly packed is inferred from strides.empty()
+// output layout is always real
+template< class T, class cl_T, class fftw_T >
+void precallback_complex_to_real( data_pattern pattern,
+	std::vector<size_t> lengths, size_t batch,
+	std::vector<size_t> input_strides, std::vector<size_t> output_strides,
+	size_t input_distance, size_t output_distance,
+	layout::buffer_layout_t in_layout,
+	placeness::placeness_t placeness,
+	T scale = 1.0f )
+{
+	fftw<T, fftw_T> data_maker( lengths.size(), &lengths[0], batch, r2c );
+
+	if( pattern == sawtooth )
+	{
+		data_maker.set_data_to_sawtooth(1.0f);
+	}
+	else if( pattern == value )
+	{
+		data_maker.set_all_data_to_value(2.0f);
+	}
+	else if( pattern == impulse )
+	{
+		data_maker.set_data_to_impulse();
+	}
+	else if( pattern == erratic )
+	{
+		data_maker.set_data_to_random();
+	}
+	else
+	{
+		throw std::runtime_error( "invalid pattern type in complex_to_real()" );
+	}
+
+	data_maker.transform();
+
+	clfft<T, cl_T> test_fft( static_cast<clfftDim>(lengths.size()), &lengths[0],
+		input_strides.empty() ? NULL : &input_strides[0],
+		output_strides.empty() ? NULL : &output_strides[0],
+		batch, input_distance, output_distance,
+		cl_layout(in_layout), cl_layout(layout::real),
+		cl_placeness(placeness) );
+	test_fft.set_input_to_buffer( data_maker.result() );
+
+	fftw<T, fftw_T> reference( lengths.size(), &lengths[0], batch, c2r );
+	reference.set_input_to_buffer(data_maker.result());
+
+	// if we're starting with unequal data, we're destined for failure
+	EXPECT_EQ( true, test_fft.input_buffer() == reference.input_buffer() );
+
+	test_fft.set_input_precallback();
+	reference.set_input_precallback();
+
+	test_fft.backward_scale( scale );
+	reference.backward_scale( scale );
+
+	test_fft.transform();
+	reference.transform();
+
+	EXPECT_EQ( true, test_fft.result() == reference.result() );
+}
+
+/*****************************************************/
+/*****************************************************/
+// dimension is inferred from lengths.size()
+// tightly packed is inferred from strides.empty()
 
 // no need to support non-unit strides and distances here
 // they are covered in plenty of other places
diff --git a/src/tests/accuracy_test_precallback.cpp b/src/tests/accuracy_test_precallback.cpp
index c9c01e8..f2a9a9c 100644
--- a/src/tests/accuracy_test_precallback.cpp
+++ b/src/tests/accuracy_test_precallback.cpp
@@ -144,6 +144,64 @@ TEST_P( mixed_radix_precallback, double_precision_real_to_hermitian_auto_generat
 	mixed_radix_real_to_hermitian<double, cl_double, fftw_complex>(problem_size);
 }
 
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_in_place_hermitian_interleaved_to_real()
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( large2 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+
+	data_pattern pattern = sawtooth;
+	precallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
+}
+
+TEST_F(accuracy_test_precallback_single, pow2_large_1D_in_place_hermitian_interleaved_to_real)
+{
+	try { pow2_large_1D_in_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_precallback_double, pow2_large_1D_in_place_hermitian_interleaved_to_real)
+{
+	try { pow2_large_1D_in_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_out_of_place_hermitian_planar_to_real()
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( large2 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t layout = layout::hermitian_planar;
+	placeness::placeness_t placeness = placeness::out_of_place;
+
+	data_pattern pattern = sawtooth;
+	precallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
+}
+
+TEST_F(accuracy_test_precallback_single, pow2_large_1D_out_of_place_hermitian_planar_to_real)
+{
+	try { pow2_large_1D_out_of_place_hermitian_planar_to_real< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_precallback_double, pow2_large_1D_out_of_place_hermitian_planar_to_real)
+{
+	try { pow2_large_1D_out_of_place_hermitian_planar_to_real< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
 // *****************************************************
 // *****************************************************
 template< class T, class cl_T, class fftw_T >
@@ -210,6 +268,69 @@ TEST_F(accuracy_test_precallback_double, pow2_normal_1D_forward_in_place_complex
 	catch( const std::exception& err ) { handle_exception(err);	}
 }
 
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( large2 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = impulse;
+	precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_precallback_single, pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_precallback_double, pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( normal2 );
+	lengths.push_back( normal2 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_precallback_single, pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_precallback_double, pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
 // *****************************************************
 // *****************************************************
 template< class T, class cl_T, class fftw_T >
@@ -1366,5 +1487,29 @@ TEST_F(accuracy_test_precallback_double, pow2_large_3D_in_place_real_to_hermitia
 	catch( const std::exception& err ) { handle_exception(err);	}
 }
 
+template< class T, class cl_T, class fftw_T >
+void lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved()
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( 64 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = impulse;
+	precallback_complex_to_complex_lds<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_precallback_single, lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
 
 }
\ No newline at end of file
diff --git a/src/tests/buffer.h b/src/tests/buffer.h
index c9664fc..be8fe62 100644
--- a/src/tests/buffer.h
+++ b/src/tests/buffer.h
@@ -629,16 +629,25 @@ public:
 						{
 							*( base_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
 		
-							if (!other_buffer.is_real())
+							the_index = the_index + 1; // the imaginary component immediately follows the real
+							if (other_buffer.is_real())
 							{
-								the_index = the_index + 1; // the imaginary component immediately follows the real
+								*( base_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
+							}
+							else
+							{	
 								*( base_ptr + the_index ) *= other_buffer.imag(x, y, z, batch);
 							}
 						}
 						else if ( is_planar() )
 						{
 							*( real_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
-							if (!other_buffer.is_real())
+							
+							if (other_buffer.is_real())
+							{
+								*( imag_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
+							}
+							else
 							{
 								*( imag_ptr + the_index ) *= other_buffer.imag(x, y, z, batch);
 							}
@@ -650,6 +659,70 @@ public:
 					}
 	}
 
+	//Calculates a 3 point average of other_buffer and
+	//multiplies with buffer
+	//only real layout is supported for other_buffer currently
+	void multiply_3pt_average( buffer<T> & other_buffer )
+	{
+		if (!other_buffer.is_real())
+		{
+			throw std::runtime_error( "only real layout is supported currently for other_buffer" );
+		}
+
+		size_t the_index, o_the_index;
+		T *base_ptr, *o_base_ptr;
+		T *real_ptr;
+		T *imag_ptr;
+		T o_prev_val, o_next_val;
+		T average;
+
+		if( is_interleaved() )
+		{
+			base_ptr = _the_buffers[interleaved].ptr();
+		}
+		else if ( is_planar() )
+		{
+			real_ptr = _the_buffers[re].ptr();
+			imag_ptr = _the_buffers[im].ptr();
+		}
+		else if ( is_real() )
+		{
+			base_ptr = _the_buffers[re].ptr();
+		}
+		o_base_ptr = other_buffer.real_ptr();
+
+		for( size_t batch = 0; batch < batch_size(); batch++ )
+			for( size_t z = 0; z < length(dimz); z++ )
+				for( size_t y = 0; y < length(dimy); y++ )
+					for( size_t x = 0; x < length(dimx); x++ )
+					{		
+						the_index = index(x, y, z, batch);
+						o_the_index = other_buffer.index(x, y, z, batch);
+						o_prev_val = o_the_index <= 0 ? 0 : *(o_base_ptr + o_the_index - 1);
+						o_next_val = o_the_index >= (other_buffer.total_number_of_points_including_data_and_intervening() - 1) ? 0 : *(o_base_ptr + o_the_index +  1);
+						
+						average = (o_prev_val + *(o_base_ptr + o_the_index) + o_next_val)/ 3.0 ;
+
+						if( is_interleaved() )
+						{
+							*( base_ptr + the_index ) *= average;
+		
+							the_index = the_index + 1; // the imaginary component immediately follows the real
+							*( base_ptr + the_index ) *= average;
+						}
+						else if ( is_planar() )
+						{
+							*( real_ptr + the_index ) *= average;
+							
+							*( imag_ptr + the_index ) *= average;
+						}
+						else if ( is_real() )
+						{
+							*( base_ptr + the_index ) *= average;
+						}
+					}
+	}
+
 	/*****************************************************/
 	// strides and distance are those of the output (that is, the new hermitian buffer)
 	void change_real_to_hermitian( const size_t* strides_in, const size_t distance_in )
diff --git a/src/tests/cl_transform.h b/src/tests/cl_transform.h
index 3a33bd3..cc946eb 100644
--- a/src/tests/cl_transform.h
+++ b/src/tests/cl_transform.h
@@ -618,24 +618,32 @@ public:
 	}
 
 	/*****************************************************/
-	void set_input_precallback() {
+	void set_input_precallback(unsigned int localMemSize = 0) {
 		cl_int status = 0;
 		clfftPrecision precision;
 		clfftGetPlanPrecision( *plan_handle, &precision );
 
 		char* precallbackstr;
 		
-		if (input.is_interleaved() )
+		if (localMemSize > 0)
 		{
-			precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL) : STRINGIFY(MULVAL_DP);
+			//Test for LDS in precallback function
+			precallbackstr = STRINGIFY(MULVAL_LDS);
 		}
-		else if (input.is_planar())
-		{
-			precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_PLANAR) : STRINGIFY(MULVAL_PLANAR_DP);
-		}
-		else if (input.is_real())
+		else
 		{
-			precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_REAL) : STRINGIFY(MULVAL_REAL_DP);
+			if (input.is_interleaved() )
+			{
+				precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL) : STRINGIFY(MULVAL_DP);
+			}
+			else if (input.is_planar())
+			{
+				precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_PLANAR) : STRINGIFY(MULVAL_PLANAR_DP);
+			}
+			else if (input.is_real())
+			{
+				precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_REAL) : STRINGIFY(MULVAL_REAL_DP);
+			}
 		}
 
 		//precallback user data
@@ -657,7 +665,7 @@ public:
 		OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
 
 		//Register the callback
-		OPENCL_V_THROW (clFFTSetPlanCallback(*plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdataBuff), "clFFTSetPlanCallback failed");
+		OPENCL_V_THROW (clFFTSetPlanCallback(*plan_handle, "mulval", precallbackstr, NULL, localMemSize, PRECALLBACK, userdataBuff), "clFFTSetPlanCallback failed");
 	}
 
 		/*****************************************************/
diff --git a/src/tests/fftw_transform.h b/src/tests/fftw_transform.h
index 7970207..cd44d9a 100644
--- a/src/tests/fftw_transform.h
+++ b/src/tests/fftw_transform.h
@@ -457,6 +457,23 @@ public:
 		input *= userdata;
 	}
 
+	void set_input_precallback_special()
+	{
+		//precallback user data
+		buffer<T> userdata( 	input.number_of_dimensions(),
+					input.lengths(),
+					input.strides(),
+					input.batch_size(),
+					input.distance(),
+					layout::real ,
+					cl_placeness(placeness::in_place)
+					);
+		
+		userdata.set_all_to_random_data(_lengths[0], 10);
+		
+		input.multiply_3pt_average(userdata);
+	}
+
 	/*****************************************************/
 	void clear_data_buffer()
 	{
diff --git a/src/tests/test_constants.h b/src/tests/test_constants.h
index 56f14d4..5a27db8 100644
--- a/src/tests/test_constants.h
+++ b/src/tests/test_constants.h
@@ -77,6 +77,20 @@
 				return ret; \n \
 				}
 
+//Precallback test for LDS - works when 1 WI works on one input element
+#define MULVAL_LDS float2 mulval(__global void* in, uint offset, __global void* userdata, __local void* localmem)\n \
+				{ \n \
+				uint lid = get_local_id(0); \n \
+				__local float* lds = (__local float*)localmem + lid; \n \
+				lds[0] = *((__global float*)userdata + offset); \n \
+				barrier(CLK_LOCAL_MEM_FENCE); \n \
+				float prev = offset <= 0 ? 0 : *(lds - 1); \n \
+				float next = offset >= get_global_size(0) ? 0 : *(lds + 1); \n \
+				float avg = (prev + *lds + next)/3.0;\n \
+				float2 ret = *((__global float2*)in + offset) * avg; \n \
+				return ret; \n \
+				}
+
 #define STRUCT_USERDATA typedef struct USER_DATA  \
 					   {  \
 						float scalar1;  \

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git