[clfft] 43/128: Precallback - Large 1D C2R, callback LDS updates and GTests
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:36 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 16b6cf294198e418ddbfc845fa5b4405d4ab3429
Author: Pradeep <pradeep.rao at amd.com>
Date: Mon Aug 24 20:47:11 2015 +0530
Precallback - Large 1D C2R, callback LDS updates and GTests
---
src/library/generator.copy.cpp | 118 ++++++++++++++++++++------
src/library/generator.stockham.cpp | 53 ++++++++++--
src/library/generator.transpose.gcn.cpp | 14 +++
src/library/plan.cpp | 8 ++
src/tests/accuracy_test_common.h | 143 +++++++++++++++++++++++++++++++
src/tests/accuracy_test_precallback.cpp | 145 ++++++++++++++++++++++++++++++++
src/tests/buffer.h | 79 ++++++++++++++++-
src/tests/cl_transform.h | 28 +++---
src/tests/fftw_transform.h | 17 ++++
src/tests/test_constants.h | 14 +++
10 files changed, 569 insertions(+), 50 deletions(-)
diff --git a/src/library/generator.copy.cpp b/src/library/generator.copy.cpp
index 45e9728..6f4eda0 100644
--- a/src/library/generator.copy.cpp
+++ b/src/library/generator.copy.cpp
@@ -161,6 +161,21 @@ namespace CopyGenerator
std::string sfx = FloatSuffix<PR>();
+ //If pre-callback is set for the plan
+ if (params.fft_hasPreCallback && h2c)
+ {
+ //If user defined struct defined for callback function add it to opencl source string
+ if (params.fft_preCallback.userdatastruct != NULL)
+ {
+ str += params.fft_preCallback.userdatastruct;
+ str += "\n";
+ }
+
+ //Insert callback function code at the beginning
+ str += params.fft_preCallback.funcstring;
+ str += "\n\n";
+ }
+
// Copy kernel begin
str += "__kernel void ";
@@ -187,14 +202,24 @@ namespace CopyGenerator
if(outIlvd)
{
- str += "__global "; str += r2Type; str += " * restrict gbOut)\n";
+ str += "__global "; str += r2Type; str += " * restrict gbOut";
}
else
{
str += "__global "; str += rType; str += " * restrict gbOutRe, ";
- str += "__global "; str += rType; str += " * restrict gbOutIm)\n";
+ str += "__global "; str += rType; str += " * restrict gbOutIm";
}
+ if (params.fft_hasPreCallback && h2c)
+ {
+ str += ", __global void* userdata";
+ if (params.fft_preCallback.localMemSize > 0)
+ {
+ str += ", __local void* localmem";
+ }
+ }
+
+ str += ")\n";
str += "{\n";
@@ -214,15 +239,18 @@ namespace CopyGenerator
str += "uint iOffset;\n\t";
str += "uint oOffset;\n\t";
- // input
- if(inIlvd)
+ if (!(params.fft_hasPreCallback && h2c))
{
- str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
- }
- else
- {
- str += "__global "; str += rType; str += " *lwbInRe;\n\t";
- str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+ // input
+ if(inIlvd)
+ {
+ str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
+ }
+ else
+ {
+ str += "__global "; str += rType; str += " *lwbInRe;\n\t";
+ str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+ }
}
// output
@@ -246,9 +274,7 @@ namespace CopyGenerator
str += "__global "; str += rType; str += " *lwbOutIm2;\n\n";
}
}
-
-
-
+
// Setup registers
str += "\t"; str += RegBaseType<PR>(2); str += " R;\n\n";
@@ -286,15 +312,18 @@ namespace CopyGenerator
str += "\n\t";
- // inputs
- if(inIlvd)
+ if (!(params.fft_hasPreCallback && h2c))
{
- str += "lwbIn = gbIn + iOffset"; str += inF; str += ";\n\t";
- }
- else
- {
- str += "lwbInRe = gbInRe + iOffset"; str += inF; str += ";\n\t";
- str += "lwbInIm = gbInIm + iOffset"; str += inF; str += ";\n\t";
+ // inputs
+ if(inIlvd)
+ {
+ str += "lwbIn = gbIn + iOffset"; str += inF; str += ";\n\t";
+ }
+ else
+ {
+ str += "lwbInRe = gbInRe + iOffset"; str += inF; str += ";\n\t";
+ str += "lwbInIm = gbInIm + iOffset"; str += inF; str += ";\n\t";
+ }
}
// outputs
@@ -335,7 +364,7 @@ namespace CopyGenerator
str += "R.x = lwbInRe[me + t*64];\n\t\t";
str += "R.y = lwbInIm[me + t*64];\n\t\t";
}
-
+
if(outIlvd)
{
str += "lwbOut[me + t*64] = R;\n";
@@ -352,7 +381,7 @@ namespace CopyGenerator
{
str += "if(meg < "; str += SztToStr(Nt); str += ")\n\t{\n\t";
if(c2h)
- {
+ {
if(inIlvd)
{
str += "R = lwbIn[0];\n\t";
@@ -362,7 +391,7 @@ namespace CopyGenerator
str += "R.x = lwbInRe[0];\n\t";
str += "R.y = lwbInIm[0];\n\t";
}
-
+
if(outIlvd)
{
str += "lwbOut[0] = R;\n\n";
@@ -375,14 +404,33 @@ namespace CopyGenerator
}
else
{
- if(inIlvd)
+ if (params.fft_hasPreCallback)
{
- str += "R = lwbIn[0];\n\t";
+ if(inIlvd)
+ {
+ str += "R = "; str += params.fft_preCallback.funcname; str += "( gbIn, (iOffset"; str += inF; str += "), userdata";
+ }
+ else
+ {
+ str += "R = "; str += params.fft_preCallback.funcname; str += "( gbInRe, gbInIm, (iOffset"; str += inF; str += "), userdata";
+ }
+ if (params.fft_preCallback.localMemSize > 0)
+ {
+ str += ", localmem";
+ }
+ str += ");\n\t\t";
}
else
{
- str += "R.x = lwbInRe[0];\n\t";
- str += "R.y = lwbInIm[0];\n\t";
+ if(inIlvd)
+ {
+ str += "R = lwbIn[0];\n\t";
+ }
+ else
+ {
+ str += "R.x = lwbInRe[0];\n\t";
+ str += "R.y = lwbInIm[0];\n\t";
+ }
}
if(outIlvd)
@@ -443,6 +491,20 @@ clfftStatus FFTGeneratedCopyAction::initParams ()
this->signature.fft_fwdScale = this->plan->forwardScale;
this->signature.fft_backScale = this->plan->backwardScale;
+ //Set callback if specified
+ if (this->plan->hasPreCallback)
+ {
+ this->signature.fft_hasPreCallback = true;
+ this->signature.fft_preCallback = this->plan->preCallback;
+
+ //Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
+ if (this->plan->preCallback.localMemSize > this->plan->envelope.limit_LocalMemSize)
+ {
+ fprintf(stderr, "Requested local memory size not available\n");
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ }
+
return CLFFT_SUCCESS;
}
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 91e9e81..56b6d48 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -40,7 +40,7 @@ FFTGeneratedStockhamAction::FFTGeneratedStockhamAction(clfftPlanHandle plHandle,
fprintf(stderr, "FFTGeneratedStockhamAction::initParams() failed!\n");
return;
}
-
+
FFTRepo &fftRepo = FFTRepo::getInstance();
err = this->generateKernel(fftRepo, queue);
@@ -51,7 +51,7 @@ FFTGeneratedStockhamAction::FFTGeneratedStockhamAction(clfftPlanHandle plHandle,
return;
}
- err = compileKernels( queue, plHandle, plan);
+ err = compileKernels( queue, plHandle, plan);
if (err != CLFFT_SUCCESS)
{
@@ -3759,13 +3759,6 @@ clfftStatus FFTGeneratedStockhamAction::initParams ()
this->signature.fft_inputLayout = this->plan->inputLayout;
this->signature.fft_MaxWorkGroupSize = this->plan->envelope.limit_WorkGroupSize;
- //Set callback if specified
- if (this->plan->hasPreCallback)
- {
- this->signature.fft_hasPreCallback = true;
- this->signature.fft_preCallback = this->plan->preCallback;
- }
-
ARG_CHECK(this->plan->length.size() > 0);
ARG_CHECK(this->plan->inStride.size() > 0);
ARG_CHECK(this->plan->outStride.size() > 0);
@@ -3859,6 +3852,12 @@ clfftStatus FFTGeneratedStockhamAction::initParams ()
this->signature.fft_R = (nt * this->signature.fft_N[0])/wgs;
this->signature.fft_SIMD = wgs;
+ //Set callback if specified
+ if (this->plan->hasPreCallback)
+ {
+ this->signature.fft_hasPreCallback = true;
+ this->signature.fft_preCallback = this->plan->preCallback;
+ }
if (this->plan->large1D != 0) {
ARG_CHECK (this->signature.fft_N[0] != 0)
@@ -3963,6 +3962,42 @@ clfftStatus FFTGeneratedStockhamAction::generateKernel(FFTRepo& fftRepo, const c
} break;
}
+ //Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
+ if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+ {
+ bool validLDSSize = false;
+ if (this->plan->blockCompute)
+ {
+ validLDSSize = ((this->signature.blockLDS * this->plan->ElementSize()) + this->signature.fft_preCallback.localMemSize) < this->plan->envelope.limit_LocalMemSize;
+ }
+ else
+ {
+ size_t length = this->signature.fft_N[0];
+ size_t workGroupSize = this->signature.fft_SIMD;
+ size_t numTrans = (workGroupSize * this->signature.fft_R) / length;
+
+ //TODO - Need to abstract this out. Repeating the same compute as in GenerateKernel.
+ // Set half lds only for power-of-2 problem sizes & interleaved data
+ bool halfLds = ( (this->signature.fft_inputLayout == CLFFT_COMPLEX_INTERLEAVED) &&
+ (this->signature.fft_outputLayout == CLFFT_COMPLEX_INTERLEAVED) ) ? true : false;
+ halfLds = halfLds ? ((length & (length-1)) ? false : true) : false;
+
+ // Set half lds for real transforms
+ halfLds = ( (this->signature.fft_inputLayout == CLFFT_REAL) &&
+ (this->signature.fft_outputLayout == CLFFT_REAL) ) ? true : halfLds;
+
+ size_t ldsSize = halfLds ? length*numTrans : 2*length*numTrans;
+ size_t elementSize = ((this->signature.fft_precision == CLFFT_DOUBLE) || (this->signature.fft_precision == CLFFT_DOUBLE_FAST)) ? sizeof(double) : sizeof(float);
+
+ validLDSSize = ((ldsSize * elementSize) + this->signature.fft_preCallback.localMemSize) < this->plan->envelope.limit_LocalMemSize;
+ }
+ if(!validLDSSize)
+ {
+ fprintf(stderr, "Requested local memory size not available\n");
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ }
+
#ifdef KERNEL_INTERJECT
ReadKernelFromFile(programCode);
#endif
diff --git a/src/library/generator.transpose.gcn.cpp b/src/library/generator.transpose.gcn.cpp
index 41bf2d2..5359ec4 100644
--- a/src/library/generator.transpose.gcn.cpp
+++ b/src/library/generator.transpose.gcn.cpp
@@ -1052,6 +1052,20 @@ clfftStatus FFTGeneratedTransposeGCNAction::generateKernel ( FFTRepo& fftRepo, c
blockSize.x = lwSize.x * reShapeFactor;
blockSize.y = lwSize.y / reShapeFactor * loopCount;
+ //Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
+ if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+ {
+ bool validLDSSize = false;
+ size_t length = blockSize.x * blockSize.y;
+
+ validLDSSize = ((length * this->plan->ElementSize()) + this->signature.fft_preCallback.localMemSize) < this->plan->envelope.limit_LocalMemSize;
+
+ if(!validLDSSize)
+ {
+ fprintf(stderr, "Requested local memory size not available\n");
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ }
std::string programCode;
OPENCL_V( genTransposeKernel( this->signature, programCode, lwSize, reShapeFactor, loopCount, blockSize, outRowPadding ), _T( "GenerateTransposeKernel() failed!" ) );
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index b5911fb..d07e304 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -1258,6 +1258,14 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
copyPlan->inStride.push_back(fftPlan->inStride[index]);
}
+ //Set callback data if set on top level plan
+ if (fftPlan->hasPreCallback)
+ {
+ copyPlan->hasPreCallback = true;
+ copyPlan->preCallback = fftPlan->preCallback;
+ copyPlan->precallUserData = fftPlan->precallUserData;
+ }
+
OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
// column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
diff --git a/src/tests/accuracy_test_common.h b/src/tests/accuracy_test_common.h
index a6256e3..d1f23e3 100644
--- a/src/tests/accuracy_test_common.h
+++ b/src/tests/accuracy_test_common.h
@@ -205,6 +205,84 @@ void precallback_complex_to_complex( data_pattern pattern, direction::direction_
EXPECT_EQ( true, test_fft.result() == reference.result() );
}
+template< class T, class cl_T, class fftw_T >
+void precallback_complex_to_complex_lds( data_pattern pattern, direction::direction_t direction,
+ std::vector<size_t> lengths, size_t batch,
+ std::vector<size_t> input_strides, std::vector<size_t> output_strides,
+ size_t input_distance, size_t output_distance,
+ layout::buffer_layout_t in_layout, layout::buffer_layout_t out_layout,
+ placeness::placeness_t placeness,
+ T scale = 1.0f )
+{
+ clfft<T, cl_T> test_fft( static_cast<clfftDim>(lengths.size()), &lengths[0],
+ input_strides.empty() ? NULL : &input_strides[0],
+ output_strides.empty() ? NULL : &output_strides[0],
+ batch, input_distance, output_distance,
+ cl_layout(in_layout), cl_layout(out_layout),
+ cl_placeness(placeness) );
+
+ fftw<T, fftw_T> reference( lengths.size(), &lengths[0], batch, c2c );
+
+ //initialize input
+ if( pattern == sawtooth )
+ {
+ test_fft.set_input_to_sawtooth( 1.0f );
+ reference.set_data_to_sawtooth( 1.0f );
+ }
+ else if( pattern == value )
+ {
+ test_fft.set_input_to_value( 2.0f, 2.5f );
+ reference.set_all_data_to_value( 2.0f, 2.5f );
+ }
+ else if( pattern == impulse )
+ {
+ test_fft.set_input_to_impulse();
+ reference.set_data_to_impulse();
+ }
+ else if( pattern == erratic )
+ {
+ test_fft.set_input_to_random();
+ reference.set_data_to_random();
+ }
+ else
+ {
+ throw std::runtime_error( "invalid pattern type in complex_to_complex()" );
+ }
+
+ // if we're starting with unequal data, we're destined for failure
+ EXPECT_EQ( true, test_fft.input_buffer() == reference.input_buffer() );
+
+ //set precallback values
+ //Test assumes 64 length data
+ unsigned int localMemSize = 64 * sizeof(T);
+ test_fft.set_input_precallback(localMemSize);
+ reference.set_input_precallback_special();
+
+ if( direction == direction::forward )
+ {
+ test_fft.set_forward_transform();
+ test_fft.forward_scale( scale );
+
+ reference.set_forward_transform();
+ reference.forward_scale( scale );
+ }
+ else if( direction == direction::backward )
+ {
+ test_fft.set_backward_transform();
+ test_fft.backward_scale( scale );
+
+ reference.set_backward_transform();
+ reference.backward_scale( scale );
+ }
+ else
+ throw std::runtime_error( "invalid direction in complex_to_complex()" );
+
+ reference.transform();
+ test_fft.transform();
+
+ EXPECT_EQ( true, test_fft.result() == reference.result() );
+}
+
/*****************************************************/
/*****************************************************/
// dimension is inferred from lengths.size()
@@ -394,6 +472,71 @@ void complex_to_real( data_pattern pattern,
/*****************************************************/
// dimension is inferred from lengths.size()
// tightly packed is inferred from strides.empty()
+// output layout is always real
+template< class T, class cl_T, class fftw_T >
+void precallback_complex_to_real( data_pattern pattern,
+ std::vector<size_t> lengths, size_t batch,
+ std::vector<size_t> input_strides, std::vector<size_t> output_strides,
+ size_t input_distance, size_t output_distance,
+ layout::buffer_layout_t in_layout,
+ placeness::placeness_t placeness,
+ T scale = 1.0f )
+{
+ fftw<T, fftw_T> data_maker( lengths.size(), &lengths[0], batch, r2c );
+
+ if( pattern == sawtooth )
+ {
+ data_maker.set_data_to_sawtooth(1.0f);
+ }
+ else if( pattern == value )
+ {
+ data_maker.set_all_data_to_value(2.0f);
+ }
+ else if( pattern == impulse )
+ {
+ data_maker.set_data_to_impulse();
+ }
+ else if( pattern == erratic )
+ {
+ data_maker.set_data_to_random();
+ }
+ else
+ {
+ throw std::runtime_error( "invalid pattern type in complex_to_real()" );
+ }
+
+ data_maker.transform();
+
+ clfft<T, cl_T> test_fft( static_cast<clfftDim>(lengths.size()), &lengths[0],
+ input_strides.empty() ? NULL : &input_strides[0],
+ output_strides.empty() ? NULL : &output_strides[0],
+ batch, input_distance, output_distance,
+ cl_layout(in_layout), cl_layout(layout::real),
+ cl_placeness(placeness) );
+ test_fft.set_input_to_buffer( data_maker.result() );
+
+ fftw<T, fftw_T> reference( lengths.size(), &lengths[0], batch, c2r );
+ reference.set_input_to_buffer(data_maker.result());
+
+ // if we're starting with unequal data, we're destined for failure
+ EXPECT_EQ( true, test_fft.input_buffer() == reference.input_buffer() );
+
+ test_fft.set_input_precallback();
+ reference.set_input_precallback();
+
+ test_fft.backward_scale( scale );
+ reference.backward_scale( scale );
+
+ test_fft.transform();
+ reference.transform();
+
+ EXPECT_EQ( true, test_fft.result() == reference.result() );
+}
+
+/*****************************************************/
+/*****************************************************/
+// dimension is inferred from lengths.size()
+// tightly packed is inferred from strides.empty()
// no need to support non-unit strides and distances here
// they are covered in plenty of other places
diff --git a/src/tests/accuracy_test_precallback.cpp b/src/tests/accuracy_test_precallback.cpp
index c9c01e8..f2a9a9c 100644
--- a/src/tests/accuracy_test_precallback.cpp
+++ b/src/tests/accuracy_test_precallback.cpp
@@ -144,6 +144,64 @@ TEST_P( mixed_radix_precallback, double_precision_real_to_hermitian_auto_generat
mixed_radix_real_to_hermitian<double, cl_double, fftw_complex>(problem_size);
}
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_in_place_hermitian_interleaved_to_real()
+{
+ std::vector<size_t> lengths;
+ lengths.push_back( large2 );
+ size_t batch = 1;
+ std::vector<size_t> input_strides;
+ std::vector<size_t> output_strides;
+ size_t input_distance = 0;
+ size_t output_distance = 0;
+ layout::buffer_layout_t layout = layout::hermitian_interleaved;
+ placeness::placeness_t placeness = placeness::in_place;
+
+ data_pattern pattern = sawtooth;
+ precallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
+}
+
+TEST_F(accuracy_test_precallback_single, pow2_large_1D_in_place_hermitian_interleaved_to_real)
+{
+ try { pow2_large_1D_in_place_hermitian_interleaved_to_real< float, cl_float, fftwf_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_precallback_double, pow2_large_1D_in_place_hermitian_interleaved_to_real)
+{
+ try { pow2_large_1D_in_place_hermitian_interleaved_to_real< double, cl_double, fftw_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_out_of_place_hermitian_planar_to_real()
+{
+ std::vector<size_t> lengths;
+ lengths.push_back( large2 );
+ size_t batch = 1;
+ std::vector<size_t> input_strides;
+ std::vector<size_t> output_strides;
+ size_t input_distance = 0;
+ size_t output_distance = 0;
+ layout::buffer_layout_t layout = layout::hermitian_planar;
+ placeness::placeness_t placeness = placeness::out_of_place;
+
+ data_pattern pattern = sawtooth;
+ precallback_complex_to_real<T, cl_T, fftw_T>( pattern, lengths, batch, input_strides, output_strides, input_distance, output_distance, layout, placeness );
+}
+
+TEST_F(accuracy_test_precallback_single, pow2_large_1D_out_of_place_hermitian_planar_to_real)
+{
+ try { pow2_large_1D_out_of_place_hermitian_planar_to_real< float, cl_float, fftwf_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_precallback_double, pow2_large_1D_out_of_place_hermitian_planar_to_real)
+{
+ try { pow2_large_1D_out_of_place_hermitian_planar_to_real< double, cl_double, fftw_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
+
// *****************************************************
// *****************************************************
template< class T, class cl_T, class fftw_T >
@@ -210,6 +268,69 @@ TEST_F(accuracy_test_precallback_double, pow2_normal_1D_forward_in_place_complex
catch( const std::exception& err ) { handle_exception(err); }
}
+template< class T, class cl_T, class fftw_T >
+void pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{
+ std::vector<size_t> lengths;
+ lengths.push_back( large2 );
+ size_t batch = 1;
+ std::vector<size_t> input_strides;
+ std::vector<size_t> output_strides;
+ size_t input_distance = 0;
+ size_t output_distance = 0;
+ layout::buffer_layout_t in_layout = layout::complex_interleaved;
+ layout::buffer_layout_t out_layout = layout::complex_interleaved;
+ placeness::placeness_t placeness = placeness::in_place;
+ direction::direction_t direction = direction::forward;
+
+ data_pattern pattern = impulse;
+ precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_precallback_single, pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+ try { pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_precallback_double, pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+ try { pow2_large_1D_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
+
+template< class T, class cl_T, class fftw_T >
+void pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved()
+{
+ std::vector<size_t> lengths;
+ lengths.push_back( normal2 );
+ lengths.push_back( normal2 );
+ size_t batch = 1;
+ std::vector<size_t> input_strides;
+ std::vector<size_t> output_strides;
+ size_t input_distance = 0;
+ size_t output_distance = 0;
+ layout::buffer_layout_t in_layout = layout::complex_interleaved;
+ layout::buffer_layout_t out_layout = layout::complex_interleaved;
+ placeness::placeness_t placeness = placeness::in_place;
+ direction::direction_t direction = direction::forward;
+
+ data_pattern pattern = sawtooth;
+ precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_precallback_single, pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+ try { pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_precallback_double, pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+ try { pow2_normal_2D_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
+
// *****************************************************
// *****************************************************
template< class T, class cl_T, class fftw_T >
@@ -1366,5 +1487,29 @@ TEST_F(accuracy_test_precallback_double, pow2_large_3D_in_place_real_to_hermitia
catch( const std::exception& err ) { handle_exception(err); }
}
+template< class T, class cl_T, class fftw_T >
+void lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved()
+{
+ std::vector<size_t> lengths;
+ lengths.push_back( 64 );
+ size_t batch = 1;
+ std::vector<size_t> input_strides;
+ std::vector<size_t> output_strides;
+ size_t input_distance = 0;
+ size_t output_distance = 0;
+ layout::buffer_layout_t in_layout = layout::complex_interleaved;
+ layout::buffer_layout_t out_layout = layout::complex_interleaved;
+ placeness::placeness_t placeness = placeness::in_place;
+ direction::direction_t direction = direction::forward;
+
+ data_pattern pattern = impulse;
+ precallback_complex_to_complex_lds<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_precallback_single, lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved)
+{
+ try { lds_1D_forward_64_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
}
\ No newline at end of file
diff --git a/src/tests/buffer.h b/src/tests/buffer.h
index c9664fc..be8fe62 100644
--- a/src/tests/buffer.h
+++ b/src/tests/buffer.h
@@ -629,16 +629,25 @@ public:
{
*( base_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
- if (!other_buffer.is_real())
+ the_index = the_index + 1; // the imaginary component immediately follows the real
+ if (other_buffer.is_real())
{
- the_index = the_index + 1; // the imaginary component immediately follows the real
+ *( base_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
+ }
+ else
+ {
*( base_ptr + the_index ) *= other_buffer.imag(x, y, z, batch);
}
}
else if ( is_planar() )
{
*( real_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
- if (!other_buffer.is_real())
+
+ if (other_buffer.is_real())
+ {
+ *( imag_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
+ }
+ else
{
*( imag_ptr + the_index ) *= other_buffer.imag(x, y, z, batch);
}
@@ -650,6 +659,70 @@ public:
}
}
+ //Calculates a 3 point average of other_buffer and
+ //multiplies with buffer
+ //only real layout is supported for other_buffer currently
+ void multiply_3pt_average( buffer<T> & other_buffer )
+ {
+ if (!other_buffer.is_real())
+ {
+ throw std::runtime_error( "only real layout is supported currently for other_buffer" );
+ }
+
+ size_t the_index, o_the_index;
+ T *base_ptr, *o_base_ptr;
+ T *real_ptr;
+ T *imag_ptr;
+ T o_prev_val, o_next_val;
+ T average;
+
+ if( is_interleaved() )
+ {
+ base_ptr = _the_buffers[interleaved].ptr();
+ }
+ else if ( is_planar() )
+ {
+ real_ptr = _the_buffers[re].ptr();
+ imag_ptr = _the_buffers[im].ptr();
+ }
+ else if ( is_real() )
+ {
+ base_ptr = _the_buffers[re].ptr();
+ }
+ o_base_ptr = other_buffer.real_ptr();
+
+ for( size_t batch = 0; batch < batch_size(); batch++ )
+ for( size_t z = 0; z < length(dimz); z++ )
+ for( size_t y = 0; y < length(dimy); y++ )
+ for( size_t x = 0; x < length(dimx); x++ )
+ {
+ the_index = index(x, y, z, batch);
+ o_the_index = other_buffer.index(x, y, z, batch);
+ o_prev_val = o_the_index <= 0 ? 0 : *(o_base_ptr + o_the_index - 1);
+ o_next_val = o_the_index >= (other_buffer.total_number_of_points_including_data_and_intervening() - 1) ? 0 : *(o_base_ptr + o_the_index + 1);
+
+ average = (o_prev_val + *(o_base_ptr + o_the_index) + o_next_val)/ 3.0 ;
+
+ if( is_interleaved() )
+ {
+ *( base_ptr + the_index ) *= average;
+
+ the_index = the_index + 1; // the imaginary component immediately follows the real
+ *( base_ptr + the_index ) *= average;
+ }
+ else if ( is_planar() )
+ {
+ *( real_ptr + the_index ) *= average;
+
+ *( imag_ptr + the_index ) *= average;
+ }
+ else if ( is_real() )
+ {
+ *( base_ptr + the_index ) *= average;
+ }
+ }
+ }
+
/*****************************************************/
// strides and distance are those of the output (that is, the new hermitian buffer)
void change_real_to_hermitian( const size_t* strides_in, const size_t distance_in )
diff --git a/src/tests/cl_transform.h b/src/tests/cl_transform.h
index 3a33bd3..cc946eb 100644
--- a/src/tests/cl_transform.h
+++ b/src/tests/cl_transform.h
@@ -618,24 +618,32 @@ public:
}
/*****************************************************/
- void set_input_precallback() {
+ void set_input_precallback(unsigned int localMemSize = 0) {
cl_int status = 0;
clfftPrecision precision;
clfftGetPlanPrecision( *plan_handle, &precision );
char* precallbackstr;
- if (input.is_interleaved() )
+ if (localMemSize > 0)
{
- precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL) : STRINGIFY(MULVAL_DP);
+ //Test for LDS in precallback function
+ precallbackstr = STRINGIFY(MULVAL_LDS);
}
- else if (input.is_planar())
- {
- precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_PLANAR) : STRINGIFY(MULVAL_PLANAR_DP);
- }
- else if (input.is_real())
+ else
{
- precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_REAL) : STRINGIFY(MULVAL_REAL_DP);
+ if (input.is_interleaved() )
+ {
+ precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL) : STRINGIFY(MULVAL_DP);
+ }
+ else if (input.is_planar())
+ {
+ precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_PLANAR) : STRINGIFY(MULVAL_PLANAR_DP);
+ }
+ else if (input.is_real())
+ {
+ precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_REAL) : STRINGIFY(MULVAL_REAL_DP);
+ }
}
//precallback user data
@@ -657,7 +665,7 @@ public:
OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
//Register the callback
- OPENCL_V_THROW (clFFTSetPlanCallback(*plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdataBuff), "clFFTSetPlanCallback failed");
+ OPENCL_V_THROW (clFFTSetPlanCallback(*plan_handle, "mulval", precallbackstr, NULL, localMemSize, PRECALLBACK, userdataBuff), "clFFTSetPlanCallback failed");
}
/*****************************************************/
diff --git a/src/tests/fftw_transform.h b/src/tests/fftw_transform.h
index 7970207..cd44d9a 100644
--- a/src/tests/fftw_transform.h
+++ b/src/tests/fftw_transform.h
@@ -457,6 +457,23 @@ public:
input *= userdata;
}
+ void set_input_precallback_special()
+ {
+ //precallback user data
+ buffer<T> userdata( input.number_of_dimensions(),
+ input.lengths(),
+ input.strides(),
+ input.batch_size(),
+ input.distance(),
+ layout::real ,
+ cl_placeness(placeness::in_place)
+ );
+
+ userdata.set_all_to_random_data(_lengths[0], 10);
+
+ input.multiply_3pt_average(userdata);
+ }
+
/*****************************************************/
void clear_data_buffer()
{
diff --git a/src/tests/test_constants.h b/src/tests/test_constants.h
index 56f14d4..5a27db8 100644
--- a/src/tests/test_constants.h
+++ b/src/tests/test_constants.h
@@ -77,6 +77,20 @@
return ret; \n \
}
+//Precallback test for LDS - works when 1 WI works on one input element
+#define MULVAL_LDS float2 mulval(__global void* in, uint offset, __global void* userdata, __local void* localmem)\n \
+ { \n \
+ uint lid = get_local_id(0); \n \
+ __local float* lds = (__local float*)localmem + lid; \n \
+ lds[0] = *((__global float*)userdata + offset); \n \
+ barrier(CLK_LOCAL_MEM_FENCE); \n \
+ float prev = offset <= 0 ? 0 : *(lds - 1); \n \
+ float next = offset >= get_global_size(0) ? 0 : *(lds + 1); \n \
+ float avg = (prev + *lds + next)/3.0;\n \
+ float2 ret = *((__global float2*)in + offset) * avg; \n \
+ return ret; \n \
+ }
+
#define STRUCT_USERDATA typedef struct USER_DATA \
{ \
float scalar1; \
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list