[clfft] 80/128: Precallback - support precallback in the new generator TransposeSquare - Mul16 cases
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:41 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 425ad9893934262513f91eb2c11d8374229ce3a3
Author: Pradeep <pradeep.rao at amd.com>
Date: Mon Sep 21 15:26:07 2015 +0530
Precallback - support precallback in the new generator TransposeSquare - Mul16 cases
---
src/library/generator.transpose.square.cpp | 124 +++++++++++++++++++++------
src/tests/accuracy_test_pow2_precallback.cpp | 62 ++++++++++++++
2 files changed, 158 insertions(+), 28 deletions(-)
diff --git a/src/library/generator.transpose.square.cpp b/src/library/generator.transpose.square.cpp
index a546aad..1a4a4d0 100644
--- a/src/library/generator.transpose.square.cpp
+++ b/src/library/generator.transpose.square.cpp
@@ -227,7 +227,17 @@ static clfftStatus genTransposePrototype( const FFTGeneratedTransposeSquareActio
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
-
+ if (params.fft_hasPreCallback)
+ {
+ if (params.fft_preCallback.localMemSize > 0)
+ {
+ clKernWrite( transKernel, 0 ) << ", __global void* userdata, __local void* localmem";
+ }
+ else
+ {
+ clKernWrite( transKernel, 0 ) << ", __global void* userdata";
+ }
+ }
// Close the method signature
@@ -302,6 +312,14 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
{
bool fwd = bothDir ? false : true;
+ //If pre-callback is set for the plan
+ if (params.fft_hasPreCallback)
+ {
+ //Insert callback function code at the beginning
+ clKernWrite( transKernel, 0 ) << params.fft_preCallback.funcstring << std::endl;
+ clKernWrite( transKernel, 0 ) << std::endl;
+ }
+
std::string funcName;
if (params.fft_3StepTwiddle) // TODO
funcName = fwd ? "transpose_square_tw_fwd" : "transpose_square_tw_back";
@@ -312,11 +330,6 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
// Generate kernel API
genTransposePrototype(params, lwSize, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput);
-
-
-
-
-
if (mult_of_16)
clKernWrite(transKernel, 3) << "const int numGroupsY_1 = " << (params.fft_N[0] / 16 / reShapeFactor)*(params.fft_N[0] / 16 / reShapeFactor + 1) / 2 << ";" << std::endl;
else
@@ -341,15 +354,19 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
switch (params.fft_inputLayout)
{
case CLFFT_COMPLEX_INTERLEAVED:
- clKernWrite(transKernel, 3) << "inputA += iOffset;" << std::endl; // Set A ptr to the start of each slice
-
+ //Do not advance offset when precallback is set as the starting address of global buffer is needed
+ if (!params.fft_hasPreCallback)
+ {
+ clKernWrite(transKernel, 3) << "inputA += iOffset;" << std::endl; // Set A ptr to the start of each slice
+ }
break;
case CLFFT_COMPLEX_PLANAR:
-
- clKernWrite(transKernel, 3) << "inputA_R += iOffset;" << std::endl; // Set A ptr to the start of each slice
- clKernWrite(transKernel, 3) << "inputA_I += iOffset;" << std::endl; // Set A ptr to the start of each slice
-
-
+ //Do not advance offset when precallback is set as the starting address of global buffer is needed
+ if (!params.fft_hasPreCallback)
+ {
+ clKernWrite(transKernel, 3) << "inputA_R += iOffset;" << std::endl; // Set A ptr to the start of each slice
+ clKernWrite(transKernel, 3) << "inputA_I += iOffset;" << std::endl; // Set A ptr to the start of each slice
+ }
break;
case CLFFT_HERMITIAN_INTERLEAVED:
case CLFFT_HERMITIAN_PLANAR:
@@ -387,15 +404,26 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
switch (params.fft_inputLayout)
{
case CLFFT_COMPLEX_INTERLEAVED:
- clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA;" << std::endl;
-
+ if (params.fft_hasPreCallback)
+ {
+ clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA + iOffset;" << std::endl;
+ }
+ else
+ {
+ clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA;" << std::endl;
+ }
break;
case CLFFT_COMPLEX_PLANAR:
-
- clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R;" << std::endl;
- clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I;" << std::endl;
-
-
+ if (params.fft_hasPreCallback)
+ {
+ clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R + iOffset;" << std::endl;
+ clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I + iOffset;" << std::endl;
+ }
+ else
+ {
+ clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R;" << std::endl;
+ clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I;" << std::endl;
+ }
break;
case CLFFT_HERMITIAN_INTERLEAVED:
case CLFFT_HERMITIAN_PLANAR:
@@ -475,18 +503,51 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
switch (params.fft_inputLayout)
{
case CLFFT_COMPLEX_INTERLEAVED:
- clKernWrite(transKernel, 6) << "tmpm = inputA[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
- clKernWrite(transKernel, 6) << "tmpt = inputA[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+ {
+ if (params.fft_hasPreCallback)
+ {
+ if (params.fft_preCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+ }
+ else
+ {
+ clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata);" << std::endl;
+ }
+ }
+ else
+ {
+ clKernWrite(transKernel, 6) << "tmpm = inputA[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpt = inputA[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+ }
+ }
break;
case CLFFT_COMPLEX_PLANAR:
dtInput = dtPlanar;
dtOutput = dtPlanar;
- clKernWrite(transKernel, 6) << "tmpm.x = inputA_R[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
- clKernWrite(transKernel, 6) << "tmpm.y = inputA_I[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
-
- clKernWrite(transKernel, 6) << "tmpt.x = inputA_R[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
- clKernWrite(transKernel, 6) << "tmpt.y = inputA_I[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
-
+ if (params.fft_hasPreCallback)
+ {
+ if (params.fft_preCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+ }
+ else
+ {
+ clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata);" << std::endl;
+ }
+ }
+ else
+ {
+ clKernWrite(transKernel, 6) << "tmpm.x = inputA_R[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpm.y = inputA_I[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
+
+ clKernWrite(transKernel, 6) << "tmpt.x = inputA_R[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpt.y = inputA_I[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+ }
break;
case CLFFT_HERMITIAN_INTERLEAVED:
case CLFFT_HERMITIAN_PLANAR:
@@ -676,6 +737,13 @@ clfftStatus FFTGeneratedTransposeSquareAction::initParams ()
this->signature.fft_R = 1; // Dont think i'll use
this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
+ //Set callback if specified
+ if (this->plan->hasPreCallback)
+ {
+ this->signature.fft_hasPreCallback = true;
+ this->signature.fft_preCallback = this->plan->preCallback;
+ }
+
return CLFFT_SUCCESS;
}
diff --git a/src/tests/accuracy_test_pow2_precallback.cpp b/src/tests/accuracy_test_pow2_precallback.cpp
index 93db6e9..ae50b62 100644
--- a/src/tests/accuracy_test_pow2_precallback.cpp
+++ b/src/tests/accuracy_test_pow2_precallback.cpp
@@ -1291,6 +1291,37 @@ TEST_F(accuracy_test_pow2_precallback_double, large_1D_forward_in_place_complex_
catch( const std::exception& err ) { handle_exception(err); }
}
+template< class T, class cl_T, class fftw_T >
+void large_1D_1048576_forward_in_place_complex_planar_to_complex_planar()
+{
+ std::vector<size_t> lengths;
+ lengths.push_back( 1048576 );
+ size_t batch = 1;
+ std::vector<size_t> input_strides;
+ std::vector<size_t> output_strides;
+ size_t input_distance = 0;
+ size_t output_distance = 0;
+ layout::buffer_layout_t in_layout = layout::complex_planar;
+ layout::buffer_layout_t out_layout = layout::complex_planar;
+ placeness::placeness_t placeness = placeness::in_place;
+ direction::direction_t direction = direction::forward;
+
+ data_pattern pattern = sawtooth;
+ precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_pow2_precallback_single, large_1D_1048576_forward_in_place_complex_planar_to_complex_planar)
+{
+ try { large_1D_1048576_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_pow2_precallback_double, large_1D_1048576_forward_in_place_complex_planar_to_complex_planar)
+{
+ try { large_1D_1048576_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
+
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ //
// ^^^^^^^^^^^^^^^^^^^^^^^ huge 1D ^^^^^^^^^^^^^^^^^^^^^^^ //
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ //
@@ -1442,6 +1473,37 @@ TEST_F(accuracy_test_pow2_precallback_double, large_1D_forward_in_place_complex_
catch( const std::exception& err ) { handle_exception(err); }
}
+template< class T, class cl_T, class fftw_T >
+void large_1D_1048576_forward_in_place_complex_interleaved_to_complex_interleaved()
+{
+ std::vector<size_t> lengths;
+ lengths.push_back( 1048576 );
+ size_t batch = 1;
+ std::vector<size_t> input_strides;
+ std::vector<size_t> output_strides;
+ size_t input_distance = 0;
+ size_t output_distance = 0;
+ layout::buffer_layout_t in_layout = layout::complex_interleaved;
+ layout::buffer_layout_t out_layout = layout::complex_interleaved;
+ placeness::placeness_t placeness = placeness::in_place;
+ direction::direction_t direction = direction::forward;
+
+ data_pattern pattern = sawtooth;
+ precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_pow2_precallback_single, large_1D_1048576_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+ try { large_1D_1048576_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_pow2_precallback_double, large_1D_1048576_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+ try { large_1D_1048576_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+ catch( const std::exception& err ) { handle_exception(err); }
+}
+
// *****************************************************
// *****************************************************
template< class T, class cl_T, class fftw_T >
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits mailing list