[clfft] 80/128: Precallback - support precallback in the new generator TransposeSquare - Mul16 cases

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:41 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit 425ad9893934262513f91eb2c11d8374229ce3a3
Author: Pradeep <pradeep.rao at amd.com>
Date:   Mon Sep 21 15:26:07 2015 +0530

    Precallback - support precallback in the new generator TransposeSquare - Mul16 cases
---
 src/library/generator.transpose.square.cpp   | 124 +++++++++++++++++++++------
 src/tests/accuracy_test_pow2_precallback.cpp |  62 ++++++++++++++
 2 files changed, 158 insertions(+), 28 deletions(-)

diff --git a/src/library/generator.transpose.square.cpp b/src/library/generator.transpose.square.cpp
index a546aad..1a4a4d0 100644
--- a/src/library/generator.transpose.square.cpp
+++ b/src/library/generator.transpose.square.cpp
@@ -227,7 +227,17 @@ static clfftStatus genTransposePrototype( const FFTGeneratedTransposeSquareActio
 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
 	}
 
-
+	if (params.fft_hasPreCallback)
+	{
+		if (params.fft_preCallback.localMemSize > 0)
+		{
+			clKernWrite( transKernel, 0 ) << ", __global void* userdata, __local void* localmem";
+		}
+		else
+		{
+			clKernWrite( transKernel, 0 ) << ", __global void* userdata";
+		}
+	}
 
 
     // Close the method signature
@@ -302,6 +312,14 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 	{
 		bool fwd = bothDir ? false : true;
 
+		//If pre-callback is set for the plan
+		if (params.fft_hasPreCallback)
+		{
+			//Insert callback function code at the beginning 
+			clKernWrite( transKernel, 0 ) << params.fft_preCallback.funcstring << std::endl;
+			clKernWrite( transKernel, 0 ) << std::endl;
+		}
+
 		std::string funcName;
 		if (params.fft_3StepTwiddle) // TODO
 			funcName = fwd ? "transpose_square_tw_fwd" : "transpose_square_tw_back";
@@ -312,11 +330,6 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 		// Generate kernel API
 		genTransposePrototype(params, lwSize, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput);
 
-
-		
-
-
-
 		if (mult_of_16)
 			clKernWrite(transKernel, 3) << "const int numGroupsY_1 = " << (params.fft_N[0] / 16 / reShapeFactor)*(params.fft_N[0] / 16 / reShapeFactor + 1) / 2 << ";" << std::endl;
 		else
@@ -341,15 +354,19 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 		switch (params.fft_inputLayout)
 		{
 			case CLFFT_COMPLEX_INTERLEAVED:
-				clKernWrite(transKernel, 3) << "inputA += iOffset;" << std::endl;  // Set A ptr to the start of each slice
-
+				//Do not advance offset when precallback is set as the starting address of global buffer is needed
+				if (!params.fft_hasPreCallback)
+				{
+					clKernWrite(transKernel, 3) << "inputA += iOffset;" << std::endl;  // Set A ptr to the start of each slice
+				}
 				break;
 			case CLFFT_COMPLEX_PLANAR:
-
-				clKernWrite(transKernel, 3) << "inputA_R += iOffset;" << std::endl;  // Set A ptr to the start of each slice 
-				clKernWrite(transKernel, 3) << "inputA_I += iOffset;" << std::endl;  // Set A ptr to the start of each slice 
-
-				
+				//Do not advance offset when precallback is set as the starting address of global buffer is needed
+				if (!params.fft_hasPreCallback)
+				{
+					clKernWrite(transKernel, 3) << "inputA_R += iOffset;" << std::endl;  // Set A ptr to the start of each slice 
+					clKernWrite(transKernel, 3) << "inputA_I += iOffset;" << std::endl;  // Set A ptr to the start of each slice 
+				}				
 				break;
 			case CLFFT_HERMITIAN_INTERLEAVED:
 			case CLFFT_HERMITIAN_PLANAR:
@@ -387,15 +404,26 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 			switch (params.fft_inputLayout)
 			{
 				case CLFFT_COMPLEX_INTERLEAVED:
-					clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA;" << std::endl; 
-
+					if (params.fft_hasPreCallback)
+					{
+						clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA + iOffset;" << std::endl; 
+					}
+					else
+					{
+						clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA;" << std::endl; 
+					}
 					break;
 				case CLFFT_COMPLEX_PLANAR:
-
-					clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R;" << std::endl;
-					clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I;" << std::endl;
-
-				
+					if (params.fft_hasPreCallback)
+					{
+						clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R + iOffset;" << std::endl;
+						clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I + iOffset;" << std::endl;
+					}
+					else
+					{
+						clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R;" << std::endl;
+						clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I;" << std::endl;
+					}				
 					break;
 				case CLFFT_HERMITIAN_INTERLEAVED:
 				case CLFFT_HERMITIAN_PLANAR:
@@ -475,18 +503,51 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 			switch (params.fft_inputLayout)
 			{
 				case CLFFT_COMPLEX_INTERLEAVED:
-					clKernWrite(transKernel, 6) << "tmpm = inputA[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
-					clKernWrite(transKernel, 6) << "tmpt = inputA[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+					{
+						if (params.fft_hasPreCallback)
+						{
+							if (params.fft_preCallback.localMemSize > 0)
+							{
+								clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
+								clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+							}
+							else
+							{
+								clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
+								clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata);" << std::endl;
+							}
+						}
+						else
+						{
+							clKernWrite(transKernel, 6) << "tmpm = inputA[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
+							clKernWrite(transKernel, 6) << "tmpt = inputA[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+						}
+					}
 					break;
 				case CLFFT_COMPLEX_PLANAR:
 					dtInput = dtPlanar;
 					dtOutput = dtPlanar;
-					clKernWrite(transKernel, 6) << "tmpm.x = inputA_R[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
-					clKernWrite(transKernel, 6) << "tmpm.y = inputA_I[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
-
-					clKernWrite(transKernel, 6) << "tmpt.x = inputA_R[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
-					clKernWrite(transKernel, 6) << "tmpt.y = inputA_I[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
-
+					if (params.fft_hasPreCallback)
+					{
+						if (params.fft_preCallback.localMemSize > 0)
+						{
+							clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata, localmem);" << std::endl;
+							clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata, localmem);" << std::endl;
+						}
+						else
+						{
+							clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, userdata);" << std::endl;
+							clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, userdata);" << std::endl;
+						}
+					}
+					else
+					{
+						clKernWrite(transKernel, 6) << "tmpm.x = inputA_R[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
+						clKernWrite(transKernel, 6) << "tmpm.y = inputA_I[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
+
+						clKernWrite(transKernel, 6) << "tmpt.x = inputA_R[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+						clKernWrite(transKernel, 6) << "tmpt.y = inputA_I[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+					}
 					break;
 				case CLFFT_HERMITIAN_INTERLEAVED:
 				case CLFFT_HERMITIAN_PLANAR:
@@ -676,6 +737,13 @@ clfftStatus FFTGeneratedTransposeSquareAction::initParams ()
     this->signature.fft_R = 1; // Dont think i'll use
     this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
 
+	//Set callback if specified
+	if (this->plan->hasPreCallback)
+	{
+		this->signature.fft_hasPreCallback = true;
+		this->signature.fft_preCallback = this->plan->preCallback;
+	}
+
     return CLFFT_SUCCESS;
 }
 
diff --git a/src/tests/accuracy_test_pow2_precallback.cpp b/src/tests/accuracy_test_pow2_precallback.cpp
index 93db6e9..ae50b62 100644
--- a/src/tests/accuracy_test_pow2_precallback.cpp
+++ b/src/tests/accuracy_test_pow2_precallback.cpp
@@ -1291,6 +1291,37 @@ TEST_F(accuracy_test_pow2_precallback_double, large_1D_forward_in_place_complex_
 	catch( const std::exception& err ) { handle_exception(err);	}
 }
 
+template< class T, class cl_T, class fftw_T >
+void large_1D_1048576_forward_in_place_complex_planar_to_complex_planar()
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( 1048576 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_planar;
+	layout::buffer_layout_t out_layout = layout::complex_planar;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_pow2_precallback_single, large_1D_1048576_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { large_1D_1048576_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_pow2_precallback_double, large_1D_1048576_forward_in_place_complex_planar_to_complex_planar)
+{
+	try { large_1D_1048576_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
 // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ //
 // ^^^^^^^^^^^^^^^^^^^^^^^ huge 1D ^^^^^^^^^^^^^^^^^^^^^^^ //
 // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ //
@@ -1442,6 +1473,37 @@ TEST_F(accuracy_test_pow2_precallback_double, large_1D_forward_in_place_complex_
 	catch( const std::exception& err ) { handle_exception(err);	}
 }
 
+template< class T, class cl_T, class fftw_T >
+void large_1D_1048576_forward_in_place_complex_interleaved_to_complex_interleaved()
+{
+	std::vector<size_t> lengths;
+	lengths.push_back( 1048576 );
+	size_t batch = 1;
+	std::vector<size_t> input_strides;
+	std::vector<size_t> output_strides;
+	size_t input_distance = 0;
+	size_t output_distance = 0;
+	layout::buffer_layout_t in_layout = layout::complex_interleaved;
+	layout::buffer_layout_t out_layout = layout::complex_interleaved;
+	placeness::placeness_t placeness = placeness::in_place;
+	direction::direction_t direction = direction::forward;
+
+	data_pattern pattern = sawtooth;
+	precallback_complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness );
+}
+
+TEST_F(accuracy_test_pow2_precallback_single, large_1D_1048576_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { large_1D_1048576_forward_in_place_complex_interleaved_to_complex_interleaved< float, cl_float, fftwf_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
+TEST_F(accuracy_test_pow2_precallback_double, large_1D_1048576_forward_in_place_complex_interleaved_to_complex_interleaved)
+{
+	try { large_1D_1048576_forward_in_place_complex_interleaved_to_complex_interleaved< double, cl_double, fftw_complex >(); }
+	catch( const std::exception& err ) { handle_exception(err);	}
+}
+
 // *****************************************************
 // *****************************************************
 template< class T, class cl_T, class fftw_T >

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list