[clfft] 48/128: adding twiddle multiplies to inplace generator, non-branch case only
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:37 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit e299c39ff6bd4459a450eab78137d9790a8664b9
Author: bragadeesh <bragadeesh.natarajan at amd>
Date: Thu Aug 27 17:42:02 2015 -0500
adding twiddle multiplies to inplace generator, non-branch case only
---
src/library/generator.transpose.inplace.cpp | 46 +++++++++++++++++++----------
1 file changed, 31 insertions(+), 15 deletions(-)
diff --git a/src/library/generator.transpose.inplace.cpp b/src/library/generator.transpose.inplace.cpp
index a76f7b7..1607fd7 100644
--- a/src/library/generator.transpose.inplace.cpp
+++ b/src/library/generator.transpose.inplace.cpp
@@ -159,22 +159,30 @@ static void OffsetCalc(std::stringstream& transKernel, const FFTKernelGenKeyPara
// the generator that it wants the twiddle factors generated inside of the transpose
static clfftStatus genTwiddleMath( const FFTKernelGenKeyParams& params, std::stringstream& transKernel, const std::string& dtComplex, bool fwd )
{
- clKernWrite( transKernel, 9 ) << dtComplex << " W = TW3step( (groupIndex.x * wgTileExtent.x + xInd) * (currDimIndex * wgTileExtent.y * wgUnroll + yInd) );" << std::endl;
- clKernWrite( transKernel, 9 ) << dtComplex << " T;" << std::endl;
+ clKernWrite( transKernel, 9 ) << dtComplex << " Wm = TW3step( (t_gx_p*32 + lidx) * (t_gy_p*32 + lidy + loop*8) );" << std::endl;
+ clKernWrite( transKernel, 9 ) << dtComplex << " Wt = TW3step( (t_gy_p*32 + lidx) * (t_gx_p*32 + lidy + loop*8) );" << std::endl;
+ clKernWrite( transKernel, 9 ) << dtComplex << " Tm, Tt;" << std::endl;
if(fwd)
{
- clKernWrite( transKernel, 9 ) << "T.x = ( W.x * tmp.x ) - ( W.y * tmp.y );" << std::endl;
- clKernWrite( transKernel, 9 ) << "T.y = ( W.y * tmp.x ) + ( W.x * tmp.y );" << std::endl;
+ clKernWrite( transKernel, 9 ) << "Tm.x = ( Wm.x * tmpm.x ) - ( Wm.y * tmpm.y );" << std::endl;
+ clKernWrite( transKernel, 9 ) << "Tm.y = ( Wm.y * tmpm.x ) + ( Wm.x * tmpm.y );" << std::endl;
+ clKernWrite( transKernel, 9 ) << "Tt.x = ( Wt.x * tmpt.x ) - ( Wt.y * tmpt.y );" << std::endl;
+ clKernWrite( transKernel, 9 ) << "Tt.y = ( Wt.y * tmpt.x ) + ( Wt.x * tmpt.y );" << std::endl;
}
else
{
- clKernWrite( transKernel, 9 ) << "T.x = ( W.x * tmp.x ) + ( W.y * tmp.y );" << std::endl;
- clKernWrite( transKernel, 9 ) << "T.y = -( W.y * tmp.x ) + ( W.x * tmp.y );" << std::endl;
+ clKernWrite( transKernel, 9 ) << "Tm.x = ( Wm.x * tmpm.x ) + ( Wm.y * tmpm.y );" << std::endl;
+ clKernWrite( transKernel, 9 ) << "Tm.y = -( Wm.y * tmpm.x ) + ( Wm.x * tmpm.y );" << std::endl;
+ clKernWrite( transKernel, 9 ) << "Tt.x = ( Wt.x * tmpt.x ) + ( Wt.y * tmpt.y );" << std::endl;
+ clKernWrite( transKernel, 9 ) << "Tt.y = -( Wt.y * tmpt.x ) + ( Wt.x * tmpt.y );" << std::endl;
}
- clKernWrite( transKernel, 9 ) << "tmp.x = T.x;" << std::endl;
- clKernWrite( transKernel, 9 ) << "tmp.y = T.y;" << std::endl;
+ clKernWrite( transKernel, 9 ) << "tmpm.x = Tm.x;" << std::endl;
+ clKernWrite( transKernel, 9 ) << "tmpm.y = Tm.y;" << std::endl;
+ clKernWrite( transKernel, 9 ) << "tmpt.x = Tt.x;" << std::endl;
+ clKernWrite( transKernel, 9 ) << "tmpt.y = Tt.y;" << std::endl;
+
return CLFFT_SUCCESS;
}
@@ -446,6 +454,8 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeInplaceAction:
clKernWrite(transKernel, 3) << "__local "<<dtComplex<<" xy_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
clKernWrite(transKernel, 3) << "__local "<<dtComplex<<" yx_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
+ clKernWrite(transKernel, 3) << dtComplex << " tmpm, tmpt;" << std::endl;
+
clKernWrite(transKernel, 3) << "" << std::endl;
// Step 1: Load both blocks into local memory
@@ -463,17 +473,17 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeInplaceAction:
switch (params.fft_inputLayout)
{
case CLFFT_COMPLEX_INTERLEAVED:
- clKernWrite(transKernel, 6) << "xy_s[index] = inputA[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
- clKernWrite(transKernel, 6) << "yx_s[index] = inputA[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpm = inputA[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpt = inputA[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
break;
case CLFFT_COMPLEX_PLANAR:
dtInput = dtPlanar;
dtOutput = dtPlanar;
- clKernWrite(transKernel, 6) << "xy_s[index].x = inputA_R[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
- clKernWrite(transKernel, 6) << "xy_s[index].y = inputA_I[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpm.x = inputA_R[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpm.y = inputA_I[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
- clKernWrite(transKernel, 6) << "yx_s[index].x = inputA_R[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
- clKernWrite(transKernel, 6) << "yx_s[index].y = inputA_I[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpt.x = inputA_R[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+ clKernWrite(transKernel, 6) << "tmpt.y = inputA_I[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
break;
case CLFFT_HERMITIAN_INTERLEAVED:
@@ -485,6 +495,12 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeInplaceAction:
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
+ // If requested, generate the Twiddle math to multiply constant values
+ if( params.fft_3StepTwiddle )
+ genTwiddleMath( params, transKernel, dtComplex, fwd );
+
+ clKernWrite( transKernel, 6 ) << "xy_s[index] = tmpm; " << std::endl;
+ clKernWrite( transKernel, 6 ) << "yx_s[index] = tmpt; " << std::endl;
clKernWrite(transKernel, 3) << "}" << std::endl;
@@ -591,7 +607,7 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeInplaceAction:
break;
}
- return CLFFT_SUCCESS;
+ return CLFFT_SUCCESS;
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list