[clfft] 89/107: placing pieces of extracted prototype gen code

Thu Jul 30 18:06:40 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit 56a46c83fb7828f700f6b2c5dc6291ec7a1662c1
Author: bragadeesh <bragadeesh.natarajan at amd>
Date:   Fri May 29 19:37:28 2015 -0500

    placing pieces of extracted prototype gen code
---
 src/library/generator.stockham.cpp | 51 +++++++++++++++++++++++++++++++++-----
 src/library/plan.h                 | 16 +++++++++++-
 2 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index bd45288..eb8fcb7 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -605,6 +605,8 @@ namespace StockhamGenerator
 		bool rcFull;
 		bool rcSimple;
 
+		bool realSpecial;
+
 		bool enableGrouping;				
 		bool linearRegs;					// scalar registers (non-vectorized registers) to be used
 		bool halfLds;						// only half the LDS of a complex length need to be used
@@ -983,6 +985,15 @@ namespace StockhamGenerator
 					{
 						for(size_t r=0; r<radix; r++)
 						{
+							if(realSpecial && (nextPass == NULL) && (r > (radix/2)))
+								break;
+
+							if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i != 0))
+								break;
+
+							if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i == 0))
+								passStr += "\n\t}\n\tif( rw && !me)\n\t{";
+
 							for(size_t c=cStart; c<cEnd; c++) // component loop: 0 - real, 1 - imaginary
 							{
 								std::string tail;
@@ -1048,6 +1059,10 @@ namespace StockhamGenerator
 								if(interleaved && (component == SR_COMP_BOTH) && linearRegs)
 									break;
 							}
+
+							if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i == 0))
+								passStr += "\n\t}\n\tif(rw)\n\t{";
+
 						}
 
 						butterflyIndex++;
@@ -1419,10 +1434,11 @@ namespace StockhamGenerator
 
     public:
 		Pass(	size_t positionVal, size_t lengthVal, size_t radixVal, size_t cnPerWIVal,
-				size_t L, size_t LS, size_t R, bool linearRegsVal, bool halfLdsVal, bool r2cVal, bool c2rVal, bool rcFullVal, bool rcSimpleVal) :
+				size_t L, size_t LS, size_t R, bool linearRegsVal, bool halfLdsVal,
+				bool r2cVal, bool c2rVal, bool rcFullVal, bool rcSimpleVal, bool realSpecialVal) :
 			position(positionVal), length(lengthVal), radix(radixVal), cnPerWI(cnPerWIVal),
 			algL(L), algLS(LS), algR(R), linearRegs(linearRegsVal), halfLds(halfLdsVal),
-			r2c(r2cVal), c2r(c2rVal), rcFull(rcFullVal), rcSimple(rcSimpleVal),
+			r2c(r2cVal), c2r(c2rVal), rcFull(rcFullVal), rcSimple(rcSimpleVal), realSpecial(realSpecialVal),
 			enableGrouping(true),
 			numB1(0), numB2(0), numB4(0),
 			nextPass(NULL)
@@ -2053,6 +2069,7 @@ namespace StockhamGenerator
 		BlockComputeType blockComputeType;
 		size_t blockWidth, blockWGS, blockLDS;
 
+		bool realSpecial;
 
 		const FFTKernelGenKeyParams params;		// key params
 
@@ -2084,6 +2101,9 @@ namespace StockhamGenerator
 			if(r2c2r)
 				return false;
 
+			if(realSpecial)
+				return false;
+
 			if(params.fft_placeness == CLFFT_INPLACE)
 			{
 				iStride = oStride = params.fft_inStride;
@@ -2215,6 +2235,8 @@ namespace StockhamGenerator
 
 			linearRegs = halfLds;
 
+			realSpecial = params.fft_realSpecial;
+
 			blockCompute = params.blockCompute;
 			blockComputeType = params.blockComputeType;
 			// Make sure we can utilize all Lds if we are going to
@@ -2257,7 +2279,7 @@ namespace StockhamGenerator
 					R /= rad;
 
 					radices.push_back(rad);
-					passes.push_back(Pass<PR>(i, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple));
+					passes.push_back(Pass<PR>(i, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple, realSpecial));
 
 					LS *= rad;
 				}
@@ -2295,7 +2317,7 @@ namespace StockhamGenerator
 					R /= rad;
 
 					radices.push_back(rad);
-					passes.push_back(Pass<PR>(pid, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple));
+					passes.push_back(Pass<PR>(pid, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple, realSpecial));
 
 					pid++;
 					LS *= rad;
@@ -3101,6 +3123,10 @@ namespace StockhamGenerator
 					}
 				}
 
+				if(realSpecial)
+				{
+					str += "\n\tfor(uint t=0; t<2; t++)\n\t{\n\n";
+				}
 
 				// Call passes
 				if(numPasses == 1)
@@ -3118,7 +3144,7 @@ namespace StockhamGenerator
 					for(typename std::vector<Pass<PR> >::const_iterator p = passes.begin(); p != passes.end(); p++)
 					{
 						std::string exTab = "";
-						if(blockCompute) exTab = "\t";
+						if(blockCompute || realSpecial) exTab = "\t";
 
 						str += exTab;
 						str += "\t";
@@ -3183,13 +3209,23 @@ namespace StockhamGenerator
 					}
 				}
 
+				if(realSpecial)
+				{
+					size_t Nt = 1 + length/2;
+					str += 	"\n\t\tif( (batch == 0) || (2*batch == ";
+					str += SztToStr(params.fft_realSpecial_Nr); str += ") ) break;\n";
 
-				if(blockCompute)
+					str += "\t\tlwbOut += ("; str += SztToStr(params.fft_realSpecial_Nr);
+					str += " - 2*batch)*"; str += SztToStr(Nt); str += ";\n\n";
+				}
+
+				if(blockCompute || realSpecial)
 				{
 					str += "\n\t}\n\n";
 				}
 
 
+
 				// Write data from LDS for blocked access
 				if(blockCompute)
 				{
@@ -3303,6 +3339,9 @@ clfftStatus FFTGeneratedStockhamAction::initParams ()
 
 	this->signature.fft_RCsimple = this->plan->RCsimple;
 
+	this->signature.fft_realSpecial = this->plan->realSpecial;
+	this->signature.fft_realSpecial_Nr = this->plan->realSpecial_Nr;
+
 	this->signature.blockCompute = this->plan->blockCompute;
 	this->signature.blockComputeType = this->plan->blockComputeType;
 
diff --git a/src/library/plan.h b/src/library/plan.h
index a250e08..dd8b46a 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -122,8 +122,11 @@ struct FFTKernelGenKeyParams {
 	                                            // so extra twiddles are applied on output.
 	bool					 fft_twiddleFront;	// do twiddle scaling at the beginning pass
 
+	bool					 fft_realSpecial;	// this is the flag to control the special case step (4th step)
+	                                            // in the 5-step real 1D large breakdown
+	size_t					 fft_realSpecial_Nr;
 
-	bool                     fft_RCsimple;
+	bool                     fft_RCsimple; 
 
 	bool					 transOutHorizontal;	// tiles traverse the output buffer in horizontal direction
 
@@ -158,7 +161,11 @@ struct FFTKernelGenKeyParams {
 
 		transOutHorizontal = false;
 
+		fft_realSpecial = false;
+		fft_realSpecial_Nr = 0;
+
 		fft_RCsimple = false;
+
 		blockCompute = false;
 		blockComputeType = BCT_C2C;
 		blockSIMD = 0;
@@ -406,6 +413,11 @@ public:
 	// where imaginary of input is set to zero in forward and imaginary not written in backward
 	bool RCsimple;
 
+	// Real FFT special flag
+	// if this is set it means we are doing the 4th step in the 5-step real FFT breakdown algorithm
+	bool realSpecial;
+	
+	size_t realSpecial_Nr;
 
 	// User created plan
 	bool userPlan;
@@ -452,6 +464,8 @@ public:
 	,   transflag(false)
 	,	transOutHorizontal(false)
 	,	RCsimple(false)
+	,	realSpecial(false)
+	,	realSpecial_Nr(0)
 	,	userPlan(false)
 	,	blockCompute(false)
 	,	blockComputeType(BCT_C2C)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git