[clfft] 89/107: placing pieces of extracted prototype gen code
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Jul 30 18:06:40 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 56a46c83fb7828f700f6b2c5dc6291ec7a1662c1
Author: bragadeesh <bragadeesh.natarajan at amd>
Date: Fri May 29 19:37:28 2015 -0500
placing pieces of extracted prototype gen code
---
src/library/generator.stockham.cpp | 51 +++++++++++++++++++++++++++++++++-----
src/library/plan.h | 16 +++++++++++-
2 files changed, 60 insertions(+), 7 deletions(-)
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index bd45288..eb8fcb7 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -605,6 +605,8 @@ namespace StockhamGenerator
bool rcFull;
bool rcSimple;
+ bool realSpecial;
+
bool enableGrouping;
bool linearRegs; // scalar registers (non-vectorized registers) to be used
bool halfLds; // only half the LDS of a complex length need to be used
@@ -983,6 +985,15 @@ namespace StockhamGenerator
{
for(size_t r=0; r<radix; r++)
{
+ if(realSpecial && (nextPass == NULL) && (r > (radix/2)))
+ break;
+
+ if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i != 0))
+ break;
+
+ if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i == 0))
+ passStr += "\n\t}\n\tif( rw && !me)\n\t{";
+
for(size_t c=cStart; c<cEnd; c++) // component loop: 0 - real, 1 - imaginary
{
std::string tail;
@@ -1048,6 +1059,10 @@ namespace StockhamGenerator
if(interleaved && (component == SR_COMP_BOTH) && linearRegs)
break;
}
+
+ if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i == 0))
+ passStr += "\n\t}\n\tif(rw)\n\t{";
+
}
butterflyIndex++;
@@ -1419,10 +1434,11 @@ namespace StockhamGenerator
public:
Pass( size_t positionVal, size_t lengthVal, size_t radixVal, size_t cnPerWIVal,
- size_t L, size_t LS, size_t R, bool linearRegsVal, bool halfLdsVal, bool r2cVal, bool c2rVal, bool rcFullVal, bool rcSimpleVal) :
+ size_t L, size_t LS, size_t R, bool linearRegsVal, bool halfLdsVal,
+ bool r2cVal, bool c2rVal, bool rcFullVal, bool rcSimpleVal, bool realSpecialVal) :
position(positionVal), length(lengthVal), radix(radixVal), cnPerWI(cnPerWIVal),
algL(L), algLS(LS), algR(R), linearRegs(linearRegsVal), halfLds(halfLdsVal),
- r2c(r2cVal), c2r(c2rVal), rcFull(rcFullVal), rcSimple(rcSimpleVal),
+ r2c(r2cVal), c2r(c2rVal), rcFull(rcFullVal), rcSimple(rcSimpleVal), realSpecial(realSpecialVal),
enableGrouping(true),
numB1(0), numB2(0), numB4(0),
nextPass(NULL)
@@ -2053,6 +2069,7 @@ namespace StockhamGenerator
BlockComputeType blockComputeType;
size_t blockWidth, blockWGS, blockLDS;
+ bool realSpecial;
const FFTKernelGenKeyParams params; // key params
@@ -2084,6 +2101,9 @@ namespace StockhamGenerator
if(r2c2r)
return false;
+ if(realSpecial)
+ return false;
+
if(params.fft_placeness == CLFFT_INPLACE)
{
iStride = oStride = params.fft_inStride;
@@ -2215,6 +2235,8 @@ namespace StockhamGenerator
linearRegs = halfLds;
+ realSpecial = params.fft_realSpecial;
+
blockCompute = params.blockCompute;
blockComputeType = params.blockComputeType;
// Make sure we can utilize all Lds if we are going to
@@ -2257,7 +2279,7 @@ namespace StockhamGenerator
R /= rad;
radices.push_back(rad);
- passes.push_back(Pass<PR>(i, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple));
+ passes.push_back(Pass<PR>(i, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple, realSpecial));
LS *= rad;
}
@@ -2295,7 +2317,7 @@ namespace StockhamGenerator
R /= rad;
radices.push_back(rad);
- passes.push_back(Pass<PR>(pid, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple));
+ passes.push_back(Pass<PR>(pid, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple, realSpecial));
pid++;
LS *= rad;
@@ -3101,6 +3123,10 @@ namespace StockhamGenerator
}
}
+ if(realSpecial)
+ {
+ str += "\n\tfor(uint t=0; t<2; t++)\n\t{\n\n";
+ }
// Call passes
if(numPasses == 1)
@@ -3118,7 +3144,7 @@ namespace StockhamGenerator
for(typename std::vector<Pass<PR> >::const_iterator p = passes.begin(); p != passes.end(); p++)
{
std::string exTab = "";
- if(blockCompute) exTab = "\t";
+ if(blockCompute || realSpecial) exTab = "\t";
str += exTab;
str += "\t";
@@ -3183,13 +3209,23 @@ namespace StockhamGenerator
}
}
+ if(realSpecial)
+ {
+ size_t Nt = 1 + length/2;
+ str += "\n\t\tif( (batch == 0) || (2*batch == ";
+ str += SztToStr(params.fft_realSpecial_Nr); str += ") ) break;\n";
- if(blockCompute)
+ str += "\t\tlwbOut += ("; str += SztToStr(params.fft_realSpecial_Nr);
+ str += " - 2*batch)*"; str += SztToStr(Nt); str += ";\n\n";
+ }
+
+ if(blockCompute || realSpecial)
{
str += "\n\t}\n\n";
}
+
// Write data from LDS for blocked access
if(blockCompute)
{
@@ -3303,6 +3339,9 @@ clfftStatus FFTGeneratedStockhamAction::initParams ()
this->signature.fft_RCsimple = this->plan->RCsimple;
+ this->signature.fft_realSpecial = this->plan->realSpecial;
+ this->signature.fft_realSpecial_Nr = this->plan->realSpecial_Nr;
+
this->signature.blockCompute = this->plan->blockCompute;
this->signature.blockComputeType = this->plan->blockComputeType;
diff --git a/src/library/plan.h b/src/library/plan.h
index a250e08..dd8b46a 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -122,8 +122,11 @@ struct FFTKernelGenKeyParams {
// so extra twiddles are applied on output.
bool fft_twiddleFront; // do twiddle scaling at the beginning pass
+ bool fft_realSpecial; // this is the flag to control the special case step (4th step)
+ // in the 5-step real 1D large breakdown
+ size_t fft_realSpecial_Nr;
- bool fft_RCsimple;
+ bool fft_RCsimple;
bool transOutHorizontal; // tiles traverse the output buffer in horizontal direction
@@ -158,7 +161,11 @@ struct FFTKernelGenKeyParams {
transOutHorizontal = false;
+ fft_realSpecial = false;
+ fft_realSpecial_Nr = 0;
+
fft_RCsimple = false;
+
blockCompute = false;
blockComputeType = BCT_C2C;
blockSIMD = 0;
@@ -406,6 +413,11 @@ public:
// where imaginary of input is set to zero in forward and imaginary not written in backward
bool RCsimple;
+ // Real FFT special flag
+ // if this is set it means we are doing the 4th step in the 5-step real FFT breakdown algorithm
+ bool realSpecial;
+
+ size_t realSpecial_Nr;
// User created plan
bool userPlan;
@@ -452,6 +464,8 @@ public:
, transflag(false)
, transOutHorizontal(false)
, RCsimple(false)
+ , realSpecial(false)
+ , realSpecial_Nr(0)
, userPlan(false)
, blockCompute(false)
, blockComputeType(BCT_C2C)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits mailing list