[clfft] 85/128: adding radix7 functionality
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:42 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit d6c6127d81b15e05773c2225528c5de632e40367
Author: bragadeesh <bragadeesh.natarajan at amd>
Date: Wed Sep 23 12:18:04 2015 -0500
adding radix7 functionality
---
src/library/generator.stockham.cpp | 132 ++++++++------
src/library/generator.stockham.h | 350 +++++++++++++++++++++++++++++++++++++
src/library/plan.cpp | 26 ++-
src/library/private.h | 2 +
src/tests/accuracy_test_random.cpp | 1 +
src/tests/unit_test.cpp | 12 +-
6 files changed, 451 insertions(+), 72 deletions(-)
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 4161279..08028ee 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -385,7 +385,7 @@ namespace StockhamGenerator
// Given the length of 1d fft, this function determines the appropriate work group size
// and the number of transforms per work group
// TODO for optimizations - experiment with different possibilities for work group sizes and num transforms for improving performance
- void DetermineSizes(const size_t &MAX_WGS, const size_t &length, size_t &workGroupSize, size_t &numTrans)
+ void DetermineSizes(const size_t &MAX_WGS, const size_t &length, size_t &workGroupSize, size_t &numTrans, Precision &pr)
{
assert(MAX_WGS >= 64);
@@ -396,7 +396,7 @@ namespace StockhamGenerator
return;
}
- size_t baseRadix[] = {5,3,2}; // list only supported primes
+ size_t baseRadix[] = {7,5,3,2}; // list only supported primes
size_t baseRadixSize = sizeof(baseRadix)/sizeof(baseRadix[0]);
size_t l = length;
@@ -418,9 +418,7 @@ namespace StockhamGenerator
if (primeFactorsExpanded[2] == length) // Length is pure power of 2
{
- //if(length == 1024) { workGroupSize = 128; numTrans = 1; }
if (length >= 1024) { workGroupSize = (MAX_WGS >= 256) ? 256 : MAX_WGS; numTrans = 1; }
- //else if (length == 512) { workGroupSize = (MAX_WGS >= 128) ? 128 : MAX_WGS; numTrans = 1; }
else if (length == 512) { workGroupSize = 64; numTrans = 1; }
else if (length >= 16) { workGroupSize = 64; numTrans = 256/length; }
else { workGroupSize = 64; numTrans = 128/length; }
@@ -428,69 +426,79 @@ namespace StockhamGenerator
else if (primeFactorsExpanded[3] == length) // Length is pure power of 3
{
workGroupSize = (MAX_WGS >= 256) ? 243 : 27;
- if(length >= 3*workGroupSize) numTrans = 1;
- else numTrans = (3*workGroupSize)/length;
+ numTrans = length >= 3*workGroupSize ? 1 : (3*workGroupSize)/length;
}
else if (primeFactorsExpanded[5] == length) // Length is pure power of 5
{
workGroupSize = (MAX_WGS >= 128) ? 125 : 25;
- if(length >= 5*workGroupSize) numTrans = 1;
- else numTrans = (5*workGroupSize)/length;
+ numTrans = length >= 5*workGroupSize ? 1 : (5*workGroupSize)/length;
}
- else
+ else if (primeFactorsExpanded[7] == length) // Length is pure power of 7
{
- size_t leastNumPerWI; // least number of elements in one work item
- size_t maxWorkGroupSize; // maximum work group size desired
-
- if (primeFactorsExpanded[2] * primeFactorsExpanded[3] == length) // Length is mix of 2&3 only
- {
- if(!(length%12)) { leastNumPerWI = 12; maxWorkGroupSize = (MAX_WGS >= 128) ? 128 : MAX_WGS; }
- else { leastNumPerWI = 6; maxWorkGroupSize = (MAX_WGS >= 256) ? 256 : MAX_WGS; }
- }
- else if (primeFactorsExpanded[2] * primeFactorsExpanded[5] == length) // Length is mix of 2&5 only
- {
- if(!(length%20)) { leastNumPerWI = 20; maxWorkGroupSize = 64; }
- else { leastNumPerWI = 10; maxWorkGroupSize = (MAX_WGS >= 128) ? 128 : MAX_WGS; }
- }
- else if (primeFactorsExpanded[3] * primeFactorsExpanded[5] == length) // Length is mix of 3&5 only
- {
- leastNumPerWI = 15;
- maxWorkGroupSize = 64;
- }
- else
- {
- leastNumPerWI = 30;
- maxWorkGroupSize = 64;
- }
-
-
- // Make sure the work group size does not exceed MAX_WGS
- // for large problems sizes, this means doing more work per work-item
- size_t lnpi;
- size_t ft = 1;
- while(1)
- {
- lnpi = leastNumPerWI * ft++;
- if(length%lnpi) continue;
-
- if( (length/lnpi) <= MAX_WGS )
- {
+ workGroupSize = 49;
+ numTrans = length >= 7*workGroupSize ? 1 : (7*workGroupSize)/length;
+ } else {
+ size_t leastNumPerWI = 1; // least number of elements in one work item
+ size_t maxWorkGroupSize = MAX_WGS; // maximum work group size desired
+
+
+ if (primeFactorsExpanded[2] * primeFactorsExpanded[3] == length) {
+ if (length % 12 == 0) {
+ leastNumPerWI = 12; maxWorkGroupSize = 128;
+ } else {
+ leastNumPerWI = 6; maxWorkGroupSize = 256;
+ }
+ } else if (primeFactorsExpanded[2] * primeFactorsExpanded[5] == length) {
+ if (length % 20 == 0) {
+ leastNumPerWI = 20; maxWorkGroupSize = 64;
+ } else {
+ leastNumPerWI = 10; maxWorkGroupSize = 128;
+ }
+ } else if (primeFactorsExpanded[2] * primeFactorsExpanded[7] == length) {
+ leastNumPerWI = 14; maxWorkGroupSize = 64;
+ } else if (primeFactorsExpanded[3] * primeFactorsExpanded[5] == length) {
+ leastNumPerWI = 15; maxWorkGroupSize = 128;
+ } else if (primeFactorsExpanded[3] * primeFactorsExpanded[7] == length) {
+ leastNumPerWI = 21; maxWorkGroupSize = 128;
+ } else if (primeFactorsExpanded[5] * primeFactorsExpanded[7] == length) {
+ leastNumPerWI = 35; maxWorkGroupSize = 64;
+ } else if (primeFactorsExpanded[2] * primeFactorsExpanded[3] * primeFactorsExpanded[5] == length) {
+ leastNumPerWI = 30; maxWorkGroupSize = 64;
+ } else if (primeFactorsExpanded[2] * primeFactorsExpanded[3] * primeFactorsExpanded[7] == length) {
+ leastNumPerWI = 42; maxWorkGroupSize = 60;
+ } else if (primeFactorsExpanded[2] * primeFactorsExpanded[5] * primeFactorsExpanded[7] == length) {
+ leastNumPerWI = 70; maxWorkGroupSize = 36;
+ } else if (primeFactorsExpanded[3] * primeFactorsExpanded[5] * primeFactorsExpanded[7] == length) {
+ leastNumPerWI =105; maxWorkGroupSize = 24;
+ } else {
+ leastNumPerWI =210; maxWorkGroupSize = 12;
+ }
+ if (pr==P_DOUBLE)
+ {
+ //leastNumPerWI /= 2;
+ maxWorkGroupSize /= 2;
+ }
+
+
+ if (maxWorkGroupSize > MAX_WGS)
+ maxWorkGroupSize = MAX_WGS;
+ assert (leastNumPerWI > 0 && length % leastNumPerWI == 0);
+
+ for (size_t lnpi = leastNumPerWI; lnpi <= length; lnpi += leastNumPerWI) {
+ if (length % lnpi != 0) continue;
+
+ if (length / lnpi <= MAX_WGS) {
leastNumPerWI = lnpi;
break;
}
}
- numTrans = 1;
- size_t n=1;
- while( ((n*length)/leastNumPerWI) <= maxWorkGroupSize )
- {
- numTrans = n;
- n++;
- }
-
- workGroupSize = (numTrans*length)/leastNumPerWI;
- assert(workGroupSize <= MAX_WGS);
+ numTrans = maxWorkGroupSize / (length / leastNumPerWI);
+ numTrans = numTrans < 1 ? 1 : numTrans;
+ workGroupSize = numTrans * (length / leastNumPerWI);
}
+
+ assert(workGroupSize <= MAX_WGS);
}
// Twiddle factors table
@@ -2499,7 +2507,7 @@ namespace StockhamGenerator
else
{
// Possible radices
- size_t cRad[] = {10,8,6,5,4,3,2,1}; // Must be in descending order
+ size_t cRad[] = {10,8,7,6,5,4,3,2,1}; // Must be in descending order
size_t cRadSize = (sizeof(cRad)/sizeof(cRad[0]));
// Generate the radix and pass objects
@@ -2713,6 +2721,16 @@ namespace StockhamGenerator
str += "#define C3QA 0.50000000000000000000000000000000"; str += sfx; str += "\n";
str += "#define C3QB 0.86602540378443864676372317075294"; str += sfx; str += "\n";
+
+ str += "#define C7Q1 -1.16666666666666651863693004997913"; str += sfx; str += "\n";
+ str += "#define C7Q2 0.79015646852540022404554065360571"; str += sfx; str += "\n";
+ str += "#define C7Q3 0.05585426728964774240049351305970"; str += sfx; str += "\n";
+ str += "#define C7Q4 0.73430220123575240531721419756650"; str += sfx; str += "\n";
+ str += "#define C7Q5 0.44095855184409837868031445395900"; str += sfx; str += "\n";
+ str += "#define C7Q6 0.34087293062393136944265847887436"; str += sfx; str += "\n";
+ str += "#define C7Q7 -0.53396936033772524066165487965918"; str += sfx; str += "\n";
+ str += "#define C7Q8 0.87484229096165666561546458979137"; str += sfx; str += "\n";
+
str += "\n";
bool cReg = linearRegs ? true : false;
@@ -3609,7 +3627,7 @@ clfftStatus FFTGeneratedStockhamAction::initParams ()
nt = t_nt;
}
else
- DetermineSizes(this->plan->envelope.limit_WorkGroupSize, this->signature.fft_N[0], wgs, nt);
+ DetermineSizes(this->plan->envelope.limit_WorkGroupSize, this->signature.fft_N[0], wgs, nt, pr);
#endif
assert((nt * this->signature.fft_N[0]) >= wgs);
diff --git a/src/library/generator.stockham.h b/src/library/generator.stockham.h
index 201e4a0..792037d 100644
--- a/src/library/generator.stockham.h
+++ b/src/library/generator.stockham.h
@@ -886,6 +886,353 @@ namespace StockhamGenerator
}
}
} break;
+ case 7:
+ {
+ static const char *C7SFR = "\
+ /*FFT7 Forward Real */ \n\
+ \n\
+ pr0 = *R1 + *R6; \n\
+ pi0 = *I1 + *I6; \n\
+ pr1 = *R1 - *R6; \n\
+ pi1 = *I1 - *I6; \n\
+ pr2 = *R2 + *R5; \n\
+ pi2 = *I2 + *I5; \n\
+ pr3 = *R2 - *R5; \n\
+ pi3 = *I2 - *I5; \n\
+ pr4 = *R4 + *R3; \n\
+ pi4 = *I4 + *I3; \n\
+ pr5 = *R4 - *R3; \n\
+ pi5 = *I4 - *I3; \n\
+ \n\
+ pr6 = pr2 + pr0; \n\
+ pi6 = pi2 + pi0; \n\
+ qr4 = pr2 - pr0; \n\
+ qi4 = pi2 - pi0; \n\
+ qr2 = pr0 - pr4; \n\
+ qi2 = pi0 - pi4; \n\
+ qr3 = pr4 - pr2; \n\
+ qi3 = pi4 - pi2; \n\
+ pr7 = pr5 + pr3; \n\
+ pi7 = pi5 + pi3; \n\
+ qr7 = pr5 - pr3; \n\
+ qi7 = pi5 - pi3; \n\
+ qr6 = pr1 - pr5; \n\
+ qi6 = pi1 - pi5; \n\
+ qr8 = pr3 - pr1; \n\
+ qi8 = pi3 - pi1; \n\
+ qr1 = pr6 + pr4; \n\
+ qi1 = pi6 + pi4; \n\
+ qr5 = pr7 + pr1; \n\
+ qi5 = pi7 + pi1; \n\
+ qr0 = *R0 + qr1; \n\
+ qi0 = *I0 + qi1; \n\
+ \n\
+ qr1 *= C7Q1; \n\
+ qi1 *= C7Q1; \n\
+ qr2 *= C7Q2; \n\
+ qi2 *= C7Q2; \n\
+ qr3 *= C7Q3; \n\
+ qi3 *= C7Q3; \n\
+ qr4 *= C7Q4; \n\
+ qi4 *= C7Q4; \n\
+ \n\
+ qr5 *= (C7Q5); \n\
+ qi5 *= (C7Q5); \n\
+ qr6 *= (C7Q6); \n\
+ qi6 *= (C7Q6); \n\
+ qr7 *= (C7Q7); \n\
+ qi7 *= (C7Q7); \n\
+ qr8 *= (C7Q8); \n\
+ qi8 *= (C7Q8); \n\
+ \n\
+ pr0 = qr0 + qr1; \n\
+ pi0 = qi0 + qi1; \n\
+ pr1 = qr2 + qr3; \n\
+ pi1 = qi2 + qi3; \n\
+ pr2 = qr4 - qr3; \n\
+ pi2 = qi4 - qi3; \n\
+ pr3 = -qr2 - qr4; \n\
+ pi3 = -qi2 - qi4; \n\
+ pr4 = qr6 + qr7; \n\
+ pi4 = qi6 + qi7; \n\
+ pr5 = qr8 - qr7; \n\
+ pi5 = qi8 - qi7; \n\
+ pr6 = -qr8 - qr6; \n\
+ pi6 = -qi8 - qi6; \n\
+ pr7 = pr0 + pr1; \n\
+ pi7 = pi0 + pi1; \n\
+ pr8 = pr0 + pr2; \n\
+ pi8 = pi0 + pi2; \n\
+ pr9 = pr0 + pr3; \n\
+ pi9 = pi0 + pi3; \n\
+ qr6 = pr4 + qr5; \n\
+ qi6 = pi4 + qi5; \n\
+ qr7 = pr5 + qr5; \n\
+ qi7 = pi5 + qi5; \n\
+ qr8 = pr6 + qr5; \n\
+ qi8 = pi6 + qi5; \n\
+ \n\
+ TR0 = qr0; TI0 = qi0; \n\
+ TR1 = pr7 + qi6; \n\
+ TI1 = pi7 - qr6; \n\
+ TR2 = pr9 + qi8; \n\
+ TI2 = pi9 - qr8; \n\
+ TR3 = pr8 - qi7; \n\
+ TI3 = pi8 + qr7; \n\
+ TR4 = pr8 + qi7; \n\
+ TI4 = pi8 - qr7; \n\
+ TR5 = pr9 - qi8; \n\
+ TI5 = pi9 + qr8; \n\
+ TR6 = pr7 - qi6; \n\
+ TI6 = pi7 + qr6; \n\
+ ";
+
+ static const char *C7SBR = "\
+ /*FFT7 Backward Real */ \n\
+ \n\
+ pr0 = *R1 + *R6; \n\
+ pi0 = *I1 + *I6; \n\
+ pr1 = *R1 - *R6; \n\
+ pi1 = *I1 - *I6; \n\
+ pr2 = *R2 + *R5; \n\
+ pi2 = *I2 + *I5; \n\
+ pr3 = *R2 - *R5; \n\
+ pi3 = *I2 - *I5; \n\
+ pr4 = *R4 + *R3; \n\
+ pi4 = *I4 + *I3; \n\
+ pr5 = *R4 - *R3; \n\
+ pi5 = *I4 - *I3; \n\
+ \n\
+ pr6 = pr2 + pr0; \n\
+ pi6 = pi2 + pi0; \n\
+ qr4 = pr2 - pr0; \n\
+ qi4 = pi2 - pi0; \n\
+ qr2 = pr0 - pr4; \n\
+ qi2 = pi0 - pi4; \n\
+ qr3 = pr4 - pr2; \n\
+ qi3 = pi4 - pi2; \n\
+ pr7 = pr5 + pr3; \n\
+ pi7 = pi5 + pi3; \n\
+ qr7 = pr5 - pr3; \n\
+ qi7 = pi5 - pi3; \n\
+ qr6 = pr1 - pr5; \n\
+ qi6 = pi1 - pi5; \n\
+ qr8 = pr3 - pr1; \n\
+ qi8 = pi3 - pi1; \n\
+ qr1 = pr6 + pr4; \n\
+ qi1 = pi6 + pi4; \n\
+ qr5 = pr7 + pr1; \n\
+ qi5 = pi7 + pi1; \n\
+ qr0 = *R0 + qr1; \n\
+ qi0 = *I0 + qi1; \n\
+ \n\
+ qr1 *= C7Q1; \n\
+ qi1 *= C7Q1; \n\
+ qr2 *= C7Q2; \n\
+ qi2 *= C7Q2; \n\
+ qr3 *= C7Q3; \n\
+ qi3 *= C7Q3; \n\
+ qr4 *= C7Q4; \n\
+ qi4 *= C7Q4; \n\
+ \n\
+ qr5 *= -(C7Q5); \n\
+ qi5 *= -(C7Q5); \n\
+ qr6 *= -(C7Q6); \n\
+ qi6 *= -(C7Q6); \n\
+ qr7 *= -(C7Q7); \n\
+ qi7 *= -(C7Q7); \n\
+ qr8 *= -(C7Q8); \n\
+ qi8 *= -(C7Q8); \n\
+ \n\
+ pr0 = qr0 + qr1; \n\
+ pi0 = qi0 + qi1; \n\
+ pr1 = qr2 + qr3; \n\
+ pi1 = qi2 + qi3; \n\
+ pr2 = qr4 - qr3; \n\
+ pi2 = qi4 - qi3; \n\
+ pr3 = -qr2 - qr4; \n\
+ pi3 = -qi2 - qi4; \n\
+ pr4 = qr6 + qr7; \n\
+ pi4 = qi6 + qi7; \n\
+ pr5 = qr8 - qr7; \n\
+ pi5 = qi8 - qi7; \n\
+ pr6 = -qr8 - qr6; \n\
+ pi6 = -qi8 - qi6; \n\
+ pr7 = pr0 + pr1; \n\
+ pi7 = pi0 + pi1; \n\
+ pr8 = pr0 + pr2; \n\
+ pi8 = pi0 + pi2; \n\
+ pr9 = pr0 + pr3; \n\
+ pi9 = pi0 + pi3; \n\
+ qr6 = pr4 + qr5; \n\
+ qi6 = pi4 + qi5; \n\
+ qr7 = pr5 + qr5; \n\
+ qi7 = pi5 + qi5; \n\
+ qr8 = pr6 + qr5; \n\
+ qi8 = pi6 + qi5; \n\
+ \n\
+ TR0 = qr0; TI0 = qi0; \n\
+ TR1 = pr7 + qi6; \n\
+ TI1 = pi7 - qr6; \n\
+ TR2 = pr9 + qi8; \n\
+ TI2 = pi9 - qr8; \n\
+ TR3 = pr8 - qi7; \n\
+ TI3 = pi8 + qr7; \n\
+ TR4 = pr8 + qi7; \n\
+ TI4 = pi8 - qr7; \n\
+ TR5 = pr9 - qi8; \n\
+ TI5 = pi9 + qr8; \n\
+ TR6 = pr7 - qi6; \n\
+ TI6 = pi7 + qr6; \n\
+ ";
+
+ static const char *C7SFC = "\
+ /*FFT7 Forward Complex */ \n\
+ \n\
+ p0 = *R1 + *R6; \n\
+ p1 = *R1 - *R6; \n\
+ p2 = *R2 + *R5; \n\
+ p3 = *R2 - *R5; \n\
+ p4 = *R4 + *R3; \n\
+ p5 = *R4 - *R3; \n\
+ \n\
+ p6 = p2 + p0; \n\
+ q4 = p2 - p0; \n\
+ q2 = p0 - p4; \n\
+ q3 = p4 - p2; \n\
+ p7 = p5 + p3; \n\
+ q7 = p5 - p3; \n\
+ q6 = p1 - p5; \n\
+ q8 = p3 - p1; \n\
+ q1 = p6 + p4; \n\
+ q5 = p7 + p1; \n\
+ q0 = *R0 + q1; \n\
+ \n\
+ q1 *= C7Q1; \n\
+ q2 *= C7Q2; \n\
+ q3 *= C7Q3; \n\
+ q4 *= C7Q4; \n\
+ \n\
+ q5 *= (C7Q5); \n\
+ q6 *= (C7Q6); \n\
+ q7 *= (C7Q7); \n\
+ q8 *= (C7Q8); \n\
+ \n\
+ p0 = q0 + q1; \n\
+ p1 = q2 + q3; \n\
+ p2 = q4 - q3; \n\
+ p3 = -q2 - q4; \n\
+ p4 = q6 + q7; \n\
+ p5 = q8 - q7; \n\
+ p6 = -q8 - q6; \n\
+ p7 = p0 + p1; \n\
+ p8 = p0 + p2; \n\
+ p9 = p0 + p3; \n\
+ q6 = p4 + q5; \n\
+ q7 = p5 + q5; \n\
+ q8 = p6 + q5; \n\
+ \n\
+ *R0 = q0; \n\
+ (*R1).x = p7.x + q6.y; \n\
+ (*R1).y = p7.y - q6.x; \n\
+ (*R2).x = p9.x + q8.y; \n\
+ (*R2).y = p9.y - q8.x; \n\
+ (*R3).x = p8.x - q7.y; \n\
+ (*R3).y = p8.y + q7.x; \n\
+ (*R4).x = p8.x + q7.y; \n\
+ (*R4).y = p8.y - q7.x; \n\
+ (*R5).x = p9.x - q8.y; \n\
+ (*R5).y = p9.y + q8.x; \n\
+ (*R6).x = p7.x - q6.y; \n\
+ (*R6).y = p7.y + q6.x; \n\
+ ";
+
+ static const char *C7SBC = "\
+ /*FFT7 Backward Complex */ \n\
+ \n\
+ p0 = *R1 + *R6; \n\
+ p1 = *R1 - *R6; \n\
+ p2 = *R2 + *R5; \n\
+ p3 = *R2 - *R5; \n\
+ p4 = *R4 + *R3; \n\
+ p5 = *R4 - *R3; \n\
+ \n\
+ p6 = p2 + p0; \n\
+ q4 = p2 - p0; \n\
+ q2 = p0 - p4; \n\
+ q3 = p4 - p2; \n\
+ p7 = p5 + p3; \n\
+ q7 = p5 - p3; \n\
+ q6 = p1 - p5; \n\
+ q8 = p3 - p1; \n\
+ q1 = p6 + p4; \n\
+ q5 = p7 + p1; \n\
+ q0 = *R0 + q1; \n\
+ \n\
+ q1 *= C7Q1; \n\
+ q2 *= C7Q2; \n\
+ q3 *= C7Q3; \n\
+ q4 *= C7Q4; \n\
+ \n\
+ q5 *= -(C7Q5); \n\
+ q6 *= -(C7Q6); \n\
+ q7 *= -(C7Q7); \n\
+ q8 *= -(C7Q8); \n\
+ \n\
+ p0 = q0 + q1; \n\
+ p1 = q2 + q3; \n\
+ p2 = q4 - q3; \n\
+ p3 = -q2 - q4; \n\
+ p4 = q6 + q7; \n\
+ p5 = q8 - q7; \n\
+ p6 = -q8 - q6; \n\
+ p7 = p0 + p1; \n\
+ p8 = p0 + p2; \n\
+ p9 = p0 + p3; \n\
+ q6 = p4 + q5; \n\
+ q7 = p5 + q5; \n\
+ q8 = p6 + q5; \n\
+ \n\
+ *R0 = q0; \n\
+ (*R1).x = p7.x + q6.y; \n\
+ (*R1).y = p7.y - q6.x; \n\
+ (*R2).x = p9.x + q8.y; \n\
+ (*R2).y = p9.y - q8.x; \n\
+ (*R3).x = p8.x - q7.y; \n\
+ (*R3).y = p8.y + q7.x; \n\
+ (*R4).x = p8.x + q7.y; \n\
+ (*R4).y = p8.y - q7.x; \n\
+ (*R5).x = p9.x - q8.y; \n\
+ (*R5).y = p9.y + q8.x; \n\
+ (*R6).x = p7.x - q6.y; \n\
+ (*R6).y = p7.y + q6.x; \n\
+ ";
+
+
+
+ if (!cReg) {
+ for (size_t i = 0; i < 10; i++)
+ bflyStr += regType + " pr" + SztToStr(i) + ", pi" + SztToStr(i) + ";\n\t";
+ for (size_t i = 0; i < 9; i++)
+ bflyStr += regType + " qr" + SztToStr(i) + ", qi" + SztToStr(i) + ";\n\t";
+
+ if (fwd)
+ bflyStr += C7SFR;
+ else
+ bflyStr += C7SBR;
+ } else {
+ for (size_t i = 0; i < 10; i++)
+ bflyStr += regType + " p" + SztToStr(i) + ";\n\t";
+ for (size_t i = 0; i < 9; i++)
+ bflyStr += regType + " q" + SztToStr(i) + ";\n\t";
+ if (fwd)
+ bflyStr += C7SFC;
+ else
+ bflyStr += C7SBC;
+ }
+ }
+ break;
+
case 8:
{
if(fwd)
@@ -1315,8 +1662,11 @@ namespace StockhamGenerator
{
if(cReg)
{
+ if (radix !=7)
+ {
bflyStr += "((*R"; bflyStr += SztToStr(i); bflyStr += ").x) = TR"; bflyStr += SztToStr(i); bflyStr += "; ";
bflyStr += "((*R"; bflyStr += SztToStr(i); bflyStr += ").y) = TI"; bflyStr += SztToStr(i); bflyStr += ";\n\t";
+ }
}
else
{
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 084cf72..506e1a5 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -565,15 +565,23 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
else
{
// This array must be kept sorted in the ascending order
- size_t supported[] = { 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 27, 30, 32, 36, 40,
- 45, 48, 50, 54, 60, 64, 72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135,
- 144, 150, 160, 162, 180, 192, 200, 216, 225, 240, 243, 250, 256, 270, 288,
- 300, 320, 324, 360, 375, 384, 400, 405, 432, 450, 480, 486, 500, 512, 540,
- 576, 600, 625, 640, 648, 675, 720, 729, 750, 768, 800, 810, 864, 900, 960,
- 972, 1000, 1024, 1080, 1125, 1152, 1200, 1215, 1250, 1280, 1296, 1350, 1440,
- 1458, 1500, 1536, 1600, 1620, 1728, 1800, 1875, 1920, 1944, 2000, 2025, 2048,
- 2160, 2187, 2250, 2304, 2400, 2430, 2500, 2560, 2592, 2700, 2880, 2916, 3000,
- 3072, 3125, 3200, 3240, 3375, 3456, 3600, 3645, 3750, 3840, 3888, 4000, 4050, 4096 };
+ size_t supported[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 24, 25, 27, 28,
+ 30, 32, 35, 36, 40, 42, 45, 48, 49, 50, 54, 56, 60, 63, 64, 70, 72, 75, 80,
+ 81, 84, 90, 96, 98, 100, 105, 108, 112, 120, 125, 126, 128, 135, 140, 144,
+ 147, 150, 160, 162, 168, 175, 180, 189, 192, 196, 200, 210, 216, 224, 225,
+ 240, 243, 245, 250, 252, 256, 270, 280, 288, 294, 300, 315, 320, 324, 336,
+ 343, 350, 360, 375, 378, 384, 392, 400, 405, 420, 432, 441, 448, 450, 480,
+ 486, 490, 500, 504, 512, 525, 540, 560, 567, 576, 588, 600, 625, 630, 640,
+ 648, 672, 675, 686, 700, 720, 729, 735, 750, 756, 768, 784, 800, 810, 840,
+ 864, 875, 882, 896, 900, 945, 960, 972, 980, 1000, 1008, 1024, 1029, 1050,
+ 1080, 1120, 1125, 1134, 1152, 1176, 1200, 1215, 1225, 1250, 1260, 1280, 1296,
+ 1323, 1344, 1350, 1372, 1400, 1440, 1458, 1470, 1500, 1512, 1536, 1568, 1575,
+ 1600, 1620, 1680, 1701, 1715, 1728, 1750, 1764, 1792, 1800, 1875, 1890, 1920,
+ 1944, 1960, 2000, 2016, 2025, 2048, 2058, 2100, 2160, 2187, 2205, 2240, 2250,
+ 2268, 2304, 2352, 2400, 2401, 2430, 2450, 2500, 2520, 2560, 2592, 2625, 2646,
+ 2688, 2700, 2744, 2800, 2835, 2880, 2916, 2940, 3000, 3024, 3072, 3087, 3125,
+ 3136, 3150, 3200, 3240, 3360, 3375, 3402, 3430, 3456, 3500, 3528, 3584, 3600,
+ 3645, 3675, 3750, 3780, 3840, 3888, 3920, 3969, 4000, 4032, 4050, 4096};
size_t lenSupported = sizeof(supported)/sizeof(supported[0]);
size_t maxFactoredLength = (supported[lenSupported-1] < Large1DThreshold) ? supported[lenSupported-1] : Large1DThreshold;
diff --git a/src/library/private.h b/src/library/private.h
index fecb84f..2729973 100644
--- a/src/library/private.h
+++ b/src/library/private.h
@@ -150,6 +150,8 @@ inline bool IsASupportedLength( size_t length )
length /= 3;
else if( length % 5 == 0 )
length /= 5;
+ else if( length % 7 == 0 )
+ length /= 7;
else
return false;
}
diff --git a/src/tests/accuracy_test_random.cpp b/src/tests/accuracy_test_random.cpp
index 377776f..3f019c6 100644
--- a/src/tests/accuracy_test_random.cpp
+++ b/src/tests/accuracy_test_random.cpp
@@ -118,6 +118,7 @@ namespace ParameterizedTest {
supported_radices.push_back(2);
supported_radices.push_back(3);
supported_radices.push_back(5);
+ supported_radices.push_back(7);
// total size of this problem should be some fraction of the total space available on the device
size_t this_problem_size = random_int(1, max_problem_size_in_datapoints(precision,layout));
diff --git a/src/tests/unit_test.cpp b/src/tests/unit_test.cpp
index f25829f..e5c761f 100644
--- a/src/tests/unit_test.cpp
+++ b/src/tests/unit_test.cpp
@@ -366,8 +366,8 @@ TEST_F(clfft_UnitTest, setPlanDimLength_should_fail_if_a_length_is_set_to_zero)
}
TEST_F(clfft_UnitTest, setPlanDimLength_should_fail_on_radices_that_have_non_supported_factors) {
- // currently only factors of 2, 3, and 5 are supported
- lengths[0] = 2*3*5*7;
+ // currently only factors of 2, 3, 5, and 7 are supported
+ lengths[0] = 2*3*5*7*11;
EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftSetPlanLength( test_plan, CLFFT_1D, lengths ) );
lengths[0] = 2*2*3*3*5*5*5*5*13;
@@ -516,14 +516,14 @@ TEST_F(clfft_UnitTest, createDefaultPlan_should_fail_when_passed_invalid_dimensi
TEST_F(clfft_UnitTest, createDefaultPlan_should_fail_when_passed_unsupported_length) {
size_t length[3] = {1,1,1};
- length[0] = 7;
+ length[0] = 11;
EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftCreateDefaultPlan( &test_plan, context, CLFFT_1D, length));
length[0] = 13;
length[1] = 1;
EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftCreateDefaultPlan( &test_plan, context, CLFFT_2D, length));
length[0] = 1;
- length[1] = 14;
+ length[1] = 34;
EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftCreateDefaultPlan( &test_plan, context, CLFFT_2D, length));
length[0] = 19;
length[1] = 22;
@@ -537,13 +537,13 @@ TEST_F(clfft_UnitTest, createDefaultPlan_should_fail_when_passed_unsupported_len
length[1] = 17;
length[2] = 1;
EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftCreateDefaultPlan( &test_plan, context, CLFFT_3D, length));
- length[0] = 42;
+ length[0] = 66;
length[1] = 1;
length[2] = 1;
EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftCreateDefaultPlan( &test_plan, context, CLFFT_3D, length));
length[0] = 5;
length[1] = 6;
- length[2] = 7;
+ length[2] = 17;
EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftCreateDefaultPlan( &test_plan, context, CLFFT_3D, length));
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits mailing list