[clfft] 85/128: adding radix7 functionality

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:42 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit d6c6127d81b15e05773c2225528c5de632e40367
Author: bragadeesh <bragadeesh.natarajan at amd>
Date:   Wed Sep 23 12:18:04 2015 -0500

    adding radix7 functionality
---
 src/library/generator.stockham.cpp | 132 ++++++++------
 src/library/generator.stockham.h   | 350 +++++++++++++++++++++++++++++++++++++
 src/library/plan.cpp               |  26 ++-
 src/library/private.h              |   2 +
 src/tests/accuracy_test_random.cpp |   1 +
 src/tests/unit_test.cpp            |  12 +-
 6 files changed, 451 insertions(+), 72 deletions(-)

diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 4161279..08028ee 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -385,7 +385,7 @@ namespace StockhamGenerator
 	// Given the length of 1d fft, this function determines the appropriate work group size
 	// and the number of transforms per work group
 	// TODO for optimizations - experiment with different possibilities for work group sizes and num transforms for improving performance
-	void DetermineSizes(const size_t &MAX_WGS, const size_t &length, size_t &workGroupSize, size_t &numTrans)
+	void DetermineSizes(const size_t &MAX_WGS, const size_t &length, size_t &workGroupSize, size_t &numTrans, Precision &pr)
 	{
 		assert(MAX_WGS >= 64);
 
@@ -396,7 +396,7 @@ namespace StockhamGenerator
 			return;
 		}
 
-		size_t baseRadix[] = {5,3,2}; // list only supported primes
+		size_t baseRadix[] = {7,5,3,2}; // list only supported primes
 		size_t baseRadixSize = sizeof(baseRadix)/sizeof(baseRadix[0]);
 
 		size_t l = length;
@@ -418,9 +418,7 @@ namespace StockhamGenerator
 
 		if		(primeFactorsExpanded[2] == length)	// Length is pure power of 2
 		{
-			//if(length == 1024) { workGroupSize = 128;  numTrans = 1; }
 			if		(length >= 1024)	{ workGroupSize = (MAX_WGS >= 256) ? 256 : MAX_WGS; numTrans = 1; }
-			//else if (length == 512)		{ workGroupSize = (MAX_WGS >= 128) ? 128 : MAX_WGS; numTrans = 1; }
 			else if (length == 512)		{ workGroupSize = 64; numTrans = 1; }
 			else if	(length >= 16)		{ workGroupSize = 64;  numTrans = 256/length; }
 			else						{ workGroupSize = 64;  numTrans = 128/length; }
@@ -428,69 +426,79 @@ namespace StockhamGenerator
 		else if	(primeFactorsExpanded[3] == length) // Length is pure power of 3
 		{
 			workGroupSize = (MAX_WGS >= 256) ? 243 : 27;
-			if(length >= 3*workGroupSize)	numTrans = 1;
-			else							numTrans = (3*workGroupSize)/length;
+			numTrans = length >= 3*workGroupSize ? 1 : (3*workGroupSize)/length;
 		}
 		else if	(primeFactorsExpanded[5] == length) // Length is pure power of 5
 		{
 			workGroupSize = (MAX_WGS >= 128) ? 125 : 25;
-			if(length >= 5*workGroupSize)	numTrans = 1;
-			else							numTrans = (5*workGroupSize)/length;
+			numTrans = length >= 5*workGroupSize ? 1 : (5*workGroupSize)/length;
 		}
-		else
+		else if	(primeFactorsExpanded[7] == length) // Length is pure power of 7
 		{
-			size_t leastNumPerWI; // least number of elements in one work item
-			size_t maxWorkGroupSize; // maximum work group size desired
-
-			if		(primeFactorsExpanded[2] * primeFactorsExpanded[3] == length) // Length is mix of 2&3 only
-			{
-				if(!(length%12))	{ leastNumPerWI = 12; maxWorkGroupSize = (MAX_WGS >= 128) ? 128 : MAX_WGS; }
-				else				{ leastNumPerWI = 6;  maxWorkGroupSize = (MAX_WGS >= 256) ? 256 : MAX_WGS; }
-			}
-			else if	(primeFactorsExpanded[2] * primeFactorsExpanded[5] == length) // Length is mix of 2&5 only
-			{
-				if(!(length%20))	{ leastNumPerWI = 20; maxWorkGroupSize = 64; }
-				else				{ leastNumPerWI = 10; maxWorkGroupSize = (MAX_WGS >= 128) ? 128 : MAX_WGS; }
-			}
-			else if (primeFactorsExpanded[3] * primeFactorsExpanded[5] == length) // Length is mix of 3&5 only
-			{
-				leastNumPerWI = 15;
-				maxWorkGroupSize = 64;
-			}
-			else
-			{
-				leastNumPerWI = 30;
-				maxWorkGroupSize = 64;
-			}
-
-
-			// Make sure the work group size does not exceed MAX_WGS
-			// for large problems sizes, this means doing more work per work-item
-			size_t lnpi;
-			size_t ft = 1;
-			while(1)
-			{
-				lnpi = leastNumPerWI * ft++;
-				if(length%lnpi) continue;
-
-				if( (length/lnpi) <= MAX_WGS )
-				{
+			workGroupSize = 49;
+			numTrans = length >= 7*workGroupSize ? 1 : (7*workGroupSize)/length;
+		} else {
+			size_t leastNumPerWI = 1; // least number of elements in one work item
+			size_t maxWorkGroupSize = MAX_WGS; // maximum work group size desired
+
+
+			if        (primeFactorsExpanded[2] * primeFactorsExpanded[3] == length) { 
+				if (length % 12 == 0) { 
+					leastNumPerWI = 12; maxWorkGroupSize = 128;
+				} else { 
+					leastNumPerWI =  6; maxWorkGroupSize = 256;
+				}
+			} else if (primeFactorsExpanded[2] * primeFactorsExpanded[5] == length) { 
+				if (length % 20 == 0) { 
+					leastNumPerWI = 20; maxWorkGroupSize = 64;
+				} else { 
+					leastNumPerWI = 10; maxWorkGroupSize = 128;
+				}
+			} else if (primeFactorsExpanded[2] * primeFactorsExpanded[7] == length) { 
+					leastNumPerWI = 14; maxWorkGroupSize = 64;
+			} else if (primeFactorsExpanded[3] * primeFactorsExpanded[5] == length) { 
+				    leastNumPerWI = 15; maxWorkGroupSize = 128;
+			} else if (primeFactorsExpanded[3] * primeFactorsExpanded[7] == length) { 
+				    leastNumPerWI = 21; maxWorkGroupSize = 128;
+			} else if (primeFactorsExpanded[5] * primeFactorsExpanded[7] == length) { 
+				    leastNumPerWI = 35; maxWorkGroupSize = 64;
+			} else if (primeFactorsExpanded[2] * primeFactorsExpanded[3] * primeFactorsExpanded[5] == length) { 
+				    leastNumPerWI = 30; maxWorkGroupSize = 64;
+			} else if (primeFactorsExpanded[2] * primeFactorsExpanded[3] * primeFactorsExpanded[7] == length) { 
+				    leastNumPerWI = 42; maxWorkGroupSize = 60;
+			} else if (primeFactorsExpanded[2] * primeFactorsExpanded[5] * primeFactorsExpanded[7] == length) { 
+				    leastNumPerWI = 70; maxWorkGroupSize = 36;
+			} else if (primeFactorsExpanded[3] * primeFactorsExpanded[5] * primeFactorsExpanded[7] == length) { 
+				    leastNumPerWI =105; maxWorkGroupSize = 24;
+			} else { 
+				    leastNumPerWI =210; maxWorkGroupSize = 12;
+			}
+			if (pr==P_DOUBLE)
+			{
+				//leastNumPerWI /= 2; 
+				maxWorkGroupSize /= 2;
+			}
+			
+
+			if (maxWorkGroupSize > MAX_WGS)
+				maxWorkGroupSize = MAX_WGS;
+			assert (leastNumPerWI > 0 && length % leastNumPerWI == 0);
+
+			for (size_t lnpi = leastNumPerWI; lnpi <= length; lnpi += leastNumPerWI) {
+				if (length % lnpi != 0) continue;
+
+				if (length / lnpi <= MAX_WGS) {
 					leastNumPerWI = lnpi;
 					break;
 				}
 			}
 
-			numTrans = 1;
-			size_t n=1;
-			while( ((n*length)/leastNumPerWI) <= maxWorkGroupSize )
-			{
-				numTrans = n;
-				n++;
-			}
-
-			workGroupSize = (numTrans*length)/leastNumPerWI;
-			assert(workGroupSize <= MAX_WGS);
+			numTrans = maxWorkGroupSize / (length / leastNumPerWI);
+			numTrans = numTrans < 1 ? 1 : numTrans;
+			workGroupSize = numTrans * (length / leastNumPerWI);
 		}
+
+		assert(workGroupSize <= MAX_WGS);
 	}
 
 	// Twiddle factors table
@@ -2499,7 +2507,7 @@ namespace StockhamGenerator
 			else
 			{
 				// Possible radices
-				size_t cRad[] = {10,8,6,5,4,3,2,1}; // Must be in descending order
+				size_t cRad[] = {10,8,7,6,5,4,3,2,1}; // Must be in descending order
 				size_t cRadSize = (sizeof(cRad)/sizeof(cRad[0]));
 
 				// Generate the radix and pass objects
@@ -2713,6 +2721,16 @@ namespace StockhamGenerator
 
 			str += "#define C3QA 0.50000000000000000000000000000000"; str += sfx; str += "\n";
 			str += "#define C3QB 0.86602540378443864676372317075294"; str += sfx; str += "\n";
+
+			str += "#define C7Q1 -1.16666666666666651863693004997913"; str += sfx; str += "\n";
+			str += "#define C7Q2  0.79015646852540022404554065360571"; str += sfx; str += "\n";
+			str += "#define C7Q3  0.05585426728964774240049351305970"; str += sfx; str += "\n";
+			str += "#define C7Q4  0.73430220123575240531721419756650"; str += sfx; str += "\n";
+			str += "#define C7Q5  0.44095855184409837868031445395900"; str += sfx; str += "\n";
+			str += "#define C7Q6  0.34087293062393136944265847887436"; str += sfx; str += "\n";
+			str += "#define C7Q7 -0.53396936033772524066165487965918"; str += sfx; str += "\n";
+			str += "#define C7Q8  0.87484229096165666561546458979137"; str += sfx; str += "\n";
+
 			str += "\n";
 
 			bool cReg = linearRegs ? true : false;
@@ -3609,7 +3627,7 @@ clfftStatus FFTGeneratedStockhamAction::initParams ()
 		nt = t_nt;
 	}
 	else
-		DetermineSizes(this->plan->envelope.limit_WorkGroupSize, this->signature.fft_N[0], wgs, nt);
+		DetermineSizes(this->plan->envelope.limit_WorkGroupSize, this->signature.fft_N[0], wgs, nt, pr);
 #endif
 
 	assert((nt * this->signature.fft_N[0]) >= wgs);
diff --git a/src/library/generator.stockham.h b/src/library/generator.stockham.h
index 201e4a0..792037d 100644
--- a/src/library/generator.stockham.h
+++ b/src/library/generator.stockham.h
@@ -886,6 +886,353 @@ namespace StockhamGenerator
 						}
 					}
 				} break;
+			case 7:
+				{
+					static const char *C7SFR = "\
+					/*FFT7 Forward Real */ \n\
+					\n\
+						pr0 = *R1 + *R6; \n\
+						pi0 = *I1 + *I6; \n\
+						pr1 = *R1 - *R6; \n\
+						pi1 = *I1 - *I6; \n\
+						pr2 = *R2 + *R5; \n\
+						pi2 = *I2 + *I5; \n\
+						pr3 = *R2 - *R5; \n\
+						pi3 = *I2 - *I5; \n\
+						pr4 = *R4 + *R3; \n\
+						pi4 = *I4 + *I3; \n\
+						pr5 = *R4 - *R3; \n\
+						pi5 = *I4 - *I3; \n\
+					\n\
+						pr6 = pr2 + pr0; \n\
+						pi6 = pi2 + pi0; \n\
+						qr4 = pr2 - pr0; \n\
+						qi4 = pi2 - pi0; \n\
+						qr2 = pr0 - pr4; \n\
+						qi2 = pi0 - pi4; \n\
+						qr3 = pr4 - pr2; \n\
+						qi3 = pi4 - pi2; \n\
+						pr7 = pr5 + pr3; \n\
+						pi7 = pi5 + pi3; \n\
+						qr7 = pr5 - pr3; \n\
+						qi7 = pi5 - pi3; \n\
+						qr6 = pr1 - pr5; \n\
+						qi6 = pi1 - pi5; \n\
+						qr8 = pr3 - pr1; \n\
+						qi8 = pi3 - pi1; \n\
+						qr1 = pr6 + pr4; \n\
+						qi1 = pi6 + pi4; \n\
+						qr5 = pr7 + pr1; \n\
+						qi5 = pi7 + pi1; \n\
+						qr0 = *R0 + qr1; \n\
+						qi0 = *I0 + qi1; \n\
+					\n\
+						qr1 *= C7Q1; \n\
+						qi1 *= C7Q1; \n\
+						qr2 *= C7Q2; \n\
+						qi2 *= C7Q2; \n\
+						qr3 *= C7Q3; \n\
+						qi3 *= C7Q3; \n\
+						qr4 *= C7Q4; \n\
+						qi4 *= C7Q4; \n\
+					\n\
+						qr5 *= (C7Q5); \n\
+						qi5 *= (C7Q5); \n\
+						qr6 *= (C7Q6); \n\
+						qi6 *= (C7Q6); \n\
+						qr7 *= (C7Q7); \n\
+						qi7 *= (C7Q7); \n\
+						qr8 *= (C7Q8); \n\
+						qi8 *= (C7Q8); \n\
+					\n\
+						pr0 =  qr0 + qr1; \n\
+						pi0 =  qi0 + qi1; \n\
+						pr1 =  qr2 + qr3; \n\
+						pi1 =  qi2 + qi3; \n\
+						pr2 =  qr4 - qr3; \n\
+						pi2 =  qi4 - qi3; \n\
+						pr3 = -qr2 - qr4; \n\
+						pi3 = -qi2 - qi4; \n\
+						pr4 =  qr6 + qr7; \n\
+						pi4 =  qi6 + qi7; \n\
+						pr5 =  qr8 - qr7; \n\
+						pi5 =  qi8 - qi7; \n\
+						pr6 = -qr8 - qr6; \n\
+						pi6 = -qi8 - qi6; \n\
+						pr7 =  pr0 + pr1; \n\
+						pi7 =  pi0 + pi1; \n\
+						pr8 =  pr0 + pr2; \n\
+						pi8 =  pi0 + pi2; \n\
+						pr9 =  pr0 + pr3; \n\
+						pi9 =  pi0 + pi3; \n\
+						qr6 =  pr4 + qr5; \n\
+						qi6 =  pi4 + qi5; \n\
+						qr7 =  pr5 + qr5; \n\
+						qi7 =  pi5 + qi5; \n\
+						qr8 =  pr6 + qr5; \n\
+						qi8 =  pi6 + qi5; \n\
+					\n\
+						TR0 = qr0; TI0 = qi0; \n\
+						TR1 = pr7 + qi6; \n\
+						TI1 = pi7 - qr6; \n\
+						TR2 = pr9 + qi8; \n\
+						TI2 = pi9 - qr8; \n\
+						TR3 = pr8 - qi7; \n\
+						TI3 = pi8 + qr7; \n\
+						TR4 = pr8 + qi7; \n\
+						TI4 = pi8 - qr7; \n\
+						TR5 = pr9 - qi8; \n\
+						TI5 = pi9 + qr8; \n\
+						TR6 = pr7 - qi6; \n\
+						TI6 = pi7 + qr6; \n\
+					";
+
+					static const char *C7SBR = "\
+					/*FFT7 Backward Real */ \n\
+					\n\
+						pr0 = *R1 + *R6; \n\
+						pi0 = *I1 + *I6; \n\
+						pr1 = *R1 - *R6; \n\
+						pi1 = *I1 - *I6; \n\
+						pr2 = *R2 + *R5; \n\
+						pi2 = *I2 + *I5; \n\
+						pr3 = *R2 - *R5; \n\
+						pi3 = *I2 - *I5; \n\
+						pr4 = *R4 + *R3; \n\
+						pi4 = *I4 + *I3; \n\
+						pr5 = *R4 - *R3; \n\
+						pi5 = *I4 - *I3; \n\
+					\n\
+						pr6 = pr2 + pr0; \n\
+						pi6 = pi2 + pi0; \n\
+						qr4 = pr2 - pr0; \n\
+						qi4 = pi2 - pi0; \n\
+						qr2 = pr0 - pr4; \n\
+						qi2 = pi0 - pi4; \n\
+						qr3 = pr4 - pr2; \n\
+						qi3 = pi4 - pi2; \n\
+						pr7 = pr5 + pr3; \n\
+						pi7 = pi5 + pi3; \n\
+						qr7 = pr5 - pr3; \n\
+						qi7 = pi5 - pi3; \n\
+						qr6 = pr1 - pr5; \n\
+						qi6 = pi1 - pi5; \n\
+						qr8 = pr3 - pr1; \n\
+						qi8 = pi3 - pi1; \n\
+						qr1 = pr6 + pr4; \n\
+						qi1 = pi6 + pi4; \n\
+						qr5 = pr7 + pr1; \n\
+						qi5 = pi7 + pi1; \n\
+						qr0 = *R0 + qr1; \n\
+						qi0 = *I0 + qi1; \n\
+					\n\
+						qr1 *= C7Q1; \n\
+						qi1 *= C7Q1; \n\
+						qr2 *= C7Q2; \n\
+						qi2 *= C7Q2; \n\
+						qr3 *= C7Q3; \n\
+						qi3 *= C7Q3; \n\
+						qr4 *= C7Q4; \n\
+						qi4 *= C7Q4; \n\
+					\n\
+						qr5 *= -(C7Q5); \n\
+						qi5 *= -(C7Q5); \n\
+						qr6 *= -(C7Q6); \n\
+						qi6 *= -(C7Q6); \n\
+						qr7 *= -(C7Q7); \n\
+						qi7 *= -(C7Q7); \n\
+						qr8 *= -(C7Q8); \n\
+						qi8 *= -(C7Q8); \n\
+					\n\
+						pr0 =  qr0 + qr1; \n\
+						pi0 =  qi0 + qi1; \n\
+						pr1 =  qr2 + qr3; \n\
+						pi1 =  qi2 + qi3; \n\
+						pr2 =  qr4 - qr3; \n\
+						pi2 =  qi4 - qi3; \n\
+						pr3 = -qr2 - qr4; \n\
+						pi3 = -qi2 - qi4; \n\
+						pr4 =  qr6 + qr7; \n\
+						pi4 =  qi6 + qi7; \n\
+						pr5 =  qr8 - qr7; \n\
+						pi5 =  qi8 - qi7; \n\
+						pr6 = -qr8 - qr6; \n\
+						pi6 = -qi8 - qi6; \n\
+						pr7 =  pr0 + pr1; \n\
+						pi7 =  pi0 + pi1; \n\
+						pr8 =  pr0 + pr2; \n\
+						pi8 =  pi0 + pi2; \n\
+						pr9 =  pr0 + pr3; \n\
+						pi9 =  pi0 + pi3; \n\
+						qr6 =  pr4 + qr5; \n\
+						qi6 =  pi4 + qi5; \n\
+						qr7 =  pr5 + qr5; \n\
+						qi7 =  pi5 + qi5; \n\
+						qr8 =  pr6 + qr5; \n\
+						qi8 =  pi6 + qi5; \n\
+					\n\
+						TR0 = qr0; TI0 = qi0; \n\
+						TR1 = pr7 + qi6; \n\
+						TI1 = pi7 - qr6; \n\
+						TR2 = pr9 + qi8; \n\
+						TI2 = pi9 - qr8; \n\
+						TR3 = pr8 - qi7; \n\
+						TI3 = pi8 + qr7; \n\
+						TR4 = pr8 + qi7; \n\
+						TI4 = pi8 - qr7; \n\
+						TR5 = pr9 - qi8; \n\
+						TI5 = pi9 + qr8; \n\
+						TR6 = pr7 - qi6; \n\
+						TI6 = pi7 + qr6; \n\
+					";
+
+					static const char *C7SFC = "\
+					/*FFT7 Forward Complex */ \n\
+					\n\
+						p0 = *R1 + *R6; \n\
+						p1 = *R1 - *R6; \n\
+						p2 = *R2 + *R5; \n\
+						p3 = *R2 - *R5; \n\
+						p4 = *R4 + *R3; \n\
+						p5 = *R4 - *R3; \n\
+					\n\
+						p6 = p2 + p0; \n\
+						q4 = p2 - p0; \n\
+						q2 = p0 - p4; \n\
+						q3 = p4 - p2; \n\
+						p7 = p5 + p3; \n\
+						q7 = p5 - p3; \n\
+						q6 = p1 - p5; \n\
+						q8 = p3 - p1; \n\
+						q1 = p6 + p4; \n\
+						q5 = p7 + p1; \n\
+						q0 = *R0 + q1; \n\
+					\n\
+						q1 *= C7Q1; \n\
+						q2 *= C7Q2; \n\
+						q3 *= C7Q3; \n\
+						q4 *= C7Q4; \n\
+					\n\
+						q5 *= (C7Q5); \n\
+						q6 *= (C7Q6); \n\
+						q7 *= (C7Q7); \n\
+						q8 *= (C7Q8); \n\
+					\n\
+						p0 = q0 + q1; \n\
+						p1 = q2 + q3; \n\
+						p2 = q4 - q3; \n\
+						p3 = -q2 - q4; \n\
+						p4 = q6 + q7; \n\
+						p5 = q8 - q7; \n\
+						p6 = -q8 - q6; \n\
+						p7 = p0 + p1; \n\
+						p8 = p0 + p2; \n\
+						p9 = p0 + p3; \n\
+						q6 = p4 + q5; \n\
+						q7 = p5 + q5; \n\
+						q8 = p6 + q5; \n\
+					\n\
+						*R0 = q0; \n\
+						(*R1).x = p7.x + q6.y; \n\
+						(*R1).y = p7.y - q6.x; \n\
+						(*R2).x = p9.x + q8.y; \n\
+						(*R2).y = p9.y - q8.x; \n\
+						(*R3).x = p8.x - q7.y; \n\
+						(*R3).y = p8.y + q7.x; \n\
+						(*R4).x = p8.x + q7.y; \n\
+						(*R4).y = p8.y - q7.x; \n\
+						(*R5).x = p9.x - q8.y; \n\
+						(*R5).y = p9.y + q8.x; \n\
+						(*R6).x = p7.x - q6.y; \n\
+						(*R6).y = p7.y + q6.x; \n\
+					";
+
+					static const char *C7SBC = "\
+					/*FFT7 Backward Complex */ \n\
+					\n\
+						p0 = *R1 + *R6; \n\
+						p1 = *R1 - *R6; \n\
+						p2 = *R2 + *R5; \n\
+						p3 = *R2 - *R5; \n\
+						p4 = *R4 + *R3; \n\
+						p5 = *R4 - *R3; \n\
+					\n\
+						p6 = p2 + p0; \n\
+						q4 = p2 - p0; \n\
+						q2 = p0 - p4; \n\
+						q3 = p4 - p2; \n\
+						p7 = p5 + p3; \n\
+						q7 = p5 - p3; \n\
+						q6 = p1 - p5; \n\
+						q8 = p3 - p1; \n\
+						q1 = p6 + p4; \n\
+						q5 = p7 + p1; \n\
+						q0 = *R0 + q1; \n\
+					\n\
+						q1 *= C7Q1; \n\
+						q2 *= C7Q2; \n\
+						q3 *= C7Q3; \n\
+						q4 *= C7Q4; \n\
+					\n\
+						q5 *= -(C7Q5); \n\
+						q6 *= -(C7Q6); \n\
+						q7 *= -(C7Q7); \n\
+						q8 *= -(C7Q8); \n\
+					\n\
+						p0 = q0 + q1; \n\
+						p1 = q2 + q3; \n\
+						p2 = q4 - q3; \n\
+						p3 = -q2 - q4; \n\
+						p4 = q6 + q7; \n\
+						p5 = q8 - q7; \n\
+						p6 = -q8 - q6; \n\
+						p7 = p0 + p1; \n\
+						p8 = p0 + p2; \n\
+						p9 = p0 + p3; \n\
+						q6 = p4 + q5; \n\
+						q7 = p5 + q5; \n\
+						q8 = p6 + q5; \n\
+					\n\
+						*R0 = q0; \n\
+						(*R1).x = p7.x + q6.y; \n\
+						(*R1).y = p7.y - q6.x; \n\
+						(*R2).x = p9.x + q8.y; \n\
+						(*R2).y = p9.y - q8.x; \n\
+						(*R3).x = p8.x - q7.y; \n\
+						(*R3).y = p8.y + q7.x; \n\
+						(*R4).x = p8.x + q7.y; \n\
+						(*R4).y = p8.y - q7.x; \n\
+						(*R5).x = p9.x - q8.y; \n\
+						(*R5).y = p9.y + q8.x; \n\
+						(*R6).x = p7.x - q6.y; \n\
+						(*R6).y = p7.y + q6.x; \n\
+					";
+
+
+
+					if (!cReg) {
+						for (size_t i = 0; i < 10; i++)
+							bflyStr += regType + " pr" + SztToStr(i) + ", pi" + SztToStr(i) + ";\n\t";
+						for (size_t i = 0; i < 9; i++)
+							bflyStr += regType + " qr" + SztToStr(i) + ", qi" + SztToStr(i) + ";\n\t";
+
+						if (fwd)
+							bflyStr += C7SFR;
+						else
+							bflyStr += C7SBR;
+					} else {
+						for (size_t i = 0; i < 10; i++)
+							bflyStr += regType + " p" + SztToStr(i) + ";\n\t";
+						for (size_t i = 0; i < 9; i++)
+							bflyStr += regType + " q" + SztToStr(i) + ";\n\t";
+						if (fwd)
+							bflyStr += C7SFC;
+						else
+							bflyStr += C7SBC;
+					}
+				}
+				break;
+
 			case 8:
 				{
 					if(fwd)
@@ -1315,8 +1662,11 @@ namespace StockhamGenerator
 				{
 					if(cReg)
 					{
+						if (radix !=7) 
+						{
 						bflyStr += "((*R"; bflyStr += SztToStr(i); bflyStr += ").x) = TR"; bflyStr += SztToStr(i); bflyStr += "; ";
 						bflyStr += "((*R"; bflyStr += SztToStr(i); bflyStr += ").y) = TI"; bflyStr += SztToStr(i); bflyStr += ";\n\t";
+						}
 					}
 					else
 					{
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 084cf72..506e1a5 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -565,15 +565,23 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				else
 				{
 					// This array must be kept sorted in the ascending order
-					size_t supported[] = {	1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 27, 30, 32, 36, 40,
-											45, 48, 50, 54, 60, 64, 72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135,
-											144, 150, 160, 162, 180, 192, 200, 216, 225, 240, 243, 250, 256, 270, 288,
-											300, 320, 324, 360, 375, 384, 400, 405, 432, 450, 480, 486, 500, 512, 540,
-											576, 600, 625, 640, 648, 675, 720, 729, 750, 768, 800, 810, 864, 900, 960,
-											972, 1000, 1024, 1080, 1125, 1152, 1200, 1215, 1250, 1280, 1296, 1350, 1440,
-											1458, 1500, 1536, 1600, 1620, 1728, 1800, 1875, 1920, 1944, 2000, 2025, 2048,
-											2160, 2187, 2250, 2304, 2400, 2430, 2500, 2560, 2592, 2700, 2880, 2916, 3000,
-											3072, 3125, 3200, 3240, 3375, 3456, 3600, 3645, 3750, 3840, 3888, 4000, 4050, 4096 };
+					size_t supported[] = {	1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 24, 25, 27, 28,
+											30, 32, 35, 36, 40, 42, 45, 48, 49, 50, 54, 56, 60, 63, 64, 70, 72, 75, 80,
+											81, 84, 90, 96, 98, 100, 105, 108, 112, 120, 125, 126, 128, 135, 140, 144,
+											147, 150, 160, 162, 168, 175, 180, 189, 192, 196, 200, 210, 216, 224, 225,
+											240, 243, 245, 250, 252, 256, 270, 280, 288, 294, 300, 315, 320, 324, 336,
+											343, 350, 360, 375, 378, 384, 392, 400, 405, 420, 432, 441, 448, 450, 480,
+											486, 490, 500, 504, 512, 525, 540, 560, 567, 576, 588, 600, 625, 630, 640,
+											648, 672, 675, 686, 700, 720, 729, 735, 750, 756, 768, 784, 800, 810, 840,
+											864, 875, 882, 896, 900, 945, 960, 972, 980, 1000, 1008, 1024, 1029, 1050,
+											1080, 1120, 1125, 1134, 1152, 1176, 1200, 1215, 1225, 1250, 1260, 1280, 1296,
+											1323, 1344, 1350, 1372, 1400, 1440, 1458, 1470, 1500, 1512, 1536, 1568, 1575,
+											1600, 1620, 1680, 1701, 1715, 1728, 1750, 1764, 1792, 1800, 1875, 1890, 1920,
+											1944, 1960, 2000, 2016, 2025, 2048, 2058, 2100, 2160, 2187, 2205, 2240, 2250,
+											2268, 2304, 2352, 2400, 2401, 2430, 2450, 2500, 2520, 2560, 2592, 2625, 2646,
+											2688, 2700, 2744, 2800, 2835, 2880, 2916, 2940, 3000, 3024, 3072, 3087, 3125,
+											3136, 3150, 3200, 3240, 3360, 3375, 3402, 3430, 3456, 3500, 3528, 3584, 3600,
+											3645, 3675, 3750, 3780, 3840, 3888, 3920, 3969, 4000, 4032, 4050, 4096};
 
 					size_t lenSupported = sizeof(supported)/sizeof(supported[0]);
 					size_t maxFactoredLength = (supported[lenSupported-1] < Large1DThreshold) ? supported[lenSupported-1] : Large1DThreshold;
diff --git a/src/library/private.h b/src/library/private.h
index fecb84f..2729973 100644
--- a/src/library/private.h
+++ b/src/library/private.h
@@ -150,6 +150,8 @@ inline bool IsASupportedLength( size_t length )
 			length /= 3;
 		else if( length % 5 == 0 )
 			length /= 5;
+		else if( length % 7 == 0 )
+			length /= 7;
 		else
 			return false;
 	}
diff --git a/src/tests/accuracy_test_random.cpp b/src/tests/accuracy_test_random.cpp
index 377776f..3f019c6 100644
--- a/src/tests/accuracy_test_random.cpp
+++ b/src/tests/accuracy_test_random.cpp
@@ -118,6 +118,7 @@ namespace ParameterizedTest {
 		supported_radices.push_back(2);
 		supported_radices.push_back(3);
 		supported_radices.push_back(5);
+		supported_radices.push_back(7);
 
 		// total size of this problem should be some fraction of the total space available on the device
 		size_t this_problem_size = random_int(1, max_problem_size_in_datapoints(precision,layout));
diff --git a/src/tests/unit_test.cpp b/src/tests/unit_test.cpp
index f25829f..e5c761f 100644
--- a/src/tests/unit_test.cpp
+++ b/src/tests/unit_test.cpp
@@ -366,8 +366,8 @@ TEST_F(clfft_UnitTest, setPlanDimLength_should_fail_if_a_length_is_set_to_zero)
 }
 
 TEST_F(clfft_UnitTest, setPlanDimLength_should_fail_on_radices_that_have_non_supported_factors) {
-	// currently only factors of 2, 3, and 5 are supported
-	lengths[0] = 2*3*5*7;
+	// currently only factors of 2, 3, 5, and 7 are supported
+	lengths[0] = 2*3*5*7*11;
 	EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftSetPlanLength( test_plan, CLFFT_1D, lengths ) );
 
 	lengths[0] = 2*2*3*3*5*5*5*5*13;
@@ -516,14 +516,14 @@ TEST_F(clfft_UnitTest, createDefaultPlan_should_fail_when_passed_invalid_dimensi
 TEST_F(clfft_UnitTest, createDefaultPlan_should_fail_when_passed_unsupported_length) {
     size_t length[3] = {1,1,1};
 
-    length[0] = 7;
+    length[0] = 11;
 	EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftCreateDefaultPlan( &test_plan, context, CLFFT_1D, length));
 
     length[0] = 13;
     length[1] = 1;
 	EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftCreateDefaultPlan( &test_plan, context, CLFFT_2D, length));
     length[0] = 1;
-    length[1] = 14;
+    length[1] = 34;
 	EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftCreateDefaultPlan( &test_plan, context, CLFFT_2D, length));
     length[0] = 19;
     length[1] = 22;
@@ -537,13 +537,13 @@ TEST_F(clfft_UnitTest, createDefaultPlan_should_fail_when_passed_unsupported_len
     length[1] = 17;
     length[2] = 1;
 	EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftCreateDefaultPlan( &test_plan, context, CLFFT_3D, length));
-    length[0] = 42;
+    length[0] = 66;
     length[1] = 1;
     length[2] = 1;
 	EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftCreateDefaultPlan( &test_plan, context, CLFFT_3D, length));
     length[0] = 5;
     length[1] = 6;
-    length[2] = 7;
+    length[2] = 17;
 	EXPECT_EQ( CLFFT_NOTIMPLEMENTED, clfftCreateDefaultPlan( &test_plan, context, CLFFT_3D, length));
 }
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list