[clfft] 07/128: Precallback - support for sizes > 4096

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:32 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit 815496382a58822e73bd0e28814f2d9888b5b0c7
Author: Pradeep <pradeep.rao at amd.com>
Date:   Tue Jul 28 08:56:47 2015 +0530

    Precallback - support for sizes > 4096
---
 src/client-callback/callback-client.cpp |   2 +-
 src/library/generator.stockham.cpp      | 141 ++++++++++++++++++++++----------
 src/library/generator.transpose.gcn.cpp |  81 +++++++++++++++++-
 src/library/plan.cpp                    |  16 ++++
 4 files changed, 191 insertions(+), 49 deletions(-)

diff --git a/src/client-callback/callback-client.cpp b/src/client-callback/callback-client.cpp
index a1fd832..6bcfbcf 100644
--- a/src/client-callback/callback-client.cpp
+++ b/src/client-callback/callback-client.cpp
@@ -250,7 +250,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 		return 1;
 	}
 
-	if (hasPrecallback && (dim != CLFFT_1D || fftVectorSize > 4096 || sizeof(T) != sizeof(float)))
+	if (hasPrecallback && (dim != CLFFT_1D || sizeof(T) != sizeof(float)))
 	{
 		terr << _T("Pre-callback feature is currently supported only for Single Precision 1D FFT and size upto 4096" ) << std::endl;
 		return 1;
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 610ef93..b4b1c29 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -2985,14 +2985,18 @@ namespace StockhamGenerator
 					{
 						str += "uint ioOffset;\n\t";
 
-						if(inInterleaved)
-						{
-							str += "__global "; str += r2Type; str += " *lwb;\n\n";
-						}
-						else
+						//Skip when blockCompute is enabled and a precallback is set
+						if (!(blockCompute && params.fft_hasPreCallback))
 						{
-							str += "__global "; str += rType; str += " *lwbRe;\n\t";
-							str += "__global "; str += rType; str += " *lwbIm;\n\n";
+							if(inInterleaved)
+							{
+								str += "__global "; str += r2Type; str += " *lwb;\n\n";
+							}
+							else
+							{
+								str += "__global "; str += rType; str += " *lwbRe;\n\t";
+								str += "__global "; str += rType; str += " *lwbIm;\n\n";
+							}
 						}
 					}
 					else
@@ -3000,14 +3004,18 @@ namespace StockhamGenerator
 						str += "uint iOffset;\n\t";
 						str += "uint oOffset;\n\t";
 
-						if(inInterleaved)
+						//Skip when blockCompute is enabled and a precallback is set
+						if (!(blockCompute && params.fft_hasPreCallback))
 						{
-							str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
-						}
-						else
-						{
-							str += "__global "; str += rType; str += " *lwbInRe;\n\t";
-							str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+							if(inInterleaved)
+							{
+								str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
+							}
+							else
+							{
+								str += "__global "; str += rType; str += " *lwbInRe;\n\t";
+								str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+							}
 						}
 
 						if(outInterleaved)
@@ -3154,14 +3162,19 @@ namespace StockhamGenerator
 							str += OffsetCalc("ioOffset", true);
 
 						str += "\t";
-						if(inInterleaved)
-						{
-							str += "lwb = gb + ioOffset;\n\n";
-						}
-						else
+
+						//Skip when blockCompute is enabled and a precallback is set
+						if (!(blockCompute && params.fft_hasPreCallback))
 						{
-							str += "lwbRe = gbRe + ioOffset;\n\t";
-							str += "lwbIm = gbIm + ioOffset;\n\n";
+							if(inInterleaved)
+							{
+								str += "lwb = gb + ioOffset;\n\n";
+							}
+							else
+							{
+								str += "lwbRe = gbRe + ioOffset;\n\t";
+								str += "lwbIm = gbIm + ioOffset;\n\n";
+							}
 						}
 					}
 					else
@@ -3178,14 +3191,19 @@ namespace StockhamGenerator
 						}
 
 						str += "\t";
-						if(inInterleaved)
-						{
-							str += "lwbIn = gbIn + iOffset;\n\t";
-						}
-						else
+
+						//Skip when blockCompute is enabled and a precallback is set
+						if (!(blockCompute && params.fft_hasPreCallback))
 						{
-							str += "lwbInRe = gbInRe + iOffset;\n\t";
-							str += "lwbInIm = gbInIm + iOffset;\n\t";
+							if(inInterleaved)
+							{
+								str += "lwbIn = gbIn + iOffset;\n\t";
+							}
+							else
+							{
+								str += "lwbInRe = gbInRe + iOffset;\n\t";
+								str += "lwbInIm = gbInIm + iOffset;\n\t";
+							}
 						}
 
 						if(outInterleaved)
@@ -3200,6 +3218,18 @@ namespace StockhamGenerator
 					}
 				}
 
+				std::string inOffset;
+				if (!r2c2r)
+				{
+					if (params.fft_placeness == CLFFT_INPLACE)
+					{
+						inOffset += "ioOffset";
+					}
+					else
+					{
+						inOffset += "iOffset";
+					}
+				}
 
 				// Read data into LDS for blocked access
 				if(blockCompute)
@@ -3207,9 +3237,17 @@ namespace StockhamGenerator
 
 					size_t loopCount = (length * blockWidth)/blockWGS;
 					
+					if ((blockComputeType == BCT_C2C) && params.fft_hasPreCallback)
+					{
+						str += "\n\t"; str += r2Type; str += " retCallback;";
+					}
+
 					str += "\n\tfor(uint t=0; t<"; str += SztToStr(loopCount);
 					str += "; t++)\n\t{\n";
 
+					//get offset 
+					std::string bufOffset;
+
 					for(size_t c=0; c<2; c++)
 					{
 						std::string comp = "";
@@ -3220,9 +3258,37 @@ namespace StockhamGenerator
 
 						if( (blockComputeType == BCT_C2C) || (blockComputeType == BCT_C2R) )
 						{
-							str += "\t\tR0"; str+= comp; str+= " = "; str += readBuf; str += "[(me%"; str+= SztToStr(blockWidth); str += ") + ";
-							str += "(me/"; str+= SztToStr(blockWidth); str+= ")*"; str += SztToStr(params.fft_inStride[0]);
-							str += " + t*"; str += SztToStr(params.fft_inStride[0]*blockWGS/blockWidth); str += "];\n";
+							bufOffset.clear();
+							bufOffset += "(me%"; bufOffset += SztToStr(blockWidth); bufOffset += ") + ";
+							bufOffset += "(me/"; bufOffset+= SztToStr(blockWidth); bufOffset+= ")*"; bufOffset += SztToStr(params.fft_inStride[0]);
+							bufOffset += " + t*"; bufOffset += SztToStr(params.fft_inStride[0]*blockWGS/blockWidth);
+
+							if ((blockComputeType == BCT_C2C) && params.fft_hasPreCallback)
+							{
+								if (c == 0)
+								{
+									str += "\t\tretCallback = "; str += params.fft_preCallback.funcname; str += "(";
+								
+									if(inInterleaved)
+									{
+										str += (params.fft_placeness == CLFFT_INPLACE) ? "gb, " : "gbIn, ";
+									}
+									else
+									{
+										str += (params.fft_placeness == CLFFT_INPLACE) ? "gbRe, gbIm, " : "gbInRe, gbInIm, ";
+									}
+
+									str += inOffset; str += " + "; str += bufOffset; str += ", userdata";
+									str += (params.fft_preCallback.localMemSize > 0) ? str += ", localmem);\n" : ");\n";
+								}
+
+								str += "\t\tR0"; str+= comp; str+= " = retCallback"; str+= comp; str += ";\n";
+							}
+							else
+							{
+								str += "\t\tR0"; str+= comp; str+= " = "; 
+								str += readBuf; str += "[";	str += bufOffset; str += "];\n";
+							}
 						}
 						else
 						{
@@ -3318,19 +3384,6 @@ namespace StockhamGenerator
 					str += "\n\tfor(uint t=0; t<2; t++)\n\t{\n\n";
 				}
 
-				std::string inOffset;
-				if (!r2c2r)
-				{
-					if (params.fft_placeness == CLFFT_INPLACE)
-					{
-						inOffset += "ioOffset";
-					}
-					else
-					{
-						inOffset += "iOffset";
-					}
-				}
-
 				// Call passes
 				if(numPasses == 1)
 				{
diff --git a/src/library/generator.transpose.gcn.cpp b/src/library/generator.transpose.gcn.cpp
index c4d6e0b..40736b2 100644
--- a/src/library/generator.transpose.gcn.cpp
+++ b/src/library/generator.transpose.gcn.cpp
@@ -315,6 +315,18 @@ static clfftStatus genTransposePrototype( const FFTGeneratedTransposeGCNAction::
         return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
     }
 
+	if (params.fft_hasPreCallback)
+	{
+		if (params.fft_preCallback.localMemSize > 0)
+		{
+			clKernWrite( transKernel, 0 ) << ", __global void* userdata, __local void* localmem";
+		}
+		else
+		{
+			clKernWrite( transKernel, 0 ) << ", __global void* userdata";
+		}
+	}
+
     // Close the method signature
     clKernWrite( transKernel, 0 ) << " )\n{" << std::endl;
 
@@ -383,6 +395,21 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Sig
     clKernWrite( transKernel, 3 ) << "size_t y;" << std::endl;
     clKernWrite( transKernel, 0 ) << "} Tile;" << std::endl << std::endl;
 
+	//If pre-callback is set for the plan
+	if (params.fft_hasPreCallback)
+	{
+		//If the user supplied a struct definition for the callback, add it to the OpenCL source string
+		if (params.fft_preCallback.userdatastruct != NULL)
+		{
+			clKernWrite( transKernel, 0 ) <<  params.fft_preCallback.userdatastruct;
+			clKernWrite( transKernel, 0 ) << std::endl;
+		}
+
+		//Insert callback function code at the beginning 
+		clKernWrite( transKernel, 0 ) << params.fft_preCallback.funcstring << std::endl;
+		clKernWrite( transKernel, 0 ) << std::endl;
+	}
+
     // This detects whether the input matrix is square
     bool notSquare = ( params.fft_N[ 0 ] == params.fft_N[ 1 ] ) ? false : true;
 
@@ -572,7 +599,11 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Sig
 					clKernWrite( transKernel, 3 ) << "{" << std::endl;
 				}
 
-
+			//If precallback is set
+			if (params.fft_hasPreCallback)
+			{
+				clKernWrite( transKernel, 6 ) << dtComplex << " retCallback;" << std::endl;
+			}
 
 			clKernWrite( transKernel, 6 ) << "for( uint t=0; t < wgUnroll; t++ )" << std::endl;
 			clKernWrite( transKernel, 6 ) << "{" << std::endl;
@@ -631,11 +662,46 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Sig
 			switch( params.fft_inputLayout )
 			{
 			case CLFFT_COMPLEX_INTERLEAVED:
-				clKernWrite( transKernel, 9 ) << "tmp = tileIn[ gInd ];" << std::endl;
+				{
+					if (params.fft_hasPreCallback)
+					{
+						if (params.fft_preCallback.localMemSize > 0)
+						{
+							clKernWrite( transKernel, 9 ) << "retCallback = " << params.fft_preCallback.funcname << "(" << pmComplexIn << ", iOffset + gInd, userdata, localmem);" << std::endl;
+						}
+						else
+						{
+							clKernWrite( transKernel, 9 ) << "retCallback = " << params.fft_preCallback.funcname << "(" << pmComplexIn << ", iOffset + gInd, userdata);" << std::endl;
+						}
+						clKernWrite( transKernel, 9 ) << "tmp = retCallback;" << std::endl;
+					}
+					else
+					{
+						clKernWrite( transKernel, 9 ) << "tmp = tileIn[ gInd ];" << std::endl;
+					}
+				}
 				break;
 			case CLFFT_COMPLEX_PLANAR:
-				clKernWrite( transKernel, 9 ) << "tmp.s0 = realTileIn[ gInd ];" << std::endl;
-				clKernWrite( transKernel, 9 ) << "tmp.s1 = imagTileIn[ gInd ];" << std::endl;
+				{
+					if (params.fft_hasPreCallback)
+					{
+						if (params.fft_preCallback.localMemSize > 0)
+						{
+							clKernWrite( transKernel, 9 ) << "retCallback = " << params.fft_preCallback.funcname << "(" << pmRealIn << ", " << pmImagIn << ", iOffset + gInd, userdata, localmem);" << std::endl;
+						}
+						else
+						{
+							clKernWrite( transKernel, 9 ) << "retCallback = " << params.fft_preCallback.funcname << "(" << pmRealIn << ", " << pmImagIn << ", iOffset + gInd, userdata);" << std::endl;
+						}
+						clKernWrite( transKernel, 9 ) << "tmp.s0 = retCallback.x;" << std::endl;
+						clKernWrite( transKernel, 9 ) << "tmp.s1 = retCallback.y;" << std::endl;
+					}
+					else
+					{
+						clKernWrite( transKernel, 9 ) << "tmp.s0 = realTileIn[ gInd ];" << std::endl;
+						clKernWrite( transKernel, 9 ) << "tmp.s1 = imagTileIn[ gInd ];" << std::endl;
+					}
+				}
 				break;
 			case CLFFT_HERMITIAN_INTERLEAVED:
 			case CLFFT_HERMITIAN_PLANAR:
@@ -918,6 +984,13 @@ clfftStatus FFTGeneratedTransposeGCNAction::initParams ()
     this->signature.fft_R = 1; // Dont think i'll use
     this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
 
+	//Set callback if specified
+	if (this->plan->hasPreCallback)
+	{
+		this->signature.fft_hasPreCallback = true;
+		this->signature.fft_preCallback = this->plan->preCallback;
+	}
+
     return CLFFT_SUCCESS;
 }
 
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index ae06d07..d0a0a84 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -651,6 +651,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					trans1Plan->gen           = Transpose_GCN;
 					trans1Plan->transflag     = true;
 
+					//Set callback data if set on top level plan
+					if (fftPlan->hasPreCallback)
+					{
+						trans1Plan->hasPreCallback = true;
+						trans1Plan->preCallback = fftPlan->preCallback;
+						trans1Plan->precallUserData = fftPlan->precallUserData;
+					}
+
 					OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
 						_T( "BakePlan large1d trans1 plan failed" ) );
 
@@ -1392,6 +1400,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 							trans1Plan->oDist *= fftPlan->length[index];
 						}
 
+						//Set callback data if set on top level plan
+						if (fftPlan->hasPreCallback)
+						{
+							trans1Plan->hasPreCallback = true;
+							trans1Plan->preCallback = fftPlan->preCallback;
+							trans1Plan->precallUserData = fftPlan->precallUserData;
+						}
+
 						OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
 							_T( "BakePlan large1d trans1 plan failed" ) );
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list