[clfft] 07/128: Precallback - support for sizes > 4096
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:32 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 815496382a58822e73bd0e28814f2d9888b5b0c7
Author: Pradeep <pradeep.rao at amd.com>
Date: Tue Jul 28 08:56:47 2015 +0530
Precallback - support for sizes > 4096
---
src/client-callback/callback-client.cpp | 2 +-
src/library/generator.stockham.cpp | 141 ++++++++++++++++++++++----------
src/library/generator.transpose.gcn.cpp | 81 +++++++++++++++++-
src/library/plan.cpp | 16 ++++
4 files changed, 191 insertions(+), 49 deletions(-)
diff --git a/src/client-callback/callback-client.cpp b/src/client-callback/callback-client.cpp
index a1fd832..6bcfbcf 100644
--- a/src/client-callback/callback-client.cpp
+++ b/src/client-callback/callback-client.cpp
@@ -250,7 +250,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
return 1;
}
- if (hasPrecallback && (dim != CLFFT_1D || fftVectorSize > 4096 || sizeof(T) != sizeof(float)))
+ if (hasPrecallback && (dim != CLFFT_1D || sizeof(T) != sizeof(float)))
{
terr << _T("Pre-callback feature is currently supported only for Single Precision 1D FFT and size upto 4096" ) << std::endl;
return 1;
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 610ef93..b4b1c29 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -2985,14 +2985,18 @@ namespace StockhamGenerator
{
str += "uint ioOffset;\n\t";
- if(inInterleaved)
- {
- str += "__global "; str += r2Type; str += " *lwb;\n\n";
- }
- else
+ //Skip if precallback is set and its blockcompute
+ if (!(blockCompute && params.fft_hasPreCallback))
{
- str += "__global "; str += rType; str += " *lwbRe;\n\t";
- str += "__global "; str += rType; str += " *lwbIm;\n\n";
+ if(inInterleaved)
+ {
+ str += "__global "; str += r2Type; str += " *lwb;\n\n";
+ }
+ else
+ {
+ str += "__global "; str += rType; str += " *lwbRe;\n\t";
+ str += "__global "; str += rType; str += " *lwbIm;\n\n";
+ }
}
}
else
@@ -3000,14 +3004,18 @@ namespace StockhamGenerator
str += "uint iOffset;\n\t";
str += "uint oOffset;\n\t";
- if(inInterleaved)
+ //Skip if precallback is set and its blockcompute
+ if (!(blockCompute && params.fft_hasPreCallback))
{
- str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
- }
- else
- {
- str += "__global "; str += rType; str += " *lwbInRe;\n\t";
- str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+ if(inInterleaved)
+ {
+ str += "__global "; str += r2Type; str += " *lwbIn;\n\t";
+ }
+ else
+ {
+ str += "__global "; str += rType; str += " *lwbInRe;\n\t";
+ str += "__global "; str += rType; str += " *lwbInIm;\n\t";
+ }
}
if(outInterleaved)
@@ -3154,14 +3162,19 @@ namespace StockhamGenerator
str += OffsetCalc("ioOffset", true);
str += "\t";
- if(inInterleaved)
- {
- str += "lwb = gb + ioOffset;\n\n";
- }
- else
+
+ //Skip if precallback is set and its blockcompute
+ if (!(blockCompute && params.fft_hasPreCallback))
{
- str += "lwbRe = gbRe + ioOffset;\n\t";
- str += "lwbIm = gbIm + ioOffset;\n\n";
+ if(inInterleaved)
+ {
+ str += "lwb = gb + ioOffset;\n\n";
+ }
+ else
+ {
+ str += "lwbRe = gbRe + ioOffset;\n\t";
+ str += "lwbIm = gbIm + ioOffset;\n\n";
+ }
}
}
else
@@ -3178,14 +3191,19 @@ namespace StockhamGenerator
}
str += "\t";
- if(inInterleaved)
- {
- str += "lwbIn = gbIn + iOffset;\n\t";
- }
- else
+
+ //Skip if precallback is set and its blockcompute
+ if (!(blockCompute && params.fft_hasPreCallback))
{
- str += "lwbInRe = gbInRe + iOffset;\n\t";
- str += "lwbInIm = gbInIm + iOffset;\n\t";
+ if(inInterleaved)
+ {
+ str += "lwbIn = gbIn + iOffset;\n\t";
+ }
+ else
+ {
+ str += "lwbInRe = gbInRe + iOffset;\n\t";
+ str += "lwbInIm = gbInIm + iOffset;\n\t";
+ }
}
if(outInterleaved)
@@ -3200,6 +3218,18 @@ namespace StockhamGenerator
}
}
+ std::string inOffset;
+ if (!r2c2r)
+ {
+ if (params.fft_placeness == CLFFT_INPLACE)
+ {
+ inOffset += "ioOffset";
+ }
+ else
+ {
+ inOffset += "iOffset";
+ }
+ }
// Read data into LDS for blocked access
if(blockCompute)
@@ -3207,9 +3237,17 @@ namespace StockhamGenerator
size_t loopCount = (length * blockWidth)/blockWGS;
+ if ((blockComputeType == BCT_C2C) && params.fft_hasPreCallback)
+ {
+ str += "\n\t"; str += r2Type; str += " retCallback;";
+ }
+
str += "\n\tfor(uint t=0; t<"; str += SztToStr(loopCount);
str += "; t++)\n\t{\n";
+ //get offset
+ std::string bufOffset;
+
for(size_t c=0; c<2; c++)
{
std::string comp = "";
@@ -3220,9 +3258,37 @@ namespace StockhamGenerator
if( (blockComputeType == BCT_C2C) || (blockComputeType == BCT_C2R) )
{
- str += "\t\tR0"; str+= comp; str+= " = "; str += readBuf; str += "[(me%"; str+= SztToStr(blockWidth); str += ") + ";
- str += "(me/"; str+= SztToStr(blockWidth); str+= ")*"; str += SztToStr(params.fft_inStride[0]);
- str += " + t*"; str += SztToStr(params.fft_inStride[0]*blockWGS/blockWidth); str += "];\n";
+ bufOffset.clear();
+ bufOffset += "(me%"; bufOffset += SztToStr(blockWidth); bufOffset += ") + ";
+ bufOffset += "(me/"; bufOffset+= SztToStr(blockWidth); bufOffset+= ")*"; bufOffset += SztToStr(params.fft_inStride[0]);
+ bufOffset += " + t*"; bufOffset += SztToStr(params.fft_inStride[0]*blockWGS/blockWidth);
+
+ if ((blockComputeType == BCT_C2C) && params.fft_hasPreCallback)
+ {
+ if (c == 0)
+ {
+ str += "\t\tretCallback = "; str += params.fft_preCallback.funcname; str += "(";
+
+ if(inInterleaved)
+ {
+ str += (params.fft_placeness == CLFFT_INPLACE) ? "gb, " : "gbIn, ";
+ }
+ else
+ {
+ str += (params.fft_placeness == CLFFT_INPLACE) ? "gbRe, gbIm, " : "gbInRe, gbInIm, ";
+ }
+
+ str += inOffset; str += " + "; str += bufOffset; str += ", userdata";
+ str += (params.fft_preCallback.localMemSize > 0) ? ", localmem);\n" : ");\n"; // plain literals only: a nested 'str +=' here would append str to itself
+ }
+
+ str += "\t\tR0"; str+= comp; str+= " = retCallback"; str+= comp; str += ";\n";
+ }
+ else
+ {
+ str += "\t\tR0"; str+= comp; str+= " = ";
+ str += readBuf; str += "["; str += bufOffset; str += "];\n";
+ }
}
else
{
@@ -3318,19 +3384,6 @@ namespace StockhamGenerator
str += "\n\tfor(uint t=0; t<2; t++)\n\t{\n\n";
}
- std::string inOffset;
- if (!r2c2r)
- {
- if (params.fft_placeness == CLFFT_INPLACE)
- {
- inOffset += "ioOffset";
- }
- else
- {
- inOffset += "iOffset";
- }
- }
-
// Call passes
if(numPasses == 1)
{
diff --git a/src/library/generator.transpose.gcn.cpp b/src/library/generator.transpose.gcn.cpp
index c4d6e0b..40736b2 100644
--- a/src/library/generator.transpose.gcn.cpp
+++ b/src/library/generator.transpose.gcn.cpp
@@ -315,6 +315,18 @@ static clfftStatus genTransposePrototype( const FFTGeneratedTransposeGCNAction::
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
+ if (params.fft_hasPreCallback)
+ {
+ if (params.fft_preCallback.localMemSize > 0)
+ {
+ clKernWrite( transKernel, 0 ) << ", __global void* userdata, __local void* localmem";
+ }
+ else
+ {
+ clKernWrite( transKernel, 0 ) << ", __global void* userdata";
+ }
+ }
+
// Close the method signature
clKernWrite( transKernel, 0 ) << " )\n{" << std::endl;
@@ -383,6 +395,21 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Sig
clKernWrite( transKernel, 3 ) << "size_t y;" << std::endl;
clKernWrite( transKernel, 0 ) << "} Tile;" << std::endl << std::endl;
+ //If pre-callback is set for the plan
+ if (params.fft_hasPreCallback)
+ {
+ //If user defined struct defined for callback function add it to opencl source string
+ if (params.fft_preCallback.userdatastruct != NULL)
+ {
+ clKernWrite( transKernel, 0 ) << params.fft_preCallback.userdatastruct;
+ clKernWrite( transKernel, 0 ) << std::endl;
+ }
+
+ //Insert callback function code at the beginning
+ clKernWrite( transKernel, 0 ) << params.fft_preCallback.funcstring << std::endl;
+ clKernWrite( transKernel, 0 ) << std::endl;
+ }
+
// This detects whether the input matrix is square
bool notSquare = ( params.fft_N[ 0 ] == params.fft_N[ 1 ] ) ? false : true;
@@ -572,7 +599,11 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Sig
clKernWrite( transKernel, 3 ) << "{" << std::endl;
}
-
+ //If precallback is set
+ if (params.fft_hasPreCallback)
+ {
+ clKernWrite( transKernel, 6 ) << dtComplex << " retCallback;" << std::endl;
+ }
clKernWrite( transKernel, 6 ) << "for( uint t=0; t < wgUnroll; t++ )" << std::endl;
clKernWrite( transKernel, 6 ) << "{" << std::endl;
@@ -631,11 +662,46 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Sig
switch( params.fft_inputLayout )
{
case CLFFT_COMPLEX_INTERLEAVED:
- clKernWrite( transKernel, 9 ) << "tmp = tileIn[ gInd ];" << std::endl;
+ {
+ if (params.fft_hasPreCallback)
+ {
+ if (params.fft_preCallback.localMemSize > 0)
+ {
+ clKernWrite( transKernel, 9 ) << "retCallback = " << params.fft_preCallback.funcname << "(" << pmComplexIn << ", iOffset + gInd, userdata, localmem);" << std::endl;
+ }
+ else
+ {
+ clKernWrite( transKernel, 9 ) << "retCallback = " << params.fft_preCallback.funcname << "(" << pmComplexIn << ", iOffset + gInd, userdata);" << std::endl;
+ }
+ clKernWrite( transKernel, 9 ) << "tmp = retCallback;" << std::endl;
+ }
+ else
+ {
+ clKernWrite( transKernel, 9 ) << "tmp = tileIn[ gInd ];" << std::endl;
+ }
+ }
break;
case CLFFT_COMPLEX_PLANAR:
- clKernWrite( transKernel, 9 ) << "tmp.s0 = realTileIn[ gInd ];" << std::endl;
- clKernWrite( transKernel, 9 ) << "tmp.s1 = imagTileIn[ gInd ];" << std::endl;
+ {
+ if (params.fft_hasPreCallback)
+ {
+ if (params.fft_preCallback.localMemSize > 0)
+ {
+ clKernWrite( transKernel, 9 ) << "retCallback = " << params.fft_preCallback.funcname << "(" << pmRealIn << ", " << pmImagIn << ", iOffset + gInd, userdata, localmem);" << std::endl;
+ }
+ else
+ {
+ clKernWrite( transKernel, 9 ) << "retCallback = " << params.fft_preCallback.funcname << "(" << pmRealIn << ", " << pmImagIn << ", iOffset + gInd, userdata);" << std::endl;
+ }
+ clKernWrite( transKernel, 9 ) << "tmp.s0 = retCallback.x;" << std::endl;
+ clKernWrite( transKernel, 9 ) << "tmp.s1 = retCallback.y;" << std::endl;
+ }
+ else
+ {
+ clKernWrite( transKernel, 9 ) << "tmp.s0 = realTileIn[ gInd ];" << std::endl;
+ clKernWrite( transKernel, 9 ) << "tmp.s1 = imagTileIn[ gInd ];" << std::endl;
+ }
+ }
break;
case CLFFT_HERMITIAN_INTERLEAVED:
case CLFFT_HERMITIAN_PLANAR:
@@ -918,6 +984,13 @@ clfftStatus FFTGeneratedTransposeGCNAction::initParams ()
this->signature.fft_R = 1; // Dont think i'll use
this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
+ //Set callback if specified
+ if (this->plan->hasPreCallback)
+ {
+ this->signature.fft_hasPreCallback = true;
+ this->signature.fft_preCallback = this->plan->preCallback;
+ }
+
return CLFFT_SUCCESS;
}
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index ae06d07..d0a0a84 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -651,6 +651,14 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans1Plan->gen = Transpose_GCN;
trans1Plan->transflag = true;
+ //Set callback data if set on top level plan
+ if (fftPlan->hasPreCallback)
+ {
+ trans1Plan->hasPreCallback = true;
+ trans1Plan->preCallback = fftPlan->preCallback;
+ trans1Plan->precallUserData = fftPlan->precallUserData;
+ }
+
OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
_T( "BakePlan large1d trans1 plan failed" ) );
@@ -1392,6 +1400,14 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans1Plan->oDist *= fftPlan->length[index];
}
+ //Set callback data if set on top level plan
+ if (fftPlan->hasPreCallback)
+ {
+ trans1Plan->hasPreCallback = true;
+ trans1Plan->preCallback = fftPlan->preCallback;
+ trans1Plan->precallUserData = fftPlan->precallUserData;
+ }
+
OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
_T( "BakePlan large1d trans1 plan failed" ) );
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list