[clfft] 77/128: enabling in-place transposition for select conditions
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:41 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 1e661fb0d7ca45f915ef16d4dc19afc8001a94da
Author: bragadeesh <bragadeesh.natarajan at amd>
Date: Tue Sep 15 19:09:10 2015 -0500
enabling in-place transposition for select conditions
---
src/library/generator.transpose.square.cpp | 64 ++++++++++++++++++-------
src/library/plan.cpp | 42 +++++++++++------
src/library/plan.h | 6 +++
src/library/transform.cpp | 75 ++++++++++++++++++++++++------
4 files changed, 140 insertions(+), 47 deletions(-)
diff --git a/src/library/generator.transpose.square.cpp b/src/library/generator.transpose.square.cpp
index 3c1df77..a546aad 100644
--- a/src/library/generator.transpose.square.cpp
+++ b/src/library/generator.transpose.square.cpp
@@ -202,7 +202,7 @@ static clfftStatus genTransposePrototype( const FFTGeneratedTransposeSquareActio
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
-
+ if(params.fft_placeness == CLFFT_OUTOFPLACE)
switch (params.fft_outputLayout)
{
case CLFFT_COMPLEX_INTERLEAVED:
@@ -332,7 +332,9 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
clKernWrite(transKernel, 3) << std::endl;
OffsetCalc(transKernel, params, true);
- OffsetCalc(transKernel, params, false);
+
+ if(params.fft_placeness == CLFFT_OUTOFPLACE)
+ OffsetCalc(transKernel, params, false);
// Handle planar and interleaved right here
@@ -357,24 +359,52 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
default:
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
- switch (params.fft_outputLayout)
+
+ if(params.fft_placeness == CLFFT_OUTOFPLACE)
{
- case CLFFT_COMPLEX_INTERLEAVED:
- clKernWrite(transKernel, 3) << "outputA += oOffset;" << std::endl; // Set A ptr to the start of each slice
+ switch (params.fft_outputLayout)
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ clKernWrite(transKernel, 3) << "outputA += oOffset;" << std::endl; // Set A ptr to the start of each slice
- break;
- case CLFFT_COMPLEX_PLANAR:
+ break;
+ case CLFFT_COMPLEX_PLANAR:
- clKernWrite(transKernel, 3) << "outputA_R += oOffset;" << std::endl; // Set A ptr to the start of each slice
- clKernWrite(transKernel, 3) << "outputA_I += oOffset;" << std::endl; // Set A ptr to the start of each slice
- break;
- case CLFFT_HERMITIAN_INTERLEAVED:
- case CLFFT_HERMITIAN_PLANAR:
- return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
- case CLFFT_REAL:
- break;
- default:
- return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ clKernWrite(transKernel, 3) << "outputA_R += oOffset;" << std::endl; // Set A ptr to the start of each slice
+ clKernWrite(transKernel, 3) << "outputA_I += oOffset;" << std::endl; // Set A ptr to the start of each slice
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ case CLFFT_REAL:
+ break;
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+ }
+ else
+ {
+ switch (params.fft_inputLayout)
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA;" << std::endl;
+
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+
+ clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R;" << std::endl;
+ clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I;" << std::endl;
+
+
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ case CLFFT_REAL:
+ break;
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
}
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 4644d00..e0b7181 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -506,7 +506,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
{
// Enable block compute under these conditions
if( (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1) && !rc
- && (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) )
+ && (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) && (fftPlan->length.size() <= 1) )
{
fftPlan->blockCompute = true;
@@ -621,7 +621,19 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
if( (smallerDim % 64 == 0) || (biggerDim % 64 == 0) )
padding = 64;
- if (fftPlan->tmpBufSize==0 )
+ clfftGenerators transGen = Transpose_GCN;
+
+ if( (clLengths[0] == clLengths[1]) &&
+ (fftPlan->iDist == fftPlan->length[0]) &&
+ (fftPlan->oDist == fftPlan->length[0]) &&
+ fftPlan->placeness == CLFFT_INPLACE )
+ {
+ padding = 0;
+ fftPlan->allOpsInplace = true;
+ transGen = Transpose_SQUARE;
+ }
+
+ if ( (fftPlan->tmpBufSize==0 ) && !fftPlan->allOpsInplace)
{
fftPlan->tmpBufSize = (smallerDim + padding) * biggerDim *
fftPlan->batchsize * fftPlan->ElementSize();
@@ -636,20 +648,20 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
lockRAII* trans1Lock = NULL;
OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
- trans1Plan->placeness = CLFFT_OUTOFPLACE;
+ trans1Plan->placeness = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
trans1Plan->precision = fftPlan->precision;
trans1Plan->tmpBufSize = 0;
trans1Plan->batchsize = fftPlan->batchsize;
trans1Plan->envelope = fftPlan->envelope;
trans1Plan->inputLayout = fftPlan->inputLayout;
- trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ trans1Plan->outputLayout = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
trans1Plan->inStride[0] = fftPlan->inStride[0];
trans1Plan->inStride[1] = clLengths[0];
trans1Plan->outStride[0] = 1;
trans1Plan->outStride[1] = clLengths[1] + padding;
trans1Plan->iDist = fftPlan->iDist;
trans1Plan->oDist = clLengths[0] * trans1Plan->outStride[1];
- trans1Plan->gen = Transpose_GCN;
+ trans1Plan->gen = transGen;
trans1Plan->transflag = true;
OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
@@ -665,7 +677,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
lockRAII* row1Lock = NULL;
OPENCL_V( fftRepo.getPlan( fftPlan->planX, row1Plan, row1Lock ), _T( "fftRepo.getPlan failed" ) );
- row1Plan->placeness = CLFFT_OUTOFPLACE;
+ row1Plan->placeness = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
row1Plan->precision = fftPlan->precision;
row1Plan->forwardScale = 1.0f;
row1Plan->backwardScale = 1.0f;
@@ -679,7 +691,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
row1Plan->large1D = 0;
row1Plan->length.push_back(clLengths[0]);
- row1Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ row1Plan->inputLayout = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
row1Plan->outputLayout = fftPlan->outputLayout;
row1Plan->inStride[0] = 1;
row1Plan->outStride[0] = fftPlan->outStride[0];
@@ -702,20 +714,20 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
lockRAII* trans2Lock = NULL;
OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
- trans2Plan->placeness = CLFFT_OUTOFPLACE;
+ trans2Plan->placeness = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
trans2Plan->precision = fftPlan->precision;
trans2Plan->tmpBufSize = 0;
trans2Plan->batchsize = fftPlan->batchsize;
trans2Plan->envelope = fftPlan->envelope;
trans2Plan->inputLayout = fftPlan->outputLayout;
- trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ trans2Plan->outputLayout = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
trans2Plan->inStride[0] = fftPlan->outStride[0];
trans2Plan->inStride[1] = clLengths[1];
trans2Plan->outStride[0] = 1;
trans2Plan->outStride[1] = clLengths[0] + padding;
trans2Plan->iDist = fftPlan->oDist;
trans2Plan->oDist = clLengths[1] * trans2Plan->outStride[1];
- trans2Plan->gen = Transpose_GCN;
+ trans2Plan->gen = transGen;
trans2Plan->large1D = fftPlan->length[0];
trans2Plan->transflag = true;
@@ -744,8 +756,8 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
row2Plan->length.push_back(clLengths[1]);
- row2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
- row2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ row2Plan->inputLayout = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
+ row2Plan->outputLayout = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
row2Plan->inStride[0] = 1;
row2Plan->outStride[0] = 1;
row2Plan->inStride.push_back(clLengths[0] + padding);
@@ -766,12 +778,12 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
lockRAII* trans3Lock = NULL;
OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
- trans3Plan->placeness = CLFFT_OUTOFPLACE;
+ trans3Plan->placeness = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
trans3Plan->precision = fftPlan->precision;
trans3Plan->tmpBufSize = 0;
trans3Plan->batchsize = fftPlan->batchsize;
trans3Plan->envelope = fftPlan->envelope;
- trans3Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ trans3Plan->inputLayout = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
trans3Plan->outputLayout = fftPlan->outputLayout;
trans3Plan->inStride[0] = 1;
trans3Plan->inStride[1] = clLengths[0] + padding;
@@ -779,7 +791,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans3Plan->outStride[1] = clLengths[1];
trans3Plan->iDist = clLengths[1] * trans3Plan->inStride[1];
trans3Plan->oDist = fftPlan->oDist;
- trans3Plan->gen = Transpose_GCN;
+ trans3Plan->gen = transGen;
trans3Plan->transflag = true;
trans3Plan->transOutHorizontal = true;
diff --git a/src/library/plan.h b/src/library/plan.h
index be6231e..c6d57b9 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -423,6 +423,11 @@ public:
// User created plan
bool userPlan;
+
+ // Allocate no extra memory
+ bool allOpsInplace;
+
+
// A flag to say that blocked FFTs are going to be performed
// It can only be one of these: column to row, row to column or column to column
// row to row is just the normal case where blocking is not needed
@@ -468,6 +473,7 @@ public:
, realSpecial(false)
, realSpecial_Nr(0)
, userPlan(false)
+ , allOpsInplace(false)
, blockCompute(false)
, blockComputeType(BCT_C2C)
, planTX( 0 )
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index 29cdc46..a45a2bf 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -303,9 +303,18 @@ clfftStatus clfftEnqueueTransform(
//First transpose
// Input->tmp
cl_event transTXOutEvents = NULL;
- OPENCL_V( clfftEnqueueTransform( fftPlan->planTX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
- waitEvents, &transTXOutEvents, clInputBuffers, &localIntBuffer, NULL ),
- _T("clfftEnqueueTransform for large1D transTX failed"));
+ if(fftPlan->allOpsInplace)
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planTX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+ waitEvents, &transTXOutEvents, clInputBuffers, NULL, NULL ),
+ _T("clfftEnqueueTransform for large1D transTX failed"));
+ }
+ else
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planTX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+ waitEvents, &transTXOutEvents, clInputBuffers, &localIntBuffer, NULL ),
+ _T("clfftEnqueueTransform for large1D transTX failed"));
+ }
cl_mem *mybuffers;
if (fftPlan->placeness==CLFFT_INPLACE)
@@ -325,9 +334,18 @@ clfftStatus clfftEnqueueTransform(
//First Row
//tmp->output
cl_event rowXOutEvents = NULL;
- OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, 1,
- &transTXOutEvents, &rowXOutEvents, &localIntBuffer, mybuffers, NULL ),
- _T("clfftEnqueueTransform for large1D rowX failed"));
+ if(fftPlan->allOpsInplace)
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, 1,
+ &transTXOutEvents, &rowXOutEvents, clInputBuffers, NULL, NULL ),
+ _T("clfftEnqueueTransform for large1D rowX failed"));
+ }
+ else
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, 1,
+ &transTXOutEvents, &rowXOutEvents, &localIntBuffer, mybuffers, NULL ),
+ _T("clfftEnqueueTransform for large1D rowX failed"));
+ }
clReleaseEvent(transTXOutEvents);
@@ -343,9 +361,18 @@ clfftStatus clfftEnqueueTransform(
//Second Transpose
// output->tmp
cl_event transTYOutEvents = NULL;
- OPENCL_V( clfftEnqueueTransform( fftPlan->planTY, dir, numQueuesAndEvents, commQueues, 1,
- &rowXOutEvents, &transTYOutEvents, mybuffers, &localIntBuffer, NULL ),
- _T("clfftEnqueueTransform for large1D transTY failed"));
+ if(fftPlan->allOpsInplace)
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planTY, dir, numQueuesAndEvents, commQueues, 1,
+ &rowXOutEvents, &transTYOutEvents, clInputBuffers, NULL, NULL ),
+ _T("clfftEnqueueTransform for large1D transTY failed"));
+ }
+ else
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planTY, dir, numQueuesAndEvents, commQueues, 1,
+ &rowXOutEvents, &transTYOutEvents, mybuffers, &localIntBuffer, NULL ),
+ _T("clfftEnqueueTransform for large1D transTY failed"));
+ }
clReleaseEvent(rowXOutEvents);
@@ -361,9 +388,18 @@ clfftStatus clfftEnqueueTransform(
//Second Row
//tmp->tmp, inplace
cl_event rowYOutEvents = NULL;
- OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1,
- &transTYOutEvents, &rowYOutEvents, &localIntBuffer, NULL, NULL ),
- _T("clfftEnqueueTransform for large1D rowY failed"));
+ if(fftPlan->allOpsInplace)
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1,
+ &transTYOutEvents, &rowYOutEvents, clInputBuffers, NULL, NULL ),
+ _T("clfftEnqueueTransform for large1D rowY failed"));
+ }
+ else
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1,
+ &transTYOutEvents, &rowYOutEvents, &localIntBuffer, NULL, NULL ),
+ _T("clfftEnqueueTransform for large1D rowY failed"));
+ }
clReleaseEvent(transTYOutEvents);
#if defined(DEBUGGING)
@@ -377,9 +413,18 @@ clfftStatus clfftEnqueueTransform(
//Third Transpose
// tmp->output
- OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1,
- &rowYOutEvents, outEvents, &localIntBuffer, mybuffers, NULL ),
- _T("clfftEnqueueTransform for large1D transTZ failed"));
+ if(fftPlan->allOpsInplace)
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1,
+ &rowYOutEvents, outEvents, clInputBuffers, NULL, NULL ),
+ _T("clfftEnqueueTransform for large1D transTZ failed"));
+ }
+ else
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1,
+ &rowYOutEvents, outEvents, &localIntBuffer, mybuffers, NULL ),
+ _T("clfftEnqueueTransform for large1D transTZ failed"));
+ }
clReleaseEvent(rowYOutEvents);
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits mailing list