[clfft] 17/21: fixing 2D transpose perf problems reported in #134
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Wed Mar 16 13:14:04 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit cf0199c580d7297f2ab64cc76637cbc9887feb57
Author: bragadeesh <bragadeesh.natarajan at amd>
Date: Wed Mar 9 16:03:46 2016 -0800
fixing 2D transpose perf problems reported in #134
---
src/library/plan.cpp | 43 +++++++++++++++++++++++++++----------------
src/library/transform.cpp | 3 +--
2 files changed, 28 insertions(+), 18 deletions(-)
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 36a2ab5..d7a52dd 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -2249,13 +2249,20 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
clLengths[0] = fftPlan->length[0];
clLengths[1] = fftPlan->length[1];
- // bool xyflag = (clLengths[0]==clLengths[1]) ? false : true;
- bool xyflag = true;
+ size_t biggerDim = clLengths[0] > clLengths[1] ? clLengths[0] : clLengths[1];
+ size_t smallerDim = biggerDim == clLengths[0] ? clLengths[1] : clLengths[0];
+ size_t padding = 0;
+
+ bool xyflag = (clLengths[0]==clLengths[1]) ? false : true;
if (xyflag && fftPlan->tmpBufSize==0 && fftPlan->length.size()<=2)
{
+ if ((smallerDim % 64 == 0) || (biggerDim % 64 == 0))
+ if(biggerDim > 512)
+ padding = 64;
+
// we need tmp buffer for x!=y case
// we assume the tmp buffer is packed interleaved
- fftPlan->tmpBufSize = length0 * length1 *
+ fftPlan->tmpBufSize = (smallerDim + padding) * biggerDim *
fftPlan->batchsize * fftPlan->ElementSize();
}
@@ -2269,7 +2276,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
transPlanX->inputLayout = fftPlan->outputLayout;
transPlanX->precision = fftPlan->precision;
transPlanX->tmpBufSize = 0;
- transPlanX->gen = Transpose_GCN;
+
transPlanX->envelope = fftPlan->envelope;
transPlanX->batchsize = fftPlan->batchsize;
transPlanX->inStride[0] = fftPlan->outStride[0];
@@ -2279,14 +2286,16 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
if (xyflag)
{
+ transPlanX->gen = Transpose_GCN;
transPlanX->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
transPlanX->placeness = CLFFT_OUTOFPLACE;
transPlanX->outStride[0] = 1;
- transPlanX->outStride[1] = clLengths[1];
- transPlanX->oDist = clLengths[0] * clLengths[1];
+ transPlanX->outStride[1] = clLengths[1] + padding;
+ transPlanX->oDist = clLengths[0] * transPlanX->outStride[1];
}
else
{
+ transPlanX->gen = Transpose_SQUARE;
transPlanX->outputLayout = fftPlan->outputLayout;
transPlanX->placeness = CLFFT_INPLACE;
transPlanX->outStride[0] = fftPlan->outStride[0];
@@ -2311,15 +2320,15 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
{
colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
colPlan->inStride[0] = 1;
- colPlan->inStride.push_back(clLengths[1]);
- colPlan->iDist = clLengths[0] * clLengths[1];
+ colPlan->inStride.push_back(clLengths[1] + padding);
+ colPlan->iDist = clLengths[0] * colPlan->inStride[1];
if (fftPlan->transposed == CLFFT_NOTRANSPOSE)
{
colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
colPlan->outStride[0] = 1;
- colPlan->outStride.push_back(clLengths[1]);
- colPlan->oDist = clLengths[0] * clLengths[1];
+ colPlan->outStride.push_back(clLengths[1] + padding);
+ colPlan->oDist = clLengths[0] * colPlan->outStride[1];
colPlan->placeness = CLFFT_INPLACE;
}
else
@@ -2365,9 +2374,8 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
//Create transpose plan for second transpose
//x!=y case tmp->In or Out, x=y case In->In or Out->out
- clLengths[0] = fftPlan->length[1];
- clLengths[1] = fftPlan->length[0];
- OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, clLengths ),
+ size_t clLengthsY[2] = { clLengths[1], clLengths[0] };
+ OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, clLengthsY ),
_T( "CreateDefaultPlan for planTY failed" ) );
FFTPlan* transPlanY = NULL;
@@ -2376,14 +2384,17 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
if (xyflag)
{
+ transPlanY->gen = Transpose_GCN;
transPlanY->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
transPlanY->placeness = CLFFT_OUTOFPLACE;
transPlanY->inStride[0] = 1;
- transPlanY->inStride[1] = clLengths[0];
- transPlanY->iDist = clLengths[0] * clLengths[1];
+ transPlanY->inStride[1] = clLengths[1] + padding;
+ transPlanY->iDist = clLengths[0] * transPlanY->inStride[1];
+ transPlanY->transOutHorizontal = true;
}
else
{
+ transPlanY->gen = Transpose_SQUARE;
transPlanY->inputLayout = fftPlan->outputLayout;
transPlanY->placeness = CLFFT_INPLACE;
transPlanY->inStride[0] = fftPlan->outStride[0];
@@ -2396,7 +2407,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
transPlanY->oDist = fftPlan->oDist;
transPlanY->precision = fftPlan->precision;
transPlanY->tmpBufSize = 0;
- transPlanY->gen = Transpose_GCN;
+
transPlanY->envelope = fftPlan->envelope;
transPlanY->batchsize = fftPlan->batchsize;
transPlanY->transflag = true;
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index a066bbd..0dc557b 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -693,8 +693,7 @@ clfftStatus clfftEnqueueTransform(
cl_event transXOutEvents = NULL;
cl_event colOutEvents = NULL;
- //bool xyflag = (fftPlan->length[0] == fftPlan->length[1]) ? false : true;
- bool xyflag = true;
+ bool xyflag = (fftPlan->length[0] == fftPlan->length[1]) ? false : true;
if (xyflag)
{
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list