[clfft] 77/128: enabling inplace transposition for select conditions

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:41 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit 1e661fb0d7ca45f915ef16d4dc19afc8001a94da
Author: bragadeesh <bragadeesh.natarajan at amd>
Date:   Tue Sep 15 19:09:10 2015 -0500

    enabling inplace transposition for select conditions
---
 src/library/generator.transpose.square.cpp | 64 ++++++++++++++++++-------
 src/library/plan.cpp                       | 42 +++++++++++------
 src/library/plan.h                         |  6 +++
 src/library/transform.cpp                  | 75 ++++++++++++++++++++++++------
 4 files changed, 140 insertions(+), 47 deletions(-)

diff --git a/src/library/generator.transpose.square.cpp b/src/library/generator.transpose.square.cpp
index 3c1df77..a546aad 100644
--- a/src/library/generator.transpose.square.cpp
+++ b/src/library/generator.transpose.square.cpp
@@ -202,7 +202,7 @@ static clfftStatus genTransposePrototype( const FFTGeneratedTransposeSquareActio
         return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
     }
 
-
+	if(params.fft_placeness == CLFFT_OUTOFPLACE)
 	switch (params.fft_outputLayout)
 	{
 		case CLFFT_COMPLEX_INTERLEAVED:
@@ -332,7 +332,9 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 		clKernWrite(transKernel, 3) << std::endl;
 
 		OffsetCalc(transKernel, params, true);
-		OffsetCalc(transKernel, params, false);
+
+		if(params.fft_placeness == CLFFT_OUTOFPLACE)
+			OffsetCalc(transKernel, params, false);
 
 
 		// Handle planar and interleaved right here
@@ -357,24 +359,52 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeSquareAction::
 			default:
 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
 		}
-		switch (params.fft_outputLayout)
+
+		if(params.fft_placeness == CLFFT_OUTOFPLACE)
 		{
-			case CLFFT_COMPLEX_INTERLEAVED:
-				clKernWrite(transKernel, 3) << "outputA += oOffset;" << std::endl;  // Set A ptr to the start of each slice
+			switch (params.fft_outputLayout)
+			{
+				case CLFFT_COMPLEX_INTERLEAVED:
+					clKernWrite(transKernel, 3) << "outputA += oOffset;" << std::endl;  // Set A ptr to the start of each slice
 
-				break;
-			case CLFFT_COMPLEX_PLANAR:
+					break;
+				case CLFFT_COMPLEX_PLANAR:
 
-				clKernWrite(transKernel, 3) << "outputA_R += oOffset;" << std::endl;  // Set A ptr to the start of each slice 
-				clKernWrite(transKernel, 3) << "outputA_I += oOffset;" << std::endl;  // Set A ptr to the start of each slice 
-				break;
-			case CLFFT_HERMITIAN_INTERLEAVED:
-			case CLFFT_HERMITIAN_PLANAR:
-				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
-			case CLFFT_REAL:
-				break;
-			default:
-				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+					clKernWrite(transKernel, 3) << "outputA_R += oOffset;" << std::endl;  // Set A ptr to the start of each slice 
+					clKernWrite(transKernel, 3) << "outputA_I += oOffset;" << std::endl;  // Set A ptr to the start of each slice 
+					break;
+				case CLFFT_HERMITIAN_INTERLEAVED:
+				case CLFFT_HERMITIAN_PLANAR:
+					return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+				case CLFFT_REAL:
+					break;
+				default:
+					return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+			}
+		}
+		else
+		{
+			switch (params.fft_inputLayout)
+			{
+				case CLFFT_COMPLEX_INTERLEAVED:
+					clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA;" << std::endl; 
+
+					break;
+				case CLFFT_COMPLEX_PLANAR:
+
+					clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R;" << std::endl;
+					clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I;" << std::endl;
+
+				
+					break;
+				case CLFFT_HERMITIAN_INTERLEAVED:
+				case CLFFT_HERMITIAN_PLANAR:
+					return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+				case CLFFT_REAL:
+					break;
+				default:
+					return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+			}
 		}
 
 		
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 4644d00..e0b7181 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -506,7 +506,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				{
 					// Enable block compute under these conditions
 					if( (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1) && !rc
-						&& (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) )
+						&& (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) && (fftPlan->length.size() <= 1) )
 					{
 						fftPlan->blockCompute = true;
 
@@ -621,7 +621,19 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					if( (smallerDim % 64 == 0) || (biggerDim % 64 == 0) )
 						padding = 64;
 
-					if (fftPlan->tmpBufSize==0 )
+					clfftGenerators transGen = Transpose_GCN;
+
+					if( (clLengths[0] == clLengths[1]) &&
+						(fftPlan->iDist == fftPlan->length[0]) &&
+						(fftPlan->oDist == fftPlan->length[0]) &&
+						fftPlan->placeness == CLFFT_INPLACE )
+					{
+						padding = 0;
+						fftPlan->allOpsInplace = true;
+						transGen = Transpose_SQUARE;
+					}
+
+					if ( (fftPlan->tmpBufSize==0 ) && !fftPlan->allOpsInplace)
 					{
 						fftPlan->tmpBufSize = (smallerDim + padding) * biggerDim *
 							fftPlan->batchsize * fftPlan->ElementSize();
@@ -636,20 +648,20 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					lockRAII* trans1Lock	= NULL;
 					OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
 
-					trans1Plan->placeness     = CLFFT_OUTOFPLACE;
+					trans1Plan->placeness     = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
 					trans1Plan->precision     = fftPlan->precision;
 					trans1Plan->tmpBufSize    = 0;
 					trans1Plan->batchsize     = fftPlan->batchsize;
 					trans1Plan->envelope	  = fftPlan->envelope;
 					trans1Plan->inputLayout   = fftPlan->inputLayout;
-					trans1Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+					trans1Plan->outputLayout  = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
 					trans1Plan->inStride[0]   = fftPlan->inStride[0];
 					trans1Plan->inStride[1]   = clLengths[0];
 					trans1Plan->outStride[0]  = 1;
 					trans1Plan->outStride[1]  = clLengths[1] + padding;
 					trans1Plan->iDist         = fftPlan->iDist;
 					trans1Plan->oDist         = clLengths[0] * trans1Plan->outStride[1];
-					trans1Plan->gen           = Transpose_GCN;
+					trans1Plan->gen           = transGen;
 					trans1Plan->transflag     = true;
 
 					OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
@@ -665,7 +677,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					lockRAII* row1Lock	= NULL;
 					OPENCL_V( fftRepo.getPlan( fftPlan->planX, row1Plan, row1Lock ), _T( "fftRepo.getPlan failed" ) );
 
-					row1Plan->placeness     = CLFFT_OUTOFPLACE;
+					row1Plan->placeness     = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
 					row1Plan->precision     = fftPlan->precision;
 					row1Plan->forwardScale  = 1.0f;
 					row1Plan->backwardScale = 1.0f;
@@ -679,7 +691,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					row1Plan->large1D		= 0;
 
 					row1Plan->length.push_back(clLengths[0]);
-					row1Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+					row1Plan->inputLayout   = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
 					row1Plan->outputLayout  = fftPlan->outputLayout;
 					row1Plan->inStride[0]   = 1;
 					row1Plan->outStride[0]  = fftPlan->outStride[0];
@@ -702,20 +714,20 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					lockRAII* trans2Lock	= NULL;
 					OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
 
-					trans2Plan->placeness     = CLFFT_OUTOFPLACE;
+					trans2Plan->placeness     = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
 					trans2Plan->precision     = fftPlan->precision;
 					trans2Plan->tmpBufSize    = 0;
 					trans2Plan->batchsize     = fftPlan->batchsize;
 					trans2Plan->envelope	  = fftPlan->envelope;
 					trans2Plan->inputLayout   = fftPlan->outputLayout;
-					trans2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+					trans2Plan->outputLayout  = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
 					trans2Plan->inStride[0]   = fftPlan->outStride[0];
 					trans2Plan->inStride[1]   = clLengths[1];
 					trans2Plan->outStride[0]  = 1;
 					trans2Plan->outStride[1]  = clLengths[0] + padding;
 					trans2Plan->iDist         = fftPlan->oDist;
 					trans2Plan->oDist         = clLengths[1] * trans2Plan->outStride[1];
-                    trans2Plan->gen           = Transpose_GCN;
+                    trans2Plan->gen           = transGen;
 					trans2Plan->large1D		  = fftPlan->length[0];
 					trans2Plan->transflag     = true;
 
@@ -744,8 +756,8 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 
 
 					row2Plan->length.push_back(clLengths[1]);
-					row2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
-					row2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+					row2Plan->inputLayout   = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
+					row2Plan->outputLayout  = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
 					row2Plan->inStride[0]   = 1;
 					row2Plan->outStride[0]  = 1;
 					row2Plan->inStride.push_back(clLengths[0] + padding);
@@ -766,12 +778,12 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					lockRAII* trans3Lock	= NULL;
 					OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
 
-					trans3Plan->placeness     = CLFFT_OUTOFPLACE;
+					trans3Plan->placeness     = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
 					trans3Plan->precision     = fftPlan->precision;
 					trans3Plan->tmpBufSize    = 0;
 					trans3Plan->batchsize     = fftPlan->batchsize;
 					trans3Plan->envelope	  = fftPlan->envelope;
-					trans3Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+					trans3Plan->inputLayout   = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
 					trans3Plan->outputLayout  = fftPlan->outputLayout;
 					trans3Plan->inStride[0]   = 1;
 					trans3Plan->inStride[1]   = clLengths[0] + padding;
@@ -779,7 +791,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					trans3Plan->outStride[1]  = clLengths[1];
 					trans3Plan->iDist         = clLengths[1] * trans3Plan->inStride[1];
 					trans3Plan->oDist         = fftPlan->oDist;
-                    trans3Plan->gen           = Transpose_GCN;
+                    trans3Plan->gen           = transGen;
 					trans3Plan->transflag     = true;
 					trans3Plan->transOutHorizontal = true;
 
diff --git a/src/library/plan.h b/src/library/plan.h
index be6231e..c6d57b9 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -423,6 +423,11 @@ public:
 	// User created plan
 	bool userPlan;
 
+
+	// Allocate no extra memory
+	bool allOpsInplace;
+
+
 	// A flag to say that blocked FFTs are going to be performed
 	// It can only be one of these: column to row, row to column or column to column
 	// row to row is just the normal case where blocking is not needed
@@ -468,6 +473,7 @@ public:
 	,	realSpecial(false)
 	,	realSpecial_Nr(0)
 	,	userPlan(false)
+	,	allOpsInplace(false)
 	,	blockCompute(false)
 	,	blockComputeType(BCT_C2C)
 	,   planTX( 0 )
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index 29cdc46..a45a2bf 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -303,9 +303,18 @@ clfftStatus clfftEnqueueTransform(
 					//First transpose
 					// Input->tmp
 					cl_event transTXOutEvents = NULL;
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planTX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
-						waitEvents, &transTXOutEvents, clInputBuffers, &localIntBuffer, NULL ),
-						_T("clfftEnqueueTransform for large1D transTX failed"));
+					if(fftPlan->allOpsInplace)
+					{
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planTX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+							waitEvents, &transTXOutEvents, clInputBuffers, NULL, NULL ),
+							_T("clfftEnqueueTransform for large1D transTX failed"));
+					}
+					else
+					{
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planTX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+							waitEvents, &transTXOutEvents, clInputBuffers, &localIntBuffer, NULL ),
+							_T("clfftEnqueueTransform for large1D transTX failed"));
+					}
 
 					cl_mem *mybuffers;
 					if (fftPlan->placeness==CLFFT_INPLACE)
@@ -325,9 +334,18 @@ clfftStatus clfftEnqueueTransform(
 					//First Row
 					//tmp->output
 					cl_event rowXOutEvents = NULL;
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, 1,
-						&transTXOutEvents, &rowXOutEvents, &localIntBuffer, mybuffers, NULL ),
-						_T("clfftEnqueueTransform for large1D rowX failed"));
+					if(fftPlan->allOpsInplace)
+					{
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, 1,
+							&transTXOutEvents, &rowXOutEvents, clInputBuffers, NULL, NULL ),
+							_T("clfftEnqueueTransform for large1D rowX failed"));
+					}
+					else
+					{
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, 1,
+							&transTXOutEvents, &rowXOutEvents, &localIntBuffer, mybuffers, NULL ),
+							_T("clfftEnqueueTransform for large1D rowX failed"));
+					}
 					clReleaseEvent(transTXOutEvents);
 
 
@@ -343,9 +361,18 @@ clfftStatus clfftEnqueueTransform(
 					//Second Transpose
 					// output->tmp
 					cl_event transTYOutEvents = NULL;
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planTY, dir, numQueuesAndEvents, commQueues, 1,
-						&rowXOutEvents, &transTYOutEvents, mybuffers, &localIntBuffer, NULL ),
-						_T("clfftEnqueueTransform for large1D transTY failed"));
+					if(fftPlan->allOpsInplace)
+					{
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planTY, dir, numQueuesAndEvents, commQueues, 1,
+							&rowXOutEvents, &transTYOutEvents, clInputBuffers, NULL, NULL ),
+							_T("clfftEnqueueTransform for large1D transTY failed"));
+					}
+					else
+					{
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planTY, dir, numQueuesAndEvents, commQueues, 1,
+							&rowXOutEvents, &transTYOutEvents, mybuffers, &localIntBuffer, NULL ),
+							_T("clfftEnqueueTransform for large1D transTY failed"));
+					}
 					clReleaseEvent(rowXOutEvents);
 
 
@@ -361,9 +388,18 @@ clfftStatus clfftEnqueueTransform(
 					//Second Row
 					//tmp->tmp, inplace
 					cl_event rowYOutEvents = NULL;
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1,
-						&transTYOutEvents, &rowYOutEvents, &localIntBuffer, NULL, NULL ),
-						_T("clfftEnqueueTransform for large1D rowY failed"));
+					if(fftPlan->allOpsInplace)
+					{
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1,
+							&transTYOutEvents, &rowYOutEvents, clInputBuffers, NULL, NULL ),
+							_T("clfftEnqueueTransform for large1D rowY failed"));
+					}
+					else
+					{
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1,
+							&transTYOutEvents, &rowYOutEvents, &localIntBuffer, NULL, NULL ),
+							_T("clfftEnqueueTransform for large1D rowY failed"));
+					}
 					clReleaseEvent(transTYOutEvents);
 
 #if defined(DEBUGGING)
@@ -377,9 +413,18 @@ clfftStatus clfftEnqueueTransform(
 
 					//Third Transpose
 					// tmp->output
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1,
-						&rowYOutEvents, outEvents, &localIntBuffer, mybuffers, NULL ),
-						_T("clfftEnqueueTransform for large1D transTZ failed"));
+					if(fftPlan->allOpsInplace)
+					{
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1,
+							&rowYOutEvents, outEvents, clInputBuffers, NULL, NULL ),
+							_T("clfftEnqueueTransform for large1D transTZ failed"));
+					}
+					else
+					{
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1,
+							&rowYOutEvents, outEvents, &localIntBuffer, mybuffers, NULL ),
+							_T("clfftEnqueueTransform for large1D transTZ failed"));
+					}
 					clReleaseEvent(rowYOutEvents);
 
 				}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list