[clfft] 33/74: planner updates to integrate non-square i/p transposes

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Jan 14 19:52:15 UTC 2016


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository clfft.

commit 3d17e2aac332d276dfebcf24cc3785c6f502869b
Author: bragadeesh <bragadeesh.natarajan at amd>
Date:   Mon Dec 7 11:15:15 2015 -0800

    planner updates to integrate non-square i/p transposes
---
 src/library/generator.transpose.nonsquare.cpp |  12 +--
 src/library/plan.cpp                          | 149 +++++++++++++++++++-------
 src/library/plan.h                            |  16 +--
 3 files changed, 124 insertions(+), 53 deletions(-)

diff --git a/src/library/generator.transpose.nonsquare.cpp b/src/library/generator.transpose.nonsquare.cpp
index 33992ba..4292fc3 100644
--- a/src/library/generator.transpose.nonsquare.cpp
+++ b/src/library/generator.transpose.nonsquare.cpp
@@ -1424,15 +1424,15 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::initParams()
     if (CLFFT_INPLACE == this->signature.fft_placeness)
     {
         //	If this is an in-place transform the
-        //	input and output layout, dimensions and strides
+        //	input and output layout
         //	*MUST* be the same.
         //
         ARG_CHECK(this->signature.fft_inputLayout == this->signature.fft_outputLayout)
 
-            for (size_t u = this->plan->inStride.size(); u-- > 0; )
+    /*        for (size_t u = this->plan->inStride.size(); u-- > 0; )
             {
                 ARG_CHECK(this->plan->inStride[u] == this->plan->outStride[u]);
-            }
+            }*/
     }
 
     this->signature.fft_DataDim = this->plan->length.size() + 1;
@@ -1491,7 +1491,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
 
 
     std::string programCode;
-    if (this->signature.nonSquareKernelType == NON_SQUARE_TRANSPOSE)
+    if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE)
     {
         OPENCL_V(genTransposeKernel(this->signature, programCode, lwSize, reShapeFactor), _T("GenerateTransposeKernel() failed!"));
     }
@@ -1511,7 +1511,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
 
 
     OPENCL_V(fftRepo.setProgramCode(Transpose_NONSQUARE, this->getSignatureData(), programCode, Device, QueueContext), _T("fftRepo.setclString() failed!"));
-    if (this->signature.nonSquareKernelType == NON_SQUARE_TRANSPOSE)
+    if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE)
     {
         // Note:  See genFunctionPrototype( )
         if (this->signature.fft_3StepTwiddle)
@@ -1538,7 +1538,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::getWorkSizes(std::vector< size
     size_t smaller_dim = (this->signature.fft_N[0] < this->signature.fft_N[1]) ? this->signature.fft_N[0] : this->signature.fft_N[1];
     size_t global_item_size;
 
-    if (this->signature.nonSquareKernelType == NON_SQUARE_TRANSPOSE)
+    if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE)
     {
         if (smaller_dim % (16 * reShapeFactor) == 0)
             wg_slice = smaller_dim / 16 / reShapeFactor;
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index d85dbe2..e495b71 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -503,7 +503,8 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				{
 					// Enable block compute under these conditions
 					if( (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1) && !rc
-						&& (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) && (fftPlan->length.size() <= 1) )
+						&& (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) && (fftPlan->length.size() <= 1)
+						&& (!clfftGetRequestLibNoMemAlloc()) )
 					{
 						fftPlan->blockCompute = true;
 
@@ -540,7 +541,13 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					}
 					else
 					{
-						if(fftPlan->length[0] > (Large1DThreshold * Large1DThreshold) )
+						if( clfftGetRequestLibNoMemAlloc() )
+						{
+							in_x = BitScanF(fftPlan->length[0]);
+							in_x /= 2;
+							clLengths[1] = (size_t)1 << in_x;
+						}
+						else if( fftPlan->length[0] > (Large1DThreshold * Large1DThreshold) )
 						{
 							clLengths[1] = fftPlan->length[0] / Large1DThreshold;
 						}
@@ -611,8 +618,9 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 
 					if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break;
 
-					if ( IsPo2(fftPlan->length[0])
-						&& (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) ) break;
+					if ( IsPo2(fftPlan->length[0]) &&
+						 (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) &&
+						 (!clfftGetRequestLibNoMemAlloc()) ) break;
 
 					if ( clLengths[0]<=32 && clLengths[1]<=32) break;
 
@@ -625,10 +633,17 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 
 					clfftGenerators transGen = Transpose_GCN;
 
+					if (clfftGetRequestLibNoMemAlloc() &&
+						(clLengths[0] == 2*clLengths[1]) &&
+						fftPlan->placeness == CLFFT_INPLACE)
+					{
+						padding = 0;
+						fftPlan->allOpsInplace = true;
+						transGen = Transpose_NONSQUARE;
+					}
+
 					if( clfftGetRequestLibNoMemAlloc() &&
 						(clLengths[0] == clLengths[1]) &&
-						(fftPlan->iDist == fftPlan->length[0]) &&
-						(fftPlan->oDist == fftPlan->length[0]) &&
 						fftPlan->placeness == CLFFT_INPLACE )
 					{
 						padding = 0;
@@ -1873,40 +1888,93 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
                 clfftStatus err;
 				if(fftPlan->gen == Transpose_GCN)
 					fftPlan->action = new FFTGeneratedTransposeGCNAction(plHandle, fftPlan, *commQueueFFT, err);
-                else if (fftPlan->gen == Transpose_SQUARE)
+				else if (fftPlan->gen == Transpose_SQUARE)
+					fftPlan->action = new FFTGeneratedTransposeSquareAction(plHandle, fftPlan, *commQueueFFT, err);
+                else if (fftPlan->gen == Transpose_NONSQUARE)
                 {
-                    static int test_performed = 0;
-                    size_t backup_0 = fftPlan->length[0];
-                    size_t backup_1 = fftPlan->length[1];
-                    
-                    clfftLayout  inputLayout_bckup = fftPlan->inputLayout;
-                    clfftLayout  outputLayout_bckup = fftPlan->outputLayout;
-                    clfftPrecision precision_bckup = fftPlan->precision;
-
-                    if (!test_performed)
-                    {
-                        //CLFFT_COMPLEX_PLANAR
-                        //CLFFT_COMPLEX_INTERLEAVED
-                        fftPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
-                        fftPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
-                        if(fftPlan->inputLayout == CLFFT_REAL)
-                            test_performed = 1;
-                        fftPlan->nonSquareKernelType = NON_SQUARE_SWAP;
-                        fftPlan->precision = CLFFT_SINGLE;
-                        fftPlan->length[0] = 4096;// fftPlan->length[1];
-                        fftPlan->length[1] = fftPlan->length[0] * 2;
-                        fftPlan->action = new FFTGeneratedTransposeNonSquareAction(plHandle, fftPlan, *commQueueFFT, err);
-                        OPENCL_V(err, "FFTGeneratedTransposeNonSquareAction() failed");
-
-                    }
-
-                    fftPlan->precision = precision_bckup;
-                    fftPlan->inputLayout = inputLayout_bckup;
-                    fftPlan->outputLayout = outputLayout_bckup;
-                    fftPlan->length[0] = backup_0;
-                    fftPlan->length[1] = backup_1;
-
-                    fftPlan->action = new FFTGeneratedTransposeSquareAction(plHandle, fftPlan, *commQueueFFT, err);
+					if(fftPlan->nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE)
+						fftPlan->action = new FFTGeneratedTransposeNonSquareAction(plHandle, fftPlan, *commQueueFFT, err);
+					else if (fftPlan->nonSquareKernelType == NON_SQUARE_TRANS_SWAP)
+						fftPlan->action = new FFTGeneratedTransposeNonSquareAction(plHandle, fftPlan, *commQueueFFT, err);
+					else
+					{
+						size_t clLengths[] = { 1, 1, 0 };
+						clLengths[0] = fftPlan->length[0];
+						clLengths[1] = fftPlan->length[1];
+
+						//Transpose stage 1
+						OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths),
+							_T("CreateDefaultPlan transpose_nsq_stage1 plan failed"));
+
+						FFTPlan* trans1Plan = NULL;
+						lockRAII* trans1Lock = NULL;
+						OPENCL_V(fftRepo.getPlan(fftPlan->planTX, trans1Plan, trans1Lock), _T("fftRepo.getPlan failed"));
+
+						trans1Plan->placeness = CLFFT_INPLACE;
+						trans1Plan->precision = fftPlan->precision;
+						trans1Plan->tmpBufSize = 0;
+						trans1Plan->batchsize = fftPlan->batchsize;
+						trans1Plan->envelope = fftPlan->envelope;
+						trans1Plan->inputLayout = fftPlan->inputLayout;
+						trans1Plan->outputLayout = fftPlan->outputLayout;
+						trans1Plan->inStride[0] = fftPlan->inStride[0];
+						trans1Plan->outStride[0] = fftPlan->outStride[0];
+						trans1Plan->inStride[1] = fftPlan->inStride[1];
+						trans1Plan->outStride[1] = fftPlan->outStride[1];
+						trans1Plan->iDist = fftPlan->iDist;
+						trans1Plan->oDist = fftPlan->oDist;
+						trans1Plan->gen = Transpose_NONSQUARE;
+						trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE;
+						trans1Plan->transflag = true;
+
+						for (size_t index = 2; index < fftPlan->length.size(); index++)
+						{
+							trans1Plan->length.push_back(fftPlan->length[index]);
+							trans1Plan->inStride.push_back(fftPlan->inStride[index]);
+							trans1Plan->outStride.push_back(fftPlan->outStride[index]);
+						}
+
+
+						OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL),
+							_T("BakePlan transpose_nsq_stage1 plan failed"));
+
+
+						//Transpose stage 2
+						OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTY, fftPlan->context, CLFFT_2D, clLengths),
+							_T("CreateDefaultPlan transpose_nsq_stage1 plan failed"));
+
+						FFTPlan* trans2Plan = NULL;
+						lockRAII* trans2Lock = NULL;
+						OPENCL_V(fftRepo.getPlan(fftPlan->planTY, trans2Plan, trans2Lock), _T("fftRepo.getPlan failed"));
+
+						trans2Plan->placeness = CLFFT_INPLACE;
+						trans2Plan->precision = fftPlan->precision;
+						trans2Plan->tmpBufSize = 0;
+						trans2Plan->batchsize = fftPlan->batchsize;
+						trans2Plan->envelope = fftPlan->envelope;
+						trans2Plan->inputLayout = fftPlan->inputLayout;
+						trans2Plan->outputLayout = fftPlan->outputLayout;
+						trans2Plan->inStride[0] = fftPlan->inStride[0];
+						trans2Plan->outStride[0] = fftPlan->outStride[0];
+						trans2Plan->inStride[1] = fftPlan->inStride[1];
+						trans2Plan->outStride[1] = fftPlan->outStride[1];
+						trans2Plan->iDist = fftPlan->iDist;
+						trans2Plan->oDist = fftPlan->oDist;
+						trans2Plan->gen = Transpose_NONSQUARE;
+						trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
+						trans2Plan->transflag = true;
+
+						for (size_t index = 2; index < fftPlan->length.size(); index++)
+						{
+							trans2Plan->length.push_back(fftPlan->length[index]);
+							trans2Plan->inStride.push_back(fftPlan->inStride[index]);
+							trans2Plan->outStride.push_back(fftPlan->outStride[index]);
+						}
+
+
+						OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL),
+							_T("BakePlan transpose_nsq_stage2 plan failed"));
+					}
                 }
 				else
 					fftPlan->action = new FFTGeneratedTransposeGCNAction(plHandle, fftPlan, *commQueueFFT, err);
@@ -4169,7 +4237,8 @@ clfftStatus FFTPlan::GetMax1DLength (size_t *longest ) const
 	{
 	case Stockham:		return GetMax1DLengthStockham(longest);
     case Transpose_GCN:			*longest = 4096; return CLFFT_SUCCESS;
-    case Transpose_SQUARE:     *longest = 4096; return CLFFT_SUCCESS;
+    case Transpose_SQUARE:		*longest = 4096; return CLFFT_SUCCESS;
+	case Transpose_NONSQUARE:	*longest = 4096; return CLFFT_SUCCESS;
     case Copy:					*longest = 4096; return CLFFT_SUCCESS;
 	default:			assert(false); return CLFFT_NOTIMPLEMENTED;
 	}
diff --git a/src/library/plan.h b/src/library/plan.h
index 06919c5..843a2ba 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -88,10 +88,11 @@ enum BlockComputeType
 
 
 //NonSquareKernelType
-enum NonSquareKernelType
+enum NonSquareTransposeKernelType
 {
-    NON_SQUARE_TRANSPOSE,
-    NON_SQUARE_SWAP
+	NON_SQUARE_TRANS_PARENT,
+    NON_SQUARE_TRANS_TRANSPOSE,
+    NON_SQUARE_TRANS_SWAP
 };
 
 #define CLFFT_CB_SIZE 32
@@ -151,7 +152,8 @@ struct FFTKernelGenKeyParams {
 	BlockComputeType		 blockComputeType;
 	size_t					 blockSIMD;
 	size_t					 blockLDS;
-    NonSquareKernelType      nonSquareKernelType;
+    
+	NonSquareTransposeKernelType      nonSquareKernelType;
 
 	bool fft_hasPreCallback;
 	clfftCallbackParam fft_preCallback;
@@ -190,7 +192,7 @@ struct FFTKernelGenKeyParams {
 		blockComputeType = BCT_C2C;
 		blockSIMD = 0;
 		blockLDS = 0;
-        nonSquareKernelType = NON_SQUARE_TRANSPOSE;
+        nonSquareKernelType = NON_SQUARE_TRANS_PARENT;
 		fft_hasPreCallback = false;
 	}
 };
@@ -467,7 +469,7 @@ public:
     // The action
     FFTAction * action;
 
-    NonSquareKernelType nonSquareKernelType;
+    NonSquareTransposeKernelType nonSquareKernelType;
 
 	FFTPlan ()
 	:	baked (false)
@@ -512,7 +514,7 @@ public:
 	,	const_buffer( NULL )
 	,	gen(Stockham)
     ,   action(0)
-    ,   nonSquareKernelType(NON_SQUARE_TRANSPOSE)
+    ,   nonSquareKernelType(NON_SQUARE_TRANS_PARENT)
     ,   plHandle(0)
 	,   hasPreCallback(false)
 	{

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list