[clfft] 10/128: Precallback - C2C double precision updates

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:33 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit 5e9b96bdf92716d6224b81760d7d350c40384c63
Author: Pradeep <pradeep.rao at amd.com>
Date:   Thu Jul 30 12:26:19 2015 +0530

    Precallback - C2C double precision updates
---
 src/client-callback/callback-client.cpp | 243 +++++++++++++++++++++++---------
 src/library/generator.stockham.cpp      |   2 +-
 src/library/plan.cpp                    |   8 ++
 3 files changed, 185 insertions(+), 68 deletions(-)

diff --git a/src/client-callback/callback-client.cpp b/src/client-callback/callback-client.cpp
index 336d4ee..5fb6c9e 100644
--- a/src/client-callback/callback-client.cpp
+++ b/src/client-callback/callback-client.cpp
@@ -23,26 +23,43 @@ namespace po = boost::program_options;
 				return ret; \n \
 				}
 
+#define MULVAL_DP double2 mulval(__global void* in, uint offset, __global void* userdata)\n \
+				{ \n \
+				int scalar = *((__global int*)userdata + offset); \n \
+				double2 ret = *((__global double2*)in + offset) * scalar; \n \
+				return ret; \n \
+				}
+
 #define MULVAL_PLANAR float2 mulval(__global void* inRe, __global void* inIm, uint offset, __global void* userdata)\n \
 				{ \n \
 				__global USER_DATA *data = ((__global USER_DATA *)userdata + offset); \n \
-				int scalar = (int)data->scalar1 + (int)data->scalar2 + (int)data->scalar3; \n \
+				int scalar = (int)data->scalar1 + (int)data->scalar2; \n \
 				float2 ret; \n \
 				ret.x = *((__global float*)inRe + offset) * scalar; \n \
 				ret.y = *((__global float*)inIm + offset) * scalar; \n \
 				return ret; \n \
 				}
 
+#define MULVAL_PLANAR_DP double2 mulval(__global void* inRe, __global void* inIm, uint offset, __global void* userdata)\n \
+				{ \n \
+				__global USER_DATA *data = ((__global USER_DATA *)userdata + offset); \n \
+				int scalar = (int)data->scalar1 + (int)data->scalar2; \n \
+				double2 ret; \n \
+				ret.x = *((__global double*)inRe + offset) * scalar; \n \
+				ret.y = *((__global double*)inIm + offset) * scalar; \n \
+				return ret; \n \
+				}
+
 #define STRUCT_USERDATA typedef struct USER_DATA  \
 					   {  \
 						int scalar1;  \
 						int scalar2;  \
-						int scalar3;  \
 						} USER_DATA; 
 STRUCT_USERDATA
 
-template < typename T >
-bool compare(fftw_complex *refData, std::vector< std::complex< T > > data,
+//Compare reference and opencl output
+template < typename T1, typename T2>
+bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
              const int length, const float epsilon = 1e-6f)
 {
     float error = 0.0f;
@@ -98,8 +115,9 @@ bool compare(fftw_complex *refData, std::vector< std::complex< T > > data,
 	return true;
 }
 
-template < typename T >
-bool compare(fftw_complex *refData, std::valarray< T > real, std::valarray< T > imag,
+//Compare reference and opencl output
+template < typename T1, typename T2 >
+bool compare(T1 *refData, std::valarray< T2 > real, std::valarray< T2 > imag,
              const int length, const float epsilon = 1e-6f)
 {
     float error = 0.0f;
@@ -157,6 +175,98 @@ bool compare(fftw_complex *refData, std::valarray< T > real, std::valarray< T >
 	return true;
 }
 
+// Compute reference output using fftw for float type
+fftwf_complex* get_fftwf_output(size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
+								size_t fftBatchSize, size_t outfftBatchSize, size_t fftVectorSizePadded, clfftLayout in_layout,
+								size_t outfftVectorSizePadded, size_t fftVectorSize, clfftDim dim, clfftDirection dir)
+{
+	//In FFTW last dimension has the fastest changing index
+	int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
+
+	fftwf_plan refPlan;
+
+	fftwf_complex *refin = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftBatchSize);
+	fftwf_complex *refout = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*outfftBatchSize);
+
+	refPlan = fftwf_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size, 
+									refin, &fftwLengths[3 - dim], inStrides[0], fftVectorSizePadded, 
+									refout, &fftwLengths[3 - dim], outStrides[0], outfftVectorSizePadded, 
+									dir, FFTW_ESTIMATE);
+
+	int scalar;
+	for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+	{
+		switch (in_layout)
+		{
+		case CLFFT_COMPLEX_INTERLEAVED:
+			scalar = SCALAR + (i % fftVectorSize);
+			break;
+		case CLFFT_COMPLEX_PLANAR:
+			scalar = (SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1);
+			break;
+		default:
+			break;
+		}
+
+		refin[i][0] = 1 * scalar;
+		refin[i][1] = 0 * scalar;
+	}
+
+	fftwf_execute(refPlan);
+
+	fftw_free(refin);
+
+	fftwf_destroy_plan(refPlan);
+
+	return refout;
+}
+
+// Compute reference output using fftw for double type
+fftw_complex* get_fftw_output(size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
+								size_t fftBatchSize, size_t outfftBatchSize, size_t fftVectorSizePadded, clfftLayout in_layout,
+								size_t outfftVectorSizePadded, size_t fftVectorSize, clfftDim dim, clfftDirection dir)
+{
+	fftw_plan refPlan;
+
+	fftw_complex *refin = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*fftBatchSize);
+	fftw_complex *refout = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*outfftBatchSize);
+	
+	//In FFTW last dimension has the fastest changing index
+	int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
+
+	refPlan = fftw_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size, 
+									refin, &fftwLengths[3 - dim], inStrides[0], fftVectorSizePadded, 
+									refout, &fftwLengths[3 - dim], outStrides[0], outfftVectorSizePadded, 
+									dir, FFTW_ESTIMATE);
+							
+	int scalar;
+	for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+	{
+		switch (in_layout)
+		{
+		case CLFFT_COMPLEX_INTERLEAVED:
+			scalar = SCALAR + (i % fftVectorSize);
+			break;
+		case CLFFT_COMPLEX_PLANAR:
+			scalar = (SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1);
+			break;
+		default:
+			break;
+		}
+
+		refin[i][0] = 1 * scalar;
+		refin[i][1] = 0 * scalar;
+	}
+
+	fftw_execute(refPlan);
+
+	fftw_free(refin);
+
+	fftw_destroy_plan(refPlan);
+
+	return refout;
+}
+
 //	This is used with the program_options class so that the user can type an integer on the command line
 //	and we store into an enum varaible
 template<class _Elem, class _Traits>
@@ -268,9 +378,9 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 		return 1;
 	}
 
-	if (hasPrecallback && (sizeof(T) != sizeof(float)))
+	if (hasPrecallback && !(in_layout == CLFFT_COMPLEX_INTERLEAVED || in_layout == CLFFT_COMPLEX_PLANAR))
 	{
-		terr << _T("Pre-callback feature is currently supported only for Single Precision FFT " ) << std::endl;
+		terr << _T("Pre-callback feature is currently supported only for Complex-Complex FFT " ) << std::endl;
 		return 1;
 	}
 
@@ -432,7 +542,8 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 			{
 			case 1: //C2C 1D Interleaved 
 				{
-					char* precallbackstr = STRINGIFY(MULVAL);
+					char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL) : STRINGIFY(MULVAL_DP);
+
 					int *h_userdata = (int*)malloc(sizeof(int)*fftBatchSize);
 					for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
 					{
@@ -455,13 +566,12 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 			{
 			case 1: //C2C 1D PLANAR 
 				{
-					char* precallbackstr = STRINGIFY(MULVAL_PLANAR);
+					char* precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(MULVAL_PLANAR) : STRINGIFY(MULVAL_PLANAR_DP);
 					USER_DATA *h_userdata = (USER_DATA*)malloc(sizeof(USER_DATA) * fftBatchSize);
 					for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
 					{
 						h_userdata[i].scalar1 = SCALAR + (i % fftVectorSize);
 						h_userdata[i].scalar2 = SCALAR + (i % fftVectorSize) + 1;
-						h_userdata[i].scalar3 = SCALAR + (i % fftVectorSize) + 2;
 					}
 					userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA) * fftBatchSize, (void*)h_userdata, NULL);
 
@@ -593,39 +703,40 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 					{
 					case CLFFT_COMPLEX_INTERLEAVED:
 						{
-							fftw_complex *refin, *refout;
-							fftw_plan refPlan;
-							refin = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*fftBatchSize);
-							refout = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*outfftBatchSize);
-
-							//In FFTW last dimension has the fastest changing index
-							int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
-
-							refPlan = fftw_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size, refin, &fftwLengths[3 - dim], inStrides[0], fftVectorSizePadded, refout, &fftwLengths[3 - dim]
-																, outStrides[0], outfftVectorSizePadded, dir, FFTW_ESTIMATE);
-							
-							int scalar;
-							for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+							if (precision == CLFFT_SINGLE)
 							{
-								scalar = SCALAR + (i % fftVectorSize);
-								refin[i][0] = 1 * scalar;
-								refin[i][1] = 0 * scalar;
-							}
+								fftwf_complex *refout;
 
-							fftw_execute(refPlan);
+								refout = get_fftwf_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+															in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
 
-							if (!compare(refout, output, outfftBatchSize))
-								checkflag = true;
+								if (!compare(refout, output, outfftBatchSize))
+									checkflag = true;
 
-							fftw_destroy_plan(refPlan);
+								//for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+								//{
+								//	std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+								//}
 							
-							/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+								fftwf_free(refout);
+							}
+							else if (precision == CLFFT_DOUBLE)
 							{
-								std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
-							}*/
+								fftw_complex *refout;
 							
-							fftw_free(refin);
-							fftw_free(refout);		
+								refout = get_fftw_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+															in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
+
+								if (!compare(refout, output, outfftBatchSize))
+									checkflag = true;
+
+								/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+								{
+									std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+								}*/
+							
+								fftw_free(refout);
+							}
 						}
 						break;
 					}
@@ -658,10 +769,6 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 							break;
 						}
 					}
-					/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
-					{
-							std::cout << "i " << i << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
-					}*/
 				}
 			}
 			break;
@@ -697,38 +804,40 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 					{
 					case CLFFT_COMPLEX_PLANAR:
 						{
-							fftw_complex *refin, *refout;
-							fftw_plan refPlan;
-							refin = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*fftBatchSize);
-							refout = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*outfftBatchSize);
-
-							//In FFTW last dimension has the fastest changing index
-							int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
-
-							refPlan = fftw_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size, refin, &fftwLengths[3 - dim], inStrides[0], fftVectorSizePadded, refout, &fftwLengths[3 - dim]
-																, outStrides[0], outfftVectorSizePadded, dir, FFTW_ESTIMATE);
-							int scalar;
-							for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+							if (precision == CLFFT_SINGLE)
 							{
-								scalar = (SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1) + (SCALAR + (i % fftVectorSize) + 2);
-								refin[i][0] = 1 * scalar;
-								refin[i][1] = 0 * scalar;
-							}
-
-							fftw_execute(refPlan);
+								fftwf_complex *refout;
 
-							if (!compare(refout, real, imag, outfftBatchSize))
-								checkflag = true;
+								refout = get_fftwf_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+															in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
 
-							fftw_destroy_plan(refPlan);
+								if (!compare(refout, real, imag, outfftBatchSize))
+									checkflag = true;
 
-							/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+								/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+								{
+									std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
+								}*/
+							
+								fftwf_free(refout);
+							}
+							else if (precision == CLFFT_DOUBLE)
 							{
-								std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
-							}*/
+								fftw_complex *refout;
 
-							fftw_free(refin);
-							fftw_free(refout);		
+								refout = get_fftw_output(lengths, inStrides, outStrides, batch_size, fftBatchSize, outfftBatchSize, fftVectorSizePadded,
+															in_layout, outfftVectorSizePadded, fftVectorSize, dim, dir);
+
+								if (!compare(refout, real, imag, outfftBatchSize))
+									checkflag = true;
+
+								/*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+								{
+									std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
+								}*/
+							
+								fftw_free(refout);
+							}
 						}
 						break;
 					}
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index b4b1c29..854ffcc 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -1934,7 +1934,7 @@ namespace StockhamGenerator
 					//If precallback is set
 					if (fft_doPreCallback)
 					{
-						passStr += "\n\tfloat2 retPrecallback["; 
+						passStr += "\n\t"; passStr += regB2Type; passStr += " retPrecallback["; 
 						passStr += (numB4 > 0) ? "4" : (numB2 > 0) ? "2" : "1"; 
 						passStr += "];";
 					}
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 0e3b330..7ad7845 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -1886,6 +1886,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				rowPlan->inStride.push_back(fftPlan->inStride[1]);
 				rowPlan->iDist           = fftPlan->iDist;
 				
+				//Set callback data if set on top level plan
+				if (fftPlan->hasPreCallback)
+				{
+					rowPlan->hasPreCallback = true;
+					rowPlan->preCallback = fftPlan->preCallback;
+					rowPlan->precallUserData = fftPlan->precallUserData;
+				}
+
 				OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
 					_T( "BakePlan for planX failed" ) );
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list