[clfft] 04/128: Precallback-verify with fftw output, user data as an array
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:32 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 9228c06fa547665f7f29d9a96c6cc2807354c5ba
Author: Pradeep <pradeep.rao at amd.com>
Date: Thu Jul 23 20:54:59 2015 +0530
Precallback-verify with fftw output, user data as an array
---
src/client-callback/CMakeLists.txt | 4 +-
src/client-callback/callback-client.cpp | 287 +++++++++++++++++++++++++++-----
src/library/generator.stockham.cpp | 8 +-
3 files changed, 250 insertions(+), 49 deletions(-)
diff --git a/src/client-callback/CMakeLists.txt b/src/client-callback/CMakeLists.txt
index 268f2b6..81c7096 100644
--- a/src/client-callback/CMakeLists.txt
+++ b/src/client-callback/CMakeLists.txt
@@ -41,11 +41,11 @@ else( )
endif( )
# Include standard OpenCL headers
-include_directories( ${Boost_INCLUDE_DIRS} ${OPENCL_INCLUDE_DIRS} ../../../common ${PROJECT_BINARY_DIR}/include ../include )
+include_directories( ${Boost_INCLUDE_DIRS} ${OPENCL_INCLUDE_DIRS} ${FFTW_INCLUDE_DIRS} ../../../common ${PROJECT_BINARY_DIR}/include ../include )
add_executable( clFFT-callback ${Client.Files} )
-target_link_libraries( clFFT-callback clFFT ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} ${DL_LIB} )
+target_link_libraries( clFFT-callback clFFT ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} ${FFTW_LIBRARIES} ${DL_LIB} )
set_target_properties( clFFT-callback PROPERTIES VERSION ${CLFFT_VERSION} )
set_target_properties( clFFT-callback PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
diff --git a/src/client-callback/callback-client.cpp b/src/client-callback/callback-client.cpp
index b6f9e51..5b939a0 100644
--- a/src/client-callback/callback-client.cpp
+++ b/src/client-callback/callback-client.cpp
@@ -9,6 +9,8 @@
#include "../include/sharedLibrary.h"
#include "../include/unicode.compatibility.h"
+#include <fftw3.h>
+
namespace po = boost::program_options;
#define SCALAR 100
@@ -16,15 +18,15 @@ namespace po = boost::program_options;
#define MULVAL float2 mulval(__global void* in, int offset, __global void* userdata)\n \
{ \n \
- int scalar = *((__global int*)userdata); \n \
+ int scalar = *((__global int*)userdata + offset); \n \
float2 ret = *((__global float2*)in + offset) * scalar; \n \
return ret; \n \
}
#define MULVAL_PLANAR float2 mulval(__global void* inRe, __global void* inIm, int offset, __global void* userdata)\n \
{ \n \
- __global USER_DATA *data = (__global USER_DATA *)userdata; \n \
- int scalar = (int)data->scalar; \n \
+ __global USER_DATA *data = ((__global USER_DATA *)userdata + offset); \n \
+ int scalar = (int)data->scalar1 + (int)data->scalar2 + (int)data->scalar3; \n \
float2 ret; \n \
ret.x = *((__global float*)inRe + offset) * scalar; \n \
ret.y = *((__global float*)inIm + offset) * scalar; \n \
@@ -33,11 +35,110 @@ namespace po = boost::program_options;
#define STRUCT_USERDATA typedef struct USER_DATA \
{ \
- int scalar; \
- int datalength; \
+ int scalar1; \
+ int scalar2; \
+ int scalar3; \
} USER_DATA;
STRUCT_USERDATA
+template < typename T >
+bool compare(fftw_complex *refData, std::vector< std::complex< T > > data,
+ const int length, const float epsilon = 1e-6f)
+{
+ float error = 0.0f;
+ float ref = 0.0f;
+ float diff = 0.0f;
+
+ for(int i = 0; i < length; ++i)
+ {
+ diff = refData[i][0] - data[i].real();
+ error += diff * diff;
+ ref += refData[i][0] * refData[i][0];
+ }
+ float normRef =::sqrtf((float) ref);
+ if (::fabs((float) ref) < 1e-7f)
+ {
+ return false;
+ }
+ float normError = ::sqrtf((float) error);
+ error = normError / normRef;
+
+ if (error > epsilon)
+ return false;
+
+ //imag
+ error = 0.0f;
+ ref = 0.0f;
+ for(int i = 0; i < length; ++i)
+ {
+ diff = refData[i][1] - data[i].imag();
+ error += diff * diff;
+ ref += refData[i][1] * refData[i][1];
+ }
+ normRef =::sqrtf((float) ref);
+ if (::fabs((float) ref) < 1e-7f)
+ {
+ return false;
+ }
+ normError = ::sqrtf((float) error);
+ error = normError / normRef;
+
+ if (error > epsilon)
+ return false;
+
+ return true;
+}
+
+template < typename T >
+bool compare(fftw_complex *refData, std::valarray< T > real, std::valarray< T > imag,
+ const int length, const float epsilon = 1e-6f)
+{
+ float error = 0.0f;
+ float ref = 0.0f;
+ float diff = 0.0f;
+
+ //real compare
+ for(int i = 0; i < length; ++i)
+ {
+ diff = refData[i][0] - real[i];
+ error += diff * diff;
+ ref += refData[i][0] * refData[i][0];
+ }
+ float normRef =::sqrtf((float) ref);
+ if (::fabs((float) ref) < 1e-7f)
+ {
+ return false;
+ }
+ float normError = ::sqrtf((float) error);
+ error = normError / normRef;
+
+ if (error > epsilon)
+ return false;
+
+ //imag compare
+ error = 0.0f;
+ ref = 0.0f;
+
+ for(int i = 0; i < length; ++i)
+ {
+ diff = refData[i][1] - imag[i];
+ error += diff * diff;
+ ref += refData[i][1] * refData[i][1];
+ }
+ normRef =::sqrtf((float) ref);
+ if (::fabs((float) ref) < 1e-7f)
+ {
+ return false;
+ }
+ normError = ::sqrtf((float) error);
+ error = normError / normRef;
+
+ if (error > epsilon)
+ return false;
+
+ return true;
+}
+
// This is used with the program_options class so that the user can type an integer on the command line
// and we store into an enum varaible
template<class _Elem, class _Traits>
@@ -149,6 +250,12 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
return 1;
}
+ if (hasPrecallback && (dim != CLFFT_1D || fftVectorSize > 4096 || sizeof(T) != sizeof(float)))
+ {
+ terr << _T("Pre-callback feature is currently supported only for Single Precision 1D FFT and size upto 4096" ) << std::endl;
+ return 1;
+ }
+
// Fill the input buffers
switch( in_layout )
{
@@ -305,11 +412,15 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
{
switch (precallbakType)
{
- case 1: //C2C 1D Interleaved without LDS
+ case 1: //C2C 1D Interleaved
{
char* precallbackstr = STRINGIFY(MULVAL);
- int h_userdata[1] = { SCALAR };
- userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), (void*)h_userdata, NULL);
+ int *h_userdata = (int*)malloc(sizeof(int)*fftBatchSize);
+ for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+ {
+ h_userdata[ i ] = SCALAR + (i % fftVectorSize);
+ }
+ userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * fftBatchSize, (void*)h_userdata, NULL);
//Register the callback
OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
@@ -324,12 +435,17 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
{
switch (precallbakType)
{
- case 1: //C2C 1D PLANAR without LDS
+ case 1: //C2C 1D PLANAR
{
char* precallbackstr = STRINGIFY(MULVAL_PLANAR);
- USER_DATA h_userdata[1];
- h_userdata[0].scalar = SCALAR;
- userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA), (void*)h_userdata, NULL);
+ USER_DATA *h_userdata = (USER_DATA*)malloc(sizeof(USER_DATA) * fftBatchSize);
+ for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+ {
+ h_userdata[i].scalar1 = SCALAR + (i % fftVectorSize);
+ h_userdata[i].scalar2 = SCALAR + (i % fftVectorSize) + 1;
+ h_userdata[i].scalar3 = SCALAR + (i % fftVectorSize) + 2;
+ }
+ userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA) * fftBatchSize, (void*)h_userdata, NULL);
//Register the callback
OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, STRINGIFY(STRUCT_USERDATA), 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
@@ -453,42 +569,77 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
}
//check output data
- for( cl_uint i = 0; i < outfftBatchSize; ++i )
+ if (hasPrecallback && dim == CLFFT_1D)
{
- if (0 == (i % outfftVectorSizePadded))
+ switch(in_layout)
{
- if (hasPrecallback && dim == CLFFT_1D)
+ case CLFFT_COMPLEX_INTERLEAVED:
{
- if (output[i].real() != outfftVectorSize * SCALAR)
+ fftw_complex *refin, *refout;
+ fftw_plan refPlan;
+ refin = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*fftBatchSize);
+ refout = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*outfftBatchSize);
+
+ refPlan = fftw_plan_many_dft(1, (const int*)lengths, batch_size, refin, 0, inStrides[0], fftVectorSizePadded, refout, 0, outStrides[0], outfftVectorSizePadded, dir, FFTW_ESTIMATE);
+
+ int scalar;
+ for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+ {
+ scalar = SCALAR + (i % fftVectorSize);
+ refin[i][0] = 1 * scalar;
+ refin[i][1] = 0 * scalar;
+ }
+
+ fftw_execute(refPlan);
+
+ if (!compare(refout, output, outfftBatchSize))
+ checkflag = true;
+
+ fftw_destroy_plan(refPlan);
+
+ /*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+ {
+ std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+ }*/
+
+ fftw_free(refin);
+ fftw_free(refout);
+ }
+ break;
+ }
+ }
+ else
+ {
+ for( cl_uint i = 0; i < outfftBatchSize; ++i )
+ {
+ if (0 == (i % outfftVectorSizePadded))
+ {
+ if (output[i].real() != outfftVectorSize)
{
checkflag = true;
break;
}
+
}
else
{
- if (output[i].real() != outfftVectorSize)
+ if (output[ i ].real() != 0)
{
checkflag = true;
break;
}
}
- }
- else
- {
- if (output[ i ].real() != 0)
+
+ if (output[ i ].imag() != 0)
{
checkflag = true;
break;
}
}
-
- if (output[ i ].imag() != 0)
+ /*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
{
- checkflag = true;
- break;
- }
- //std::cout << i << " real = " << output[i].real() << " img = " << output[ i ].imag() << std::endl;
+ std::cout << "i " << i << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+ }*/
}
}
break;
@@ -518,13 +669,52 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
}
// Check output data
- for( cl_uint i = 0; i < outfftBatchSize; ++i )
+ if (hasPrecallback && dim == CLFFT_1D)
{
- if (0 == (i % outfftVectorSizePadded))
+ switch(in_layout)
{
- if (hasPrecallback && dim == CLFFT_1D)
+ case CLFFT_COMPLEX_PLANAR:
{
- if (real[i] != outfftVectorSize * SCALAR)
+ fftw_complex *refin, *refout;
+ fftw_plan refPlan;
+ refin = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*fftBatchSize);
+ refout = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*outfftBatchSize);
+
+ refPlan = fftw_plan_many_dft(1, (const int*)lengths, batch_size, refin, 0, inStrides[0], fftVectorSizePadded, refout, 0, outStrides[0], outfftVectorSizePadded, dir, FFTW_ESTIMATE);
+
+ int scalar;
+ for( cl_uint i = 0; i < fftBatchSize; i = i + inStrides[0])
+ {
+ scalar = (SCALAR + (i % fftVectorSize)) + (SCALAR + (i % fftVectorSize) + 1) + (SCALAR + (i % fftVectorSize) + 2);
+ refin[i][0] = 1 * scalar;
+ refin[i][1] = 0 * scalar;
+ }
+
+ fftw_execute(refPlan);
+
+ if (!compare(refout, real, imag, outfftBatchSize))
+ checkflag = true;
+
+ fftw_destroy_plan(refPlan);
+
+ /*for( cl_uint i = 0; i < outfftBatchSize; i = i + outStrides[0])
+ {
+ std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << real[i] << " climag " << imag[i] << std::endl;
+ }*/
+
+ fftw_free(refin);
+ fftw_free(refout);
+ }
+ break;
+ }
+ }
+ else
+ {
+ for( cl_uint i = 0; i < outfftBatchSize; ++i )
+ {
+ if (0 == (i % outfftVectorSizePadded))
+ {
+ if (real[i] != outfftVectorSize)
{
checkflag = true;
break;
@@ -532,28 +722,19 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
}
else
{
- if (real[i] != outfftVectorSize)
+ if (real[i] != 0)
{
checkflag = true;
break;
}
}
- }
- else
- {
- if (real[i] != 0)
+
+ if (imag[i] != 0)
{
checkflag = true;
break;
}
}
-
- if (imag[i] != 0)
- {
- checkflag = true;
- break;
- }
- //std::cout << i << " real = " << real[i] << " img = " << imag[ i ] << std::endl;
}
}
break;
@@ -677,6 +858,26 @@ int main(int argc, char **argv)
deviceType = CL_DEVICE_TYPE_ALL;
}
+ if( vm.count( "outPlace" ) )
+ {
+ place = CLFFT_OUTOFPLACE;
+ }
+
+ if( vm.count( "double" ) )
+ {
+ precision = CLFFT_DOUBLE;
+ }
+
+ if( vm.count( "inv" ) )
+ {
+ dir = CLFFT_BACKWARD;
+ }
+
+ if( profile_count > 1 )
+ {
+ command_queue_flags |= CL_QUEUE_PROFILING_ENABLE;
+ }
+
if( vm.count( "dumpKernels" ) )
{
setupData->debugFlags |= CLFFT_DUMP_PROGRAMS;
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 5fd0cd9..610ef93 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -2394,7 +2394,7 @@ namespace StockhamGenerator
//Pass precallback information to Pass object if its the first pass.
//This will be used in single kernel transforms
- if (!r2c2r && i == 0 && params.fft_hasPreCallback)
+ if (!r2c2r && i == 0 && !params.blockCompute && params.fft_hasPreCallback)
{
passes[0].SetPrecallback(params.fft_hasPreCallback, params.fft_preCallback);
}
@@ -3289,8 +3289,8 @@ namespace StockhamGenerator
}
else
{
- if(inInterleaved) inBuf = "lwbIn, ";
- else inBuf = "lwbInRe, lwbInIm, ";
+ if(inInterleaved) inBuf = "gbIn, ";
+ else inBuf = "gbInRe, gbInIm, ";
if(outInterleaved) outBuf = "lwbOut";
else outBuf = "lwbOutRe, lwbOutIm";
}
@@ -3410,7 +3410,7 @@ namespace StockhamGenerator
str += ldsArgs; str += IterRegs("&");
//if precalback set, append additional arguments
- if (!r2c2r && params.fft_hasPreCallback)
+ if (!r2c2r && !blockCompute && params.fft_hasPreCallback)
{
str += ", userdata";
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits mailing list