[clfft] 09/107: merging internal fork with performance optimizations for large sizes and other fixes
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Jul 30 18:06:27 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 79dc76bdf2c9611981973c9a8857009f2161af3c
Author: bragadeesh <bragadeesh.natarajan at amd>
Date: Sun Feb 22 19:45:06 2015 -0600
merging internal fork with performance optimizations for large sizes and other fixes
---
src/client/client.cpp | 15 +
src/client/client.h | 41 +
src/client/openCL.misc.cpp | 926 +++++-----
src/library/CMakeLists.txt | 6 +-
src/library/accessors.cpp | 63 +-
src/library/generator.copy.cpp | 338 ++--
src/library/generator.h | 9 +-
src/library/generator.stockham.cpp | 840 +++++----
src/library/generator.stockham.h | 278 ++-
src/library/generator.transpose.gcn.cpp | 660 +++++++
...rator.transpose.h => generator.transpose.gcn.h} | 0
....transpose.cpp => generator.transpose.vliw.cpp} | 27 +-
...ator.transpose.h => generator.transpose.vliw.h} | 0
src/library/plan.cpp | 1844 ++++++++++----------
src/library/plan.h | 172 +-
src/library/private.h | 22 +-
src/library/repo.cpp | 36 +-
src/library/repo.h | 16 +-
src/library/transform.cpp | 478 +++--
src/tests/accuracy_test_pow2.cpp | 85 +
20 files changed, 3365 insertions(+), 2491 deletions(-)
diff --git a/src/client/client.cpp b/src/client/client.cpp
index be9698e..c495854 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -515,6 +515,9 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
//
cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &output_cl_mem_buffers[ 0 ];
+ Timer tr;
+ tr.Start();
+
for( cl_uint i = 0; i < profile_count; ++i )
{
if( timer ) timer->Start( clFFTID );
@@ -526,6 +529,18 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
if( timer ) timer->Stop( clFFTID );
}
OPENCL_V_THROW( clFinish( queue ), "clFinish failed" );
+ if(clMedBuffer) clReleaseMemObject(clMedBuffer);
+
+ double wtime = tr.Sample()/((double)profile_count);
+ size_t totalLen = 1;
+ for(int i=0; i<dim; i++) totalLen *= lengths[i];
+ double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+ if(profile_count > 1)
+ {
+ tout << "\nExecution wall time: " << 1000.0*wtime << " ms" << std::endl;
+ tout << "Execution gflops: " << ((double)batch_size * opsconst)/(1000000000.0*wtime) << std::endl;
+ }
if( timer && (command_queue_flags & CL_QUEUE_PROFILING_ENABLE) )
{
diff --git a/src/client/client.h b/src/client/client.h
index ad22f36..2c70aba 100644
--- a/src/client/client.h
+++ b/src/client/client.h
@@ -23,4 +23,45 @@
// #define BOOST_PROGRAM_OPTIONS_DYN_LINK
#include <boost/program_options.hpp>
+#ifdef WIN32
+
+struct Timer
+{
+ LARGE_INTEGER start, stop, freq;
+
+public:
+ Timer() { QueryPerformanceFrequency( &freq ); }
+
+ void Start() { QueryPerformanceCounter(&start); }
+ double Sample()
+ {
+ QueryPerformanceCounter ( &stop );
+ double time = (double)(stop.QuadPart-start.QuadPart) / (double)(freq.QuadPart);
+ return time;
+ }
+};
+
+#else
+
+#include <time.h>
+#include <math.h>
+
+struct Timer
+{
+ struct timespec start, end;
+
+public:
+ Timer() { }
+
+ void Start() { clock_gettime(CLOCK_MONOTONIC, &start); }
+ double Sample()
+ {
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ double time = 1000000000L * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec;
+ return time * 1E-9;
+ }
+};
+
+#endif
+
#endif
diff --git a/src/client/openCL.misc.cpp b/src/client/openCL.misc.cpp
index 6bbdec3..71e4650 100644
--- a/src/client/openCL.misc.cpp
+++ b/src/client/openCL.misc.cpp
@@ -29,507 +29,509 @@
void prettyPrintPlatformInfo( const cl_platform_id& pId )
{
- size_t platformProfileSize = 0;
- OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, 0, NULL, &platformProfileSize ),
- "Getting CL_PLATFORM_PROFILE Platform Info string size ( ::clGetPlatformInfo() )" );
-
- std::vector< char > szPlatformProfile( platformProfileSize );
- OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, platformProfileSize, &szPlatformProfile[ 0 ], NULL),
- "Getting CL_PLATFORM_PROFILE Platform Info string ( ::clGetPlatformInfo() )" );
-
- size_t platformVersionSize = 0;
- OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, 0, NULL, &platformVersionSize ),
- "Getting CL_PLATFORM_VERSION Platform Info string size ( ::clGetPlatformInfo() )" );
-
- std::vector< char > szPlatformVersion( platformVersionSize );
- OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, platformVersionSize, &szPlatformVersion[ 0 ], NULL),
- "Getting CL_PLATFORM_VERSION Platform Info string ( ::clGetPlatformInfo() )" );
-
- size_t platformNameSize = 0;
- OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, 0, NULL, &platformNameSize ),
- "Getting CL_PLATFORM_NAME Platform Info string size ( ::clGetPlatformInfo() )" );
-
- std::vector< char > szPlatformName( platformNameSize );
- OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, platformNameSize, &szPlatformName[ 0 ], NULL),
- "Getting CL_PLATFORM_NAME Platform Info string ( ::clGetPlatformInfo() )" );
-
- size_t vendorStringSize = 0;
- OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, 0, NULL, &vendorStringSize ),
- "Getting CL_PLATFORM_VENDOR Platform Info string size ( ::clGetPlatformInfo() )" );
-
- std::vector< char > szPlatformVendor( vendorStringSize );
- OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, vendorStringSize, &szPlatformVendor[ 0 ], NULL),
- "Getting CL_PLATFORM_VENDOR Platform Info string ( ::clGetPlatformInfo() )" );
-
- size_t platformExtensionsSize = 0;
- OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, 0, NULL, &platformExtensionsSize ),
- "Getting CL_PLATFORM_EXTENSIONS Platform Info string size ( ::clGetPlatformInfo() )" );
-
- std::vector< char > szPlatformExtensions( platformExtensionsSize );
- OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, platformExtensionsSize, &szPlatformExtensions[ 0 ], NULL),
- "Getting CL_PLATFORM_EXTENSIONS Platform Info string ( ::clGetPlatformInfo() )" );
-
- const int indent = countOf( " CL_PLATFORM_EXTENSIONS: " );
- std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_PROFILE: " << &szPlatformProfile[ 0 ] << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_VERSION: " << &szPlatformVersion[ 0 ] << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_NAME: " << &szPlatformName[ 0 ] << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_VENDOR: " << &szPlatformVendor[ 0 ] << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_EXTENSIONS: " << &szPlatformExtensions[ 0 ] << std::endl;
- std::cout << std::right << std::endl;
+ size_t platformProfileSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, 0, NULL, &platformProfileSize ),
+ "Getting CL_PLATFORM_PROFILE Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformProfile( platformProfileSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, platformProfileSize, &szPlatformProfile[ 0 ], NULL),
+ "Getting CL_PLATFORM_PROFILE Platform Info string ( ::clGetPlatformInfo() )" );
+
+ size_t platformVersionSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, 0, NULL, &platformVersionSize ),
+ "Getting CL_PLATFORM_VERSION Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformVersion( platformVersionSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, platformVersionSize, &szPlatformVersion[ 0 ], NULL),
+ "Getting CL_PLATFORM_VERSION Platform Info string ( ::clGetPlatformInfo() )" );
+
+ size_t platformNameSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, 0, NULL, &platformNameSize ),
+ "Getting CL_PLATFORM_NAME Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformName( platformNameSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, platformNameSize, &szPlatformName[ 0 ], NULL),
+ "Getting CL_PLATFORM_NAME Platform Info string ( ::clGetPlatformInfo() )" );
+
+ size_t vendorStringSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, 0, NULL, &vendorStringSize ),
+ "Getting CL_PLATFORM_VENDOR Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformVendor( vendorStringSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, vendorStringSize, &szPlatformVendor[ 0 ], NULL),
+ "Getting CL_PLATFORM_VENDOR Platform Info string ( ::clGetPlatformInfo() )" );
+
+ size_t platformExtensionsSize = 0;
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, 0, NULL, &platformExtensionsSize ),
+ "Getting CL_PLATFORM_EXTENSIONS Platform Info string size ( ::clGetPlatformInfo() )" );
+
+ std::vector< char > szPlatformExtensions( platformExtensionsSize );
+ OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, platformExtensionsSize, &szPlatformExtensions[ 0 ], NULL),
+ "Getting CL_PLATFORM_EXTENSIONS Platform Info string ( ::clGetPlatformInfo() )" );
+
+ const int indent = countOf( " CL_PLATFORM_EXTENSIONS: " );
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_PROFILE: " << &szPlatformProfile[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_VERSION: " << &szPlatformVersion[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_NAME: " << &szPlatformName[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_VENDOR: " << &szPlatformVendor[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_PLATFORM_EXTENSIONS: " << &szPlatformExtensions[ 0 ] << std::endl;
+ std::cout << std::right << std::endl;
}
void prettyPrintDeviceInfo( const cl_device_id& dId )
{
- size_t deviceNameSize = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, 0, NULL, &deviceNameSize ),
- "Getting CL_DEVICE_NAME Platform Info string size ( ::clGetDeviceInfo() )" );
-
- std::vector< char > szDeviceName( deviceNameSize );
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, deviceNameSize, &szDeviceName[ 0 ], NULL ),
- "Getting CL_DEVICE_NAME Platform Info string ( ::clGetDeviceInfo() )" );
-
- size_t deviceVersionSize = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
- "Getting CL_DEVICE_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
-
- std::vector< char > szDeviceVersion( deviceVersionSize );
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
- "Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
-
- size_t driverVersionSize = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, 0, NULL, &driverVersionSize ),
- "Getting CL_DRIVER_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
-
- std::vector< char > szDriverVersion( driverVersionSize );
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, driverVersionSize, &szDriverVersion[ 0 ], NULL ),
- "Getting CL_DRIVER_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
-
- size_t openCLVersionSize = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &openCLVersionSize ),
- "Getting CL_DEVICE_OPENCL_C_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
-
- std::vector< char > szOpenCLVersion( openCLVersionSize );
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, openCLVersionSize, &szOpenCLVersion[ 0 ], NULL ),
- "Getting CL_DEVICE_OPENCL_C_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
-
- cl_device_type devType = CL_DEVICE_TYPE_DEFAULT;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_TYPE, sizeof( cl_device_type ), &devType, NULL ),
- "Getting CL_DEVICE_TYPE device info ( ::clGetDeviceInfo() )" );
-
- cl_uint devAddrBits = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_ADDRESS_BITS, sizeof( cl_uint ), &devAddrBits, NULL ),
- "Getting CL_DEVICE_ADDRESS_BITS device info ( ::clGetDeviceInfo() )" );
-
- cl_uint maxClockFreq = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof( cl_uint ), &maxClockFreq, NULL ),
- "Getting CL_DEVICE_MAX_CLOCK_FREQUENCY device info ( ::clGetDeviceInfo() )" );
-
- cl_bool devAvailable = CL_FALSE;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_AVAILABLE, sizeof( cl_bool ), &devAvailable, NULL ),
- "Getting CL_DEVICE_AVAILABLE device info ( ::clGetDeviceInfo() )" );
-
- cl_bool devCompAvailable = CL_FALSE;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_COMPILER_AVAILABLE, sizeof( cl_bool ), &devCompAvailable, NULL ),
- "Getting CL_DEVICE_COMPILER_AVAILABLE device info ( ::clGetDeviceInfo() )" );
-
- size_t devMaxWorkGroup = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( size_t ), &devMaxWorkGroup, NULL ),
- "Getting CL_DEVICE_MAX_WORK_GROUP_SIZE device info ( ::clGetDeviceInfo() )" );
-
- cl_uint devMaxWorkItemDim = CL_FALSE;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( cl_uint ), &devMaxWorkItemDim, NULL ),
- "Getting CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS device info ( ::clGetDeviceInfo() )" );
-
- std::vector< size_t > devMaxWorkItemSizes( devMaxWorkItemDim );
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof( size_t )*devMaxWorkItemSizes.size( ), &devMaxWorkItemSizes[0], NULL),
- "Getting CL_DEVICE_MAX_WORK_ITEM_SIZES device info ( ::clGetDeviceInfo() )" );
-
- cl_bool deviceHostUnified = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof( cl_bool ), &deviceHostUnified, NULL ),
- "Getting CL_DEVICE_HOST_UNIFIED_MEMORY Platform Info string ( ::clGetDeviceInfo() )" );
-
- cl_ulong devMaxConstantBuffer = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( cl_ulong ), &devMaxConstantBuffer, NULL ),
- "Getting CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE device info ( ::clGetDeviceInfo() )" );
-
- cl_ulong devLocalMemSize = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( cl_ulong ), &devLocalMemSize, NULL ),
- "Getting CL_DEVICE_LOCAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
-
- cl_ulong deviceGlobalMemSize = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( cl_ulong ), &deviceGlobalMemSize, NULL ),
- "Getting CL_DEVICE_GLOBAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
-
- cl_ulong deviceMaxMemAllocSize = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( cl_ulong ), &deviceMaxMemAllocSize, NULL ),
- "Getting CL_DEVICE_MAX_MEM_ALLOC_SIZE device info ( ::clGetDeviceInfo() )" );
-
- size_t deviceExtSize = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
- "Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
-
- std::vector< char > szDeviceExt( deviceExtSize );
- OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
- "Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
-
- const int indent = countOf( " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " );
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_NAME: " << &szDeviceName[ 0 ] << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_VERSION: " << &szDeviceVersion[ 0 ] << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DRIVER_VERSION: " << &szDriverVersion[ 0 ] << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_TYPE: "
- << (CL_DEVICE_TYPE_DEFAULT & devType ? "default" : "")
- << (CL_DEVICE_TYPE_CPU & devType ? "CPU" : "")
- << (CL_DEVICE_TYPE_GPU & devType ? "GPU" : "")
- << (CL_DEVICE_TYPE_ACCELERATOR & devType ? "Accelerator" : "")
- << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_CLOCK_FREQUENCY: " << maxClockFreq << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_ADDRESS_BITS: " << devAddrBits << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_AVAILABLE: " << ( devAvailable ? "TRUE": "FALSE") << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_COMPILER_AVAILABLE: " << ( devCompAvailable ? "TRUE": "FALSE") << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_OPENCL_C_VERSION: " << &szOpenCLVersion[ 0 ] << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_WORK_GROUP_SIZE: " << devMaxWorkGroup << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " << devMaxWorkItemDim << std::endl;
- for( cl_uint wis = 0; wis < devMaxWorkItemSizes.size( ); ++wis )
- {
- std::stringstream dimString;
- dimString << "Dimension[ " << wis << " ] ";
- std::cout << std::right << std::setw( indent ) << dimString.str( ) << devMaxWorkItemSizes[wis] << std::endl;
- }
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_HOST_UNIFIED_MEMORY: " << ( deviceHostUnified ? "TRUE": "FALSE") << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: " << devMaxConstantBuffer;
- std::cout << " ( " << devMaxConstantBuffer / 1024 << " KB )" << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_LOCAL_MEM_SIZE: " << devLocalMemSize;
- std::cout << " ( " << devLocalMemSize / 1024 << " KB )" << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_GLOBAL_MEM_SIZE: " << deviceGlobalMemSize;
- std::cout << " ( " << deviceGlobalMemSize / 1048576 << " MB )" << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_MEM_ALLOC_SIZE: " << deviceMaxMemAllocSize;
- std::cout << " ( " << deviceMaxMemAllocSize / 1048576 << " MB )" << std::endl;
- std::cout << std::left << std::setw( indent ) << " CL_DEVICE_EXTENSIONS: " << &szDeviceExt[ 0 ] << std::endl;
-
- std::cout << std::right << std::endl;
+ size_t deviceNameSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, 0, NULL, &deviceNameSize ),
+ "Getting CL_DEVICE_NAME Platform Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szDeviceName( deviceNameSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, deviceNameSize, &szDeviceName[ 0 ], NULL ),
+ "Getting CL_DEVICE_NAME Platform Info string ( ::clGetDeviceInfo() )" );
+
+ size_t deviceVersionSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
+ "Getting CL_DEVICE_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szDeviceVersion( deviceVersionSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
+ "Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+ size_t driverVersionSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, 0, NULL, &driverVersionSize ),
+ "Getting CL_DRIVER_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szDriverVersion( driverVersionSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, driverVersionSize, &szDriverVersion[ 0 ], NULL ),
+ "Getting CL_DRIVER_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+ size_t openCLVersionSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &openCLVersionSize ),
+ "Getting CL_DEVICE_OPENCL_C_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szOpenCLVersion( openCLVersionSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, openCLVersionSize, &szOpenCLVersion[ 0 ], NULL ),
+ "Getting CL_DEVICE_OPENCL_C_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+ cl_device_type devType = CL_DEVICE_TYPE_DEFAULT;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_TYPE, sizeof( cl_device_type ), &devType, NULL ),
+ "Getting CL_DEVICE_TYPE device info ( ::clGetDeviceInfo() )" );
+
+ cl_uint devAddrBits = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_ADDRESS_BITS, sizeof( cl_uint ), &devAddrBits, NULL ),
+ "Getting CL_DEVICE_ADDRESS_BITS device info ( ::clGetDeviceInfo() )" );
+
+ cl_uint maxClockFreq = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof( cl_uint ), &maxClockFreq, NULL ),
+ "Getting CL_DEVICE_MAX_CLOCK_FREQUENCY device info ( ::clGetDeviceInfo() )" );
+
+ cl_bool devAvailable = CL_FALSE;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_AVAILABLE, sizeof( cl_bool ), &devAvailable, NULL ),
+ "Getting CL_DEVICE_AVAILABLE device info ( ::clGetDeviceInfo() )" );
+
+ cl_bool devCompAvailable = CL_FALSE;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_COMPILER_AVAILABLE, sizeof( cl_bool ), &devCompAvailable, NULL ),
+ "Getting CL_DEVICE_COMPILER_AVAILABLE device info ( ::clGetDeviceInfo() )" );
+
+ size_t devMaxWorkGroup = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( size_t ), &devMaxWorkGroup, NULL ),
+ "Getting CL_DEVICE_MAX_WORK_GROUP_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ cl_uint devMaxWorkItemDim = CL_FALSE;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( cl_uint ), &devMaxWorkItemDim, NULL ),
+ "Getting CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS device info ( ::clGetDeviceInfo() )" );
+
+ std::vector< size_t > devMaxWorkItemSizes( devMaxWorkItemDim );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof( size_t )*devMaxWorkItemSizes.size( ), &devMaxWorkItemSizes[0], NULL),
+ "Getting CL_DEVICE_MAX_WORK_ITEM_SIZES device info ( ::clGetDeviceInfo() )" );
+
+ cl_bool deviceHostUnified = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof( cl_bool ), &deviceHostUnified, NULL ),
+ "Getting CL_DEVICE_HOST_UNIFIED_MEMORY Platform Info string ( ::clGetDeviceInfo() )" );
+
+ cl_ulong devMaxConstantBuffer = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( cl_ulong ), &devMaxConstantBuffer, NULL ),
+ "Getting CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ cl_ulong devLocalMemSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( cl_ulong ), &devLocalMemSize, NULL ),
+ "Getting CL_DEVICE_LOCAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ cl_ulong deviceGlobalMemSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( cl_ulong ), &deviceGlobalMemSize, NULL ),
+ "Getting CL_DEVICE_GLOBAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ cl_ulong deviceMaxMemAllocSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( cl_ulong ), &deviceMaxMemAllocSize, NULL ),
+ "Getting CL_DEVICE_MAX_MEM_ALLOC_SIZE device info ( ::clGetDeviceInfo() )" );
+
+ size_t deviceExtSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
+ "Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szDeviceExt( deviceExtSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
+ "Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
+
+ const int indent = countOf( " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " );
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_NAME: " << &szDeviceName[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_VERSION: " << &szDeviceVersion[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DRIVER_VERSION: " << &szDriverVersion[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_TYPE: "
+ << (CL_DEVICE_TYPE_DEFAULT & devType ? "default" : "")
+ << (CL_DEVICE_TYPE_CPU & devType ? "CPU" : "")
+ << (CL_DEVICE_TYPE_GPU & devType ? "GPU" : "")
+ << (CL_DEVICE_TYPE_ACCELERATOR & devType ? "Accelerator" : "")
+ << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_CLOCK_FREQUENCY: " << maxClockFreq << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_ADDRESS_BITS: " << devAddrBits << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_AVAILABLE: " << ( devAvailable ? "TRUE": "FALSE") << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_COMPILER_AVAILABLE: " << ( devCompAvailable ? "TRUE": "FALSE") << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_OPENCL_C_VERSION: " << &szOpenCLVersion[ 0 ] << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_WORK_GROUP_SIZE: " << devMaxWorkGroup << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " << devMaxWorkItemDim << std::endl;
+ for( cl_uint wis = 0; wis < devMaxWorkItemSizes.size( ); ++wis )
+ {
+ std::stringstream dimString;
+ dimString << "Dimension[ " << wis << " ] ";
+ std::cout << std::right << std::setw( indent ) << dimString.str( ) << devMaxWorkItemSizes[wis] << std::endl;
+ }
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_HOST_UNIFIED_MEMORY: " << ( deviceHostUnified ? "TRUE": "FALSE") << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: " << devMaxConstantBuffer;
+ std::cout << " ( " << devMaxConstantBuffer / 1024 << " KB )" << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_LOCAL_MEM_SIZE: " << devLocalMemSize;
+ std::cout << " ( " << devLocalMemSize / 1024 << " KB )" << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_GLOBAL_MEM_SIZE: " << deviceGlobalMemSize;
+ std::cout << " ( " << deviceGlobalMemSize / 1048576 << " MB )" << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_MAX_MEM_ALLOC_SIZE: " << deviceMaxMemAllocSize;
+ std::cout << " ( " << deviceMaxMemAllocSize / 1048576 << " MB )" << std::endl;
+ std::cout << std::left << std::setw( indent ) << " CL_DEVICE_EXTENSIONS: " << &szDeviceExt[ 0 ] << std::endl;
+
+ std::cout << std::right << std::endl;
}
// Verify a failed condition; return true on fail
inline cl_bool OPENCL_V_FAIL( cl_int res )
{
- if( res == CL_SUCCESS )
- return CL_FALSE;
- else
- return CL_TRUE;
+ if( res == CL_SUCCESS )
+ return CL_FALSE;
+ else
+ return CL_TRUE;
}
std::string prettyPrintclFFTStatus( const cl_int& status )
{
- switch( status )
- {
- case CLFFT_INVALID_GLOBAL_WORK_SIZE:
- return "CLFFT_INVALID_GLOBAL_WORK_SIZE";
- case CLFFT_INVALID_MIP_LEVEL:
- return "CLFFT_INVALID_MIP_LEVEL";
- case CLFFT_INVALID_BUFFER_SIZE:
- return "CLFFT_INVALID_BUFFER_SIZE";
- case CLFFT_INVALID_GL_OBJECT:
- return "CLFFT_INVALID_GL_OBJECT";
- case CLFFT_INVALID_OPERATION:
- return "CLFFT_INVALID_OPERATION";
- case CLFFT_INVALID_EVENT:
- return "CLFFT_INVALID_EVENT";
- case CLFFT_INVALID_EVENT_WAIT_LIST:
- return "CLFFT_INVALID_EVENT_WAIT_LIST";
- case CLFFT_INVALID_GLOBAL_OFFSET:
- return "CLFFT_INVALID_GLOBAL_OFFSET";
- case CLFFT_INVALID_WORK_ITEM_SIZE:
- return "CLFFT_INVALID_WORK_ITEM_SIZE";
- case CLFFT_INVALID_WORK_GROUP_SIZE:
- return "CLFFT_INVALID_WORK_GROUP_SIZE";
- case CLFFT_INVALID_WORK_DIMENSION:
- return "CLFFT_INVALID_WORK_DIMENSION";
- case CLFFT_INVALID_KERNEL_ARGS:
- return "CLFFT_INVALID_KERNEL_ARGS";
- case CLFFT_INVALID_ARG_SIZE:
- return "CLFFT_INVALID_ARG_SIZE";
- case CLFFT_INVALID_ARG_VALUE:
- return "CLFFT_INVALID_ARG_VALUE";
- case CLFFT_INVALID_ARG_INDEX:
- return "CLFFT_INVALID_ARG_INDEX";
- case CLFFT_INVALID_KERNEL:
- return "CLFFT_INVALID_KERNEL";
- case CLFFT_INVALID_KERNEL_DEFINITION:
- return "CLFFT_INVALID_KERNEL_DEFINITION";
- case CLFFT_INVALID_KERNEL_NAME:
- return "CLFFT_INVALID_KERNEL_NAME";
- case CLFFT_INVALID_PROGRAM_EXECUTABLE:
- return "CLFFT_INVALID_PROGRAM_EXECUTABLE";
- case CLFFT_INVALID_PROGRAM:
- return "CLFFT_INVALID_PROGRAM";
- case CLFFT_INVALID_BUILD_OPTIONS:
- return "CLFFT_INVALID_BUILD_OPTIONS";
- case CLFFT_INVALID_BINARY:
- return "CLFFT_INVALID_BINARY";
- case CLFFT_INVALID_SAMPLER:
- return "CLFFT_INVALID_SAMPLER";
- case CLFFT_INVALID_IMAGE_SIZE:
- return "CLFFT_INVALID_IMAGE_SIZE";
- case CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR:
- return "CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR";
- case CLFFT_INVALID_MEM_OBJECT:
- return "CLFFT_INVALID_MEM_OBJECT";
- case CLFFT_INVALID_HOST_PTR:
- return "CLFFT_INVALID_HOST_PTR";
- case CLFFT_INVALID_COMMAND_QUEUE:
- return "CLFFT_INVALID_COMMAND_QUEUE";
- case CLFFT_INVALID_QUEUE_PROPERTIES:
- return "CLFFT_INVALID_QUEUE_PROPERTIES";
- case CLFFT_INVALID_CONTEXT:
- return "CLFFT_INVALID_CONTEXT";
- case CLFFT_INVALID_DEVICE:
- return "CLFFT_INVALID_DEVICE";
- case CLFFT_INVALID_PLATFORM:
- return "CLFFT_INVALID_PLATFORM";
- case CLFFT_INVALID_DEVICE_TYPE:
- return "CLFFT_INVALID_DEVICE_TYPE";
- case CLFFT_INVALID_VALUE:
- return "CLFFT_INVALID_VALUE";
- case CLFFT_MAP_FAILURE:
- return "CLFFT_MAP_FAILURE";
- case CLFFT_BUILD_PROGRAM_FAILURE:
- return "CLFFT_BUILD_PROGRAM_FAILURE";
- case CLFFT_IMAGE_FORMAT_NOT_SUPPORTED:
- return "CLFFT_IMAGE_FORMAT_NOT_SUPPORTED";
- case CLFFT_IMAGE_FORMAT_MISMATCH:
- return "CLFFT_IMAGE_FORMAT_MISMATCH";
- case CLFFT_MEM_COPY_OVERLAP:
- return "CLFFT_MEM_COPY_OVERLAP";
- case CLFFT_PROFILING_INFO_NOT_AVAILABLE:
- return "CLFFT_PROFILING_INFO_NOT_AVAILABLE";
- case CLFFT_OUT_OF_HOST_MEMORY:
- return "CLFFT_OUT_OF_HOST_MEMORY";
- case CLFFT_OUT_OF_RESOURCES:
- return "CLFFT_OUT_OF_RESOURCES";
- case CLFFT_MEM_OBJECT_ALLOCATION_FAILURE:
- return "CLFFT_MEM_OBJECT_ALLOCATION_FAILURE";
- case CLFFT_COMPILER_NOT_AVAILABLE:
- return "CLFFT_COMPILER_NOT_AVAILABLE";
- case CLFFT_DEVICE_NOT_AVAILABLE:
- return "CLFFT_DEVICE_NOT_AVAILABLE";
- case CLFFT_DEVICE_NOT_FOUND:
- return "CLFFT_DEVICE_NOT_FOUND";
- case CLFFT_SUCCESS:
- return "CLFFT_SUCCESS";
- case CLFFT_NOTIMPLEMENTED:
- return "CLFFT_NOTIMPLEMENTED";
- case CLFFT_FILE_NOT_FOUND:
- return "CLFFT_FILE_NOT_FOUND";
- case CLFFT_FILE_CREATE_FAILURE:
- return "CLFFT_FILE_CREATE_FAILURE";
- case CLFFT_VERSION_MISMATCH:
- return "CLFFT_VERSION_MISMATCH";
- case CLFFT_INVALID_PLAN:
- return "CLFFT_INVALID_PLAN";
- default:
- return "Error code not defined";
- break;
- }
+ switch( status )
+ {
+ case CLFFT_INVALID_GLOBAL_WORK_SIZE:
+ return "CLFFT_INVALID_GLOBAL_WORK_SIZE";
+ case CLFFT_INVALID_MIP_LEVEL:
+ return "CLFFT_INVALID_MIP_LEVEL";
+ case CLFFT_INVALID_BUFFER_SIZE:
+ return "CLFFT_INVALID_BUFFER_SIZE";
+ case CLFFT_INVALID_GL_OBJECT:
+ return "CLFFT_INVALID_GL_OBJECT";
+ case CLFFT_INVALID_OPERATION:
+ return "CLFFT_INVALID_OPERATION";
+ case CLFFT_INVALID_EVENT:
+ return "CLFFT_INVALID_EVENT";
+ case CLFFT_INVALID_EVENT_WAIT_LIST:
+ return "CLFFT_INVALID_EVENT_WAIT_LIST";
+ case CLFFT_INVALID_GLOBAL_OFFSET:
+ return "CLFFT_INVALID_GLOBAL_OFFSET";
+ case CLFFT_INVALID_WORK_ITEM_SIZE:
+ return "CLFFT_INVALID_WORK_ITEM_SIZE";
+ case CLFFT_INVALID_WORK_GROUP_SIZE:
+ return "CLFFT_INVALID_WORK_GROUP_SIZE";
+ case CLFFT_INVALID_WORK_DIMENSION:
+ return "CLFFT_INVALID_WORK_DIMENSION";
+ case CLFFT_INVALID_KERNEL_ARGS:
+ return "CLFFT_INVALID_KERNEL_ARGS";
+ case CLFFT_INVALID_ARG_SIZE:
+ return "CLFFT_INVALID_ARG_SIZE";
+ case CLFFT_INVALID_ARG_VALUE:
+ return "CLFFT_INVALID_ARG_VALUE";
+ case CLFFT_INVALID_ARG_INDEX:
+ return "CLFFT_INVALID_ARG_INDEX";
+ case CLFFT_INVALID_KERNEL:
+ return "CLFFT_INVALID_KERNEL";
+ case CLFFT_INVALID_KERNEL_DEFINITION:
+ return "CLFFT_INVALID_KERNEL_DEFINITION";
+ case CLFFT_INVALID_KERNEL_NAME:
+ return "CLFFT_INVALID_KERNEL_NAME";
+ case CLFFT_INVALID_PROGRAM_EXECUTABLE:
+ return "CLFFT_INVALID_PROGRAM_EXECUTABLE";
+ case CLFFT_INVALID_PROGRAM:
+ return "CLFFT_INVALID_PROGRAM";
+ case CLFFT_INVALID_BUILD_OPTIONS:
+ return "CLFFT_INVALID_BUILD_OPTIONS";
+ case CLFFT_INVALID_BINARY:
+ return "CLFFT_INVALID_BINARY";
+ case CLFFT_INVALID_SAMPLER:
+ return "CLFFT_INVALID_SAMPLER";
+ case CLFFT_INVALID_IMAGE_SIZE:
+ return "CLFFT_INVALID_IMAGE_SIZE";
+ case CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+ return "CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+ case CLFFT_INVALID_MEM_OBJECT:
+ return "CLFFT_INVALID_MEM_OBJECT";
+ case CLFFT_INVALID_HOST_PTR:
+ return "CLFFT_INVALID_HOST_PTR";
+ case CLFFT_INVALID_COMMAND_QUEUE:
+ return "CLFFT_INVALID_COMMAND_QUEUE";
+ case CLFFT_INVALID_QUEUE_PROPERTIES:
+ return "CLFFT_INVALID_QUEUE_PROPERTIES";
+ case CLFFT_INVALID_CONTEXT:
+ return "CLFFT_INVALID_CONTEXT";
+ case CLFFT_INVALID_DEVICE:
+ return "CLFFT_INVALID_DEVICE";
+ case CLFFT_INVALID_PLATFORM:
+ return "CLFFT_INVALID_PLATFORM";
+ case CLFFT_INVALID_DEVICE_TYPE:
+ return "CLFFT_INVALID_DEVICE_TYPE";
+ case CLFFT_INVALID_VALUE:
+ return "CLFFT_INVALID_VALUE";
+ case CLFFT_MAP_FAILURE:
+ return "CLFFT_MAP_FAILURE";
+ case CLFFT_BUILD_PROGRAM_FAILURE:
+ return "CLFFT_BUILD_PROGRAM_FAILURE";
+ case CLFFT_IMAGE_FORMAT_NOT_SUPPORTED:
+ return "CLFFT_IMAGE_FORMAT_NOT_SUPPORTED";
+ case CLFFT_IMAGE_FORMAT_MISMATCH:
+ return "CLFFT_IMAGE_FORMAT_MISMATCH";
+ case CLFFT_MEM_COPY_OVERLAP:
+ return "CLFFT_MEM_COPY_OVERLAP";
+ case CLFFT_PROFILING_INFO_NOT_AVAILABLE:
+ return "CLFFT_PROFILING_INFO_NOT_AVAILABLE";
+ case CLFFT_OUT_OF_HOST_MEMORY:
+ return "CLFFT_OUT_OF_HOST_MEMORY";
+ case CLFFT_OUT_OF_RESOURCES:
+ return "CLFFT_OUT_OF_RESOURCES";
+ case CLFFT_MEM_OBJECT_ALLOCATION_FAILURE:
+ return "CLFFT_MEM_OBJECT_ALLOCATION_FAILURE";
+ case CLFFT_COMPILER_NOT_AVAILABLE:
+ return "CLFFT_COMPILER_NOT_AVAILABLE";
+ case CLFFT_DEVICE_NOT_AVAILABLE:
+ return "CLFFT_DEVICE_NOT_AVAILABLE";
+ case CLFFT_DEVICE_NOT_FOUND:
+ return "CLFFT_DEVICE_NOT_FOUND";
+ case CLFFT_SUCCESS:
+ return "CLFFT_SUCCESS";
+ case CLFFT_NOTIMPLEMENTED:
+ return "CLFFT_NOTIMPLEMENTED";
+ case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
+ return "CLFFT_TRANSPOSED_NOTIMPLEMENTED";
+ case CLFFT_FILE_NOT_FOUND:
+ return "CLFFT_FILE_NOT_FOUND";
+ case CLFFT_FILE_CREATE_FAILURE:
+ return "CLFFT_FILE_CREATE_FAILURE";
+ case CLFFT_VERSION_MISMATCH:
+ return "CLFFT_VERSION_MISMATCH";
+ case CLFFT_INVALID_PLAN:
+ return "CLFFT_INVALID_PLAN";
+ default:
+ return "Error code not defined";
+ break;
+ }
}
std::vector< cl_device_id > initializeCL( cl_device_type deviceType,
- cl_uint deviceGpuList,
- cl_context& context,
- bool printclInfo )
+ cl_uint deviceGpuList,
+ cl_context& context,
+ bool printclInfo )
{
- cl_int status = 0;
-
- /*
- * Have a look at the available platforms and pick either
- * the AMD one if available or a reasonable default.
- */
-
- cl_uint numPlatforms = 0;
- cl_platform_id platform = NULL;
- OPENCL_V_THROW( ::clGetPlatformIDs( 0, NULL, &numPlatforms ),
- "Getting number of platforms( ::clGetPlatformsIDs() )" );
-
- if( numPlatforms > 0 )
- {
- std::vector< cl_platform_id > platforms( numPlatforms );
- OPENCL_V_THROW( ::clGetPlatformIDs( numPlatforms, &platforms[ 0 ], NULL ),
- "Getting Platform Id's ( ::clGetPlatformsIDs() )" );
-
- // TODO: How should we determine what platform to choose? We are just defaulting to the last one reported, as we
- // print out the info
- for( unsigned int i=0; i < numPlatforms; ++i )
- {
- if( printclInfo )
- {
- std::cout << "OpenCL platform [ " << i << " ]:" << std::endl;
- prettyPrintPlatformInfo( platforms[i] );
- }
-
- platform = platforms[i];
- }
- }
-
- if( NULL == platform )
- {
- throw std::runtime_error( "No appropriate OpenCL platform could be found" );
- }
-
- /*
- * If we could find our platform, use it. Otherwise use just available platform.
- */
-
- // Get the device list for this type.
- //
- cl_uint num_devices = 0;
- OPENCL_V_THROW( ::clGetDeviceIDs( platform, deviceType, 0, NULL, &num_devices ),
- "Getting OpenCL devices ( ::clGetDeviceIDs() )" );
- if( 0 == num_devices )
- {
- OPENCL_V_THROW( CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
- }
-
- std::vector< cl_device_id > deviceIDs( num_devices );
- OPENCL_V_THROW( ::clGetDeviceIDs( platform, deviceType, num_devices, &deviceIDs[0], NULL),
- "Getting OpenCL deviceIDs ( ::clGetDeviceIDs() )" );
-
- if( (CL_DEVICE_TYPE_GPU == deviceType) && (~cl_uint(0) != deviceGpuList) )
- {
- // The command line options specify to user certain gpu(s)
- //
- for( unsigned u = (unsigned) deviceIDs.size(); u-- > 0; )
- {
- if( 0 != (deviceGpuList & (1<<u) ) )
- continue;
-
- // Remove this GPU from the list
- deviceIDs[u] = deviceIDs.back();
- deviceIDs.pop_back();
- }
- }
-
- if( 0 == deviceIDs.size( ) )
- {
- OPENCL_V_THROW( CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
- }
-
- cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
-
- /////////////////////////////////////////////////////////////////
- // Create an OpenCL context
- /////////////////////////////////////////////////////////////////
- context = clCreateContext( cps,
- (cl_uint) deviceIDs.size(),
- & deviceIDs[0],
- NULL,
- NULL,
- &status);
- OPENCL_V_THROW( status, "Creating Context ( ::clCreateContextFromType() )" );
-
- /* First, get the size of device list data */
- size_t deviceListSize;
- OPENCL_V_THROW( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize ),
- "Getting device array size ( ::clGetContextInfo() )" );
-
- /////////////////////////////////////////////////////////////////
- // Detect OpenCL devices
- /////////////////////////////////////////////////////////////////
- std::vector< cl_device_id > devices( deviceListSize/sizeof( cl_device_id ) );
-
- /* Now, get the device list data */
- OPENCL_V_THROW( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, &devices[ 0 ], NULL ),
- "Getting device array ( ::clGetContextInfo() )" );
-
- if( printclInfo )
- {
- cl_uint cContextDevices = 0;
-
- size_t deviceVersionSize = 0;
- OPENCL_V_THROW( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
- "Getting CL_DEVICE_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
-
- std::vector< char > szDeviceVersion( deviceVersionSize );
- OPENCL_V_THROW( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
- "Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
-
- char openclstr[11]="OpenCL 1.0";
-
- if (!strncmp((const char*)&szDeviceVersion[ 0 ], openclstr, 10))
- {
- cContextDevices = 1;
- }
- else
- {
- OPENCL_V_THROW( ::clGetContextInfo( context, CL_CONTEXT_NUM_DEVICES, sizeof( cContextDevices ), &cContextDevices, NULL ),
- "Getting number of context devices ( ::clGetContextInfo() )" );
- }
-
- for( cl_uint i = 0; i < cContextDevices; ++i )
- {
- std::cout << "OpenCL devices [ " << i << " ]:" << std::endl;
- prettyPrintDeviceInfo( devices[i] );
- }
- }
-
- return devices;
+ cl_int status = 0;
+
+ /*
+ * Have a look at the available platforms and pick either
+ * the AMD one if available or a reasonable default.
+ */
+
+ cl_uint numPlatforms = 0;
+ cl_platform_id platform = NULL;
+ OPENCL_V_THROW( ::clGetPlatformIDs( 0, NULL, &numPlatforms ),
+ "Getting number of platforms( ::clGetPlatformsIDs() )" );
+
+ if( numPlatforms > 0 )
+ {
+ std::vector< cl_platform_id > platforms( numPlatforms );
+ OPENCL_V_THROW( ::clGetPlatformIDs( numPlatforms, &platforms[ 0 ], NULL ),
+ "Getting Platform Id's ( ::clGetPlatformsIDs() )" );
+
+ // TODO: How should we determine what platform to choose? We are just defaulting to the last one reported, as we
+ // print out the info
+ for( unsigned int i=0; i < numPlatforms; ++i )
+ {
+ if( printclInfo )
+ {
+ std::cout << "OpenCL platform [ " << i << " ]:" << std::endl;
+ prettyPrintPlatformInfo( platforms[i] );
+ }
+
+ platform = platforms[i];
+ }
+ }
+
+ if( NULL == platform )
+ {
+ throw std::runtime_error( "No appropriate OpenCL platform could be found" );
+ }
+
+ /*
+ * If we could find our platform, use it. Otherwise just use an available platform.
+ */
+
+ // Get the device list for this type.
+ //
+ cl_uint num_devices = 0;
+ OPENCL_V_THROW( ::clGetDeviceIDs( platform, deviceType, 0, NULL, &num_devices ),
+ "Getting OpenCL devices ( ::clGetDeviceIDs() )" );
+ if( 0 == num_devices )
+ {
+ OPENCL_V_THROW( CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
+ }
+
+ std::vector< cl_device_id > deviceIDs( num_devices );
+ OPENCL_V_THROW( ::clGetDeviceIDs( platform, deviceType, num_devices, &deviceIDs[0], NULL),
+ "Getting OpenCL deviceIDs ( ::clGetDeviceIDs() )" );
+
+ if( (CL_DEVICE_TYPE_GPU == deviceType) && (~cl_uint(0) != deviceGpuList) )
+ {
+ // The command line options specify to use certain gpu(s)
+ //
+ for( unsigned u = (unsigned) deviceIDs.size(); u-- > 0; )
+ {
+ if( 0 != (deviceGpuList & (1<<u) ) )
+ continue;
+
+ // Remove this GPU from the list
+ deviceIDs[u] = deviceIDs.back();
+ deviceIDs.pop_back();
+ }
+ }
+
+ if( 0 == deviceIDs.size( ) )
+ {
+ OPENCL_V_THROW( CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
+ }
+
+ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
+
+ /////////////////////////////////////////////////////////////////
+ // Create an OpenCL context
+ /////////////////////////////////////////////////////////////////
+ context = clCreateContext( cps,
+ (cl_uint) deviceIDs.size(),
+ & deviceIDs[0],
+ NULL,
+ NULL,
+ &status);
+ OPENCL_V_THROW( status, "Creating Context ( ::clCreateContextFromType() )" );
+
+ /* First, get the size of device list data */
+ size_t deviceListSize;
+ OPENCL_V_THROW( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize ),
+ "Getting device array size ( ::clGetContextInfo() )" );
+
+ /////////////////////////////////////////////////////////////////
+ // Detect OpenCL devices
+ /////////////////////////////////////////////////////////////////
+ std::vector< cl_device_id > devices( deviceListSize/sizeof( cl_device_id ) );
+
+ /* Now, get the device list data */
+ OPENCL_V_THROW( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, &devices[ 0 ], NULL ),
+ "Getting device array ( ::clGetContextInfo() )" );
+
+ if( printclInfo )
+ {
+ cl_uint cContextDevices = 0;
+
+ size_t deviceVersionSize = 0;
+ OPENCL_V_THROW( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
+ "Getting CL_DEVICE_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+ std::vector< char > szDeviceVersion( deviceVersionSize );
+ OPENCL_V_THROW( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
+ "Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+ char openclstr[11]="OpenCL 1.0";
+
+ if (!strncmp((const char*)&szDeviceVersion[ 0 ], openclstr, 10))
+ {
+ cContextDevices = 1;
+ }
+ else
+ {
+ OPENCL_V_THROW( ::clGetContextInfo( context, CL_CONTEXT_NUM_DEVICES, sizeof( cContextDevices ), &cContextDevices, NULL ),
+ "Getting number of context devices ( ::clGetContextInfo() )" );
+ }
+
+ for( cl_uint i = 0; i < cContextDevices; ++i )
+ {
+ std::cout << "OpenCL devices [ " << i << " ]:" << std::endl;
+ prettyPrintDeviceInfo( devices[i] );
+ }
+ }
+
+ return devices;
}
int cleanupCL( cl_context* context, cl_command_queue* commandQueue,
- const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[], cl_event* outEvent )
+ const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[], cl_event* outEvent )
{
- if( *outEvent != NULL )
- OPENCL_V_THROW( clReleaseEvent( *outEvent ), "Error: In clReleaseEvent\n" );
+ if( *outEvent != NULL )
+ OPENCL_V_THROW( clReleaseEvent( *outEvent ), "Error: In clReleaseEvent\n" );
- releaseOpenCLMemBuffer( numBuffersIn, inputBuffer);
- releaseOpenCLMemBuffer( numBuffersOut, outputBuffer);
+ releaseOpenCLMemBuffer( numBuffersIn, inputBuffer);
+ releaseOpenCLMemBuffer( numBuffersOut, outputBuffer);
- if( *commandQueue != NULL )
- OPENCL_V_THROW( clReleaseCommandQueue( *commandQueue ), "Error: In clReleaseCommandQueue\n" );
+ if( *commandQueue != NULL )
+ OPENCL_V_THROW( clReleaseCommandQueue( *commandQueue ), "Error: In clReleaseCommandQueue\n" );
- if( *context != NULL )
- OPENCL_V_THROW( clReleaseContext( *context ), "Error: In clReleaseContext\n" );
+ if( *context != NULL )
+ OPENCL_V_THROW( clReleaseContext( *context ), "Error: In clReleaseContext\n" );
- return 0;
+ return 0;
}
int createOpenCLMemoryBuffer( cl_context& context, const size_t bufferSizeBytes, const cl_uint numBuffers, cl_mem buffer[], cl_mem_flags accessibility) {
- cl_int status = 0;
+ cl_int status = 0;
- for( cl_uint i = 0; i < numBuffers; ++i )
- {
- buffer[ i ] = ::clCreateBuffer( context, accessibility, bufferSizeBytes, NULL, &status);
- OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
- }
+ for( cl_uint i = 0; i < numBuffers; ++i )
+ {
+ buffer[ i ] = ::clCreateBuffer( context, accessibility, bufferSizeBytes, NULL, &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
+ }
- return 0;
+ return 0;
}
int releaseOpenCLMemBuffer( const cl_uint numBuffers, cl_mem buffer[])
{
- for( cl_uint i = 0; i < numBuffers; ++i )
- {
- if( buffer[ i ] != NULL )
- OPENCL_V_THROW( clReleaseMemObject( buffer[ i ] ), "Error: In clReleaseMemObject\n" );
- }
+ for( cl_uint i = 0; i < numBuffers; ++i )
+ {
+ if( buffer[ i ] != NULL )
+ OPENCL_V_THROW( clReleaseMemObject( buffer[ i ] ), "Error: In clReleaseMemObject\n" );
+ }
- return 0;
+ return 0;
}
void createOpenCLCommandQueue( cl_context& context,
- cl_uint commandQueueFlags,
- cl_command_queue& commandQueue,
- std::vector< cl_device_id > devices,
- const size_t bufferSizeBytesIn,
- const cl_uint numBuffersIn,
- cl_mem clMemBufferIn[],
- const size_t bufferSizeBytesOut,
- const cl_uint numBuffersOut,
- cl_mem clMemBufferOut[] )
+ cl_uint commandQueueFlags,
+ cl_command_queue& commandQueue,
+ std::vector< cl_device_id > devices,
+ const size_t bufferSizeBytesIn,
+ const cl_uint numBuffersIn,
+ cl_mem clMemBufferIn[],
+ const size_t bufferSizeBytesOut,
+ const cl_uint numBuffersOut,
+ cl_mem clMemBufferOut[] )
{
- cl_int status = 0;
- commandQueue = ::clCreateCommandQueue( context, devices[0], commandQueueFlags, &status );
- OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" );
+ cl_int status = 0;
+ commandQueue = ::clCreateCommandQueue( context, devices[0], commandQueueFlags, &status );
+ OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" );
- createOpenCLMemoryBuffer( context, bufferSizeBytesIn, numBuffersIn, clMemBufferIn, CL_MEM_READ_WRITE);
- createOpenCLMemoryBuffer( context, bufferSizeBytesOut, numBuffersOut, clMemBufferOut, CL_MEM_READ_WRITE);
+ createOpenCLMemoryBuffer( context, bufferSizeBytesIn, numBuffersIn, clMemBufferIn, CL_MEM_READ_WRITE);
+ createOpenCLMemoryBuffer( context, bufferSizeBytesOut, numBuffersOut, clMemBufferOut, CL_MEM_READ_WRITE);
}
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 63ac0f9..0c81ae3 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -20,7 +20,8 @@ set( clFFT.Source transform.cpp
plan.cpp
repo.cpp
generator.stockham.cpp
- generator.transpose.cpp
+ generator.transpose.vliw.cpp
+ generator.transpose.gcn.cpp
generator.copy.cpp
lifetime.cpp
stdafx.cpp )
@@ -37,7 +38,8 @@ set( clFFT.Headers private.h
mainpage.h
generator.h
generator.stockham.h
- generator.transpose.h
+ generator.transpose.vliw.h
+ generator.transpose.gcn.h
../include/stdafx.h
../include/unicode.compatibility.h
../include/targetver.h
diff --git a/src/library/accessors.cpp b/src/library/accessors.cpp
index 8d6ce65..81f3fec 100644
--- a/src/library/accessors.cpp
+++ b/src/library/accessors.cpp
@@ -79,29 +79,7 @@ clfftStatus clfftGetPlanPrecision( const clfftPlanHandle plHandle, clfftPrecisio
return CLFFT_SUCCESS;
}
-// This is a helper function to query a device for it's caps and check whether a certain user supplied cap is present
-// Returns CLFFT_SUCCESS if the cap is present, CLFFT_INVALID_OPERATION if it is not found. All devices specified
-// in the devices vector must contain the cap.
-clfftStatus checkDevExt( std::string cap, std::vector< cl_device_id >& devices )
-{
- for( size_t d = 0; d < devices.size( ); ++d)
- {
- size_t deviceExtSize = 0;
- OPENCL_V( ::clGetDeviceInfo( devices[ d ], CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
- "Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
-
- std::vector< char > szDeviceExt( deviceExtSize );
- OPENCL_V( ::clGetDeviceInfo( devices[ d ], CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
- "Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
-
- std::string strDeviceExt = &szDeviceExt[ 0 ];
- if( strDeviceExt.find( cap.c_str( ), 0 ) == std::string::npos )
- return CLFFT_DEVICE_NO_DOUBLE;
- }
-
- return CLFFT_SUCCESS;
-}
clfftStatus clfftSetPlanPrecision( clfftPlanHandle plHandle, clfftPrecision precision )
{
FFTRepo& fftRepo = FFTRepo::getInstance( );
@@ -118,18 +96,7 @@ clfftStatus clfftSetPlanPrecision( clfftPlanHandle plHandle, clfftPrecision prec
if( precision == CLFFT_SINGLE_FAST || precision == CLFFT_DOUBLE_FAST )
return CLFFT_NOTIMPLEMENTED;
- // If the user specifies double precision, check that the device supports double precision first
- if( precision == CLFFT_DOUBLE || precision == CLFFT_DOUBLE_FAST )
- {
- clfftStatus retAmdFp64 = checkDevExt( "cl_amd_fp64", fftPlan->devices );
- if( retAmdFp64 != CLFFT_SUCCESS )
- {
- // If AMD's extention is not supported, check for Khronos extention
- clfftStatus retKhrFp64 = checkDevExt( "cl_khr_fp64", fftPlan->devices );
- if( retKhrFp64 != CLFFT_SUCCESS )
- return retKhrFp64;
- }
- }
+
// If we modify the state of the plan, we assume that we can't trust any pre-calculated contents anymore
fftPlan->baked = false;
@@ -784,34 +751,6 @@ clfftStatus clfftGetTmpBufSize( const clfftPlanHandle plHandle, size_t* buffersi
return CLFFT_INVALID_OPERATION;
}
-clfftStatus clfftSetInternal( clfftPlanHandle plHandle, void* data )
-{
- FFTRepo& fftRepo = FFTRepo::getInstance( );
- FFTPlan* fftPlan = NULL;
- lockRAII* planLock = NULL;
-
- OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
- scopedLock sLock( *planLock, _T( "clfftSetResultLocation" ) );
-
- struct InternalData {
- size_t large1D_Xfactor;
- size_t cacheSize;
- bool bLdsComplex;
- bool ldsPadding;
- unsigned uLdsFraction;
- } *mydata;
-
- mydata = (InternalData *) data;
-
- fftPlan->large1D_Xfactor = mydata->large1D_Xfactor;
- fftPlan->cacheSize = mydata->cacheSize;
- fftPlan->bLdsComplex = mydata->bLdsComplex;
- fftPlan->ldsPadding = mydata->ldsPadding;
- fftPlan->uLdsFraction = mydata->uLdsFraction;
-
- return CLFFT_SUCCESS;
-}
-
clfftStatus clfftLocalMemSize( const clfftPlanHandle plHandle, cl_ulong* local_mem_size )
{
FFTRepo& repo = FFTRepo::getInstance( );
diff --git a/src/library/generator.copy.cpp b/src/library/generator.copy.cpp
index b88adae..1221882 100644
--- a/src/library/generator.copy.cpp
+++ b/src/library/generator.copy.cpp
@@ -32,6 +32,7 @@ namespace CopyGenerator
size_t Nt;
const FFTKernelGenKeyParams params;
bool h2c, c2h;
+ bool general;
inline std::string OffsetCalc(const std::string &off, bool input = true)
{
@@ -39,72 +40,21 @@ namespace CopyGenerator
const size_t *pStride = input ? params.fft_inStride : params.fft_outStride;
- std::string batch = "batch";
-
- switch(params.fft_DataDim)
+ str += "\t"; str += off; str += " = ";
+ std::string nextBatch = "batch";
+ for(size_t i=(params.fft_DataDim - 1); i>1; i--)
{
- case 5:
- {
- str += "\t{\n\tuint ocalc1 = ";
- str += batch; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2] * params.fft_N[3]);
- str += ";\n";
-
- str += "\tuint ocalc0 = ";
- str += "ocalc1"; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2]);
- str += ";\n";
-
- str += "\t"; str += off; str += " = ";
- str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2] * params.fft_N[3]);
- str += ")*"; str += SztToStr(pStride[4]); str += " + ";
+ size_t currentLength = 1;
+ for(int j=1; j<i; j++) currentLength *= params.fft_N[j];
- str += "(ocalc1"; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2]); str += ")*";
- str += SztToStr(pStride[3]); str += " + ";
-
- str += "(ocalc0"; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
- str += SztToStr(pStride[2]); str += " + ";
- str += "(ocalc0"; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
- str += SztToStr(pStride[1]); str += ";\n";
-
- str += "\t}\n";
- }
- break;
- case 4:
- {
- str += "\t{\n\tuint ocalc0 = ";
- str += batch; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2]);
- str += ";\n";
+ str += "("; str += nextBatch; str += "/"; str += SztToStr(currentLength);
+ str += ")*"; str += SztToStr(pStride[i]); str += " + ";
- str += "\t"; str += off; str += " = ";
- str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2]); str += ")*";
- str += SztToStr(pStride[3]); str += " + ";
-
- str += "(ocalc0"; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
- str += SztToStr(pStride[2]); str += " + ";
- str += "(ocalc0"; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
- str += SztToStr(pStride[1]); str += ";\n";
-
- str += "\t}\n";
- }
- break;
- case 3:
- {
- str += "\t"; str += off; str += " = ";
- str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
- str += SztToStr(pStride[2]); str += " + ";
- str += "("; str += batch; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
- str += SztToStr(pStride[1]); str += ";\n";
- }
- break;
- case 2:
- {
- str += "\t"; str += off; str += " = ";
- str += batch; str += "*"; str += SztToStr(pStride[1]); str += ";\n";
- }
- break;
- default:
- assert(false);
+ nextBatch = "(" + nextBatch + "%" + SztToStr(currentLength) + ")";
}
+ str += nextBatch; str += "*"; str += SztToStr(pStride[1]); str += ";\n";
+
return str;
}
@@ -121,6 +71,8 @@ namespace CopyGenerator
c2h = ( (params.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) ||
(params.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+ general = !(h2c || c2h);
+
// We only do out-of-place copies at this point
assert(params.fft_placeness == CLFFT_OUTOFPLACE);
}
@@ -148,8 +100,13 @@ namespace CopyGenerator
str += "__kernel void ";
// Function name
- if(h2c) str += "copy_h2c";
- else str += "copy_c2h";
+ if(general)
+ str += "copy_general";
+ else
+ {
+ if(h2c) str += "copy_h2c";
+ else str += "copy_c2h";
+ }
str += "(";
@@ -177,7 +134,15 @@ namespace CopyGenerator
str += "{\n";
// Initialize
- str += "\tuint me = get_global_id(0);\n\t";
+ if(general)
+ {
+ str += "\tuint me = get_local_id(0);\n\t";
+ str += "uint batch = get_group_id(0);\n\t";
+ }
+ else
+ {
+ str += "\tuint me = get_global_id(0);\n\t";
+ }
// Declare memory pointers
str += "\n\t";
@@ -222,11 +187,14 @@ namespace CopyGenerator
// Setup registers
str += "\t"; str += RegBaseType<PR>(2); str += " R;\n\n";
- // Setup variables
- str += "\tuint batch, mel, mel2;\n\t";
- str += "batch = me/"; str += SztToStr(Nt); str += ";\n\t";
- str += "mel = me%"; str += SztToStr(Nt); str += ";\n\t";
- str += "mel2 = ("; str += SztToStr(N); str += " - mel)%"; str += SztToStr(N); str += ";\n\n";
+ if(!general)
+ {
+ // Setup variables
+ str += "\tuint batch, mel, mel2;\n\t";
+ str += "batch = me/"; str += SztToStr(Nt); str += ";\n\t";
+ str += "mel = me%"; str += SztToStr(Nt); str += ";\n\t";
+ str += "mel2 = ("; str += SztToStr(N); str += " - mel)%"; str += SztToStr(N); str += ";\n\n";
+ }
// Setup memory pointers
@@ -235,96 +203,132 @@ namespace CopyGenerator
// offset strings
std::string inF, inF2, outF, outF2;
- inF = "(mel*"; inF += SztToStr(params.fft_inStride[0]); inF += ")";
- inF2 = "(mel2*"; inF2 += SztToStr(params.fft_inStride[0]); inF2 += ")";
- outF = "(mel*"; outF += SztToStr(params.fft_outStride[0]); outF += ")";
- outF2 = "(mel2*"; outF2 += SztToStr(params.fft_outStride[0]); outF2 += ")";
+ if(general)
+ {
+ inF = inF2 = outF = outF2 = "";
+ }
+ else
+ {
+ inF = " + (mel*"; inF += SztToStr(params.fft_inStride[0]); inF += ")";
+ inF2 = " + (mel2*"; inF2 += SztToStr(params.fft_inStride[0]); inF2 += ")";
+ outF = " + (mel*"; outF += SztToStr(params.fft_outStride[0]); outF += ")";
+ outF2 = " + (mel2*"; outF2 += SztToStr(params.fft_outStride[0]); outF2 += ")";
+ }
str += "\n\t";
// inputs
if(inIlvd)
{
- str += "lwbIn = gbIn + iOffset + "; str += inF; str += ";\n\t";
+ str += "lwbIn = gbIn + iOffset"; str += inF; str += ";\n\t";
}
else
{
- str += "lwbInRe = gbInRe + iOffset + "; str += inF; str += ";\n\t";
- str += "lwbInIm = gbInIm + iOffset + "; str += inF; str += ";\n\t";
+ str += "lwbInRe = gbInRe + iOffset"; str += inF; str += ";\n\t";
+ str += "lwbInIm = gbInIm + iOffset"; str += inF; str += ";\n\t";
}
// outputs
if(outIlvd)
{
- str += "lwbOut = gbOut + oOffset + "; str += outF; str += ";\n";
+ str += "lwbOut = gbOut + oOffset"; str += outF; str += ";\n";
if(h2c)
{
str += "\t";
- str += "lwbOut2 = gbOut + oOffset + "; str += outF2; str += ";\n";
+ str += "lwbOut2 = gbOut + oOffset"; str += outF2; str += ";\n";
}
}
else
{
- str += "lwbOutRe = gbOutRe + oOffset + "; str += outF; str += ";\n\t";
- str += "lwbOutIm = gbOutIm + oOffset + "; str += outF; str += ";\n";
+ str += "lwbOutRe = gbOutRe + oOffset"; str += outF; str += ";\n\t";
+ str += "lwbOutIm = gbOutIm + oOffset"; str += outF; str += ";\n";
if(h2c)
{
str += "\t";
- str += "lwbOutRe2 = gbOutRe + oOffset + "; str += outF2; str += ";\n\t";
- str += "lwbOutIm2 = gbOutIm + oOffset + "; str += outF2; str += ";\n";
+ str += "lwbOutRe2 = gbOutRe + oOffset"; str += outF2; str += ";\n\t";
+ str += "lwbOutIm2 = gbOutIm + oOffset"; str += outF2; str += ";\n";
}
}
str += "\n\t";
// Do the copy
- if(c2h)
+ if(general)
{
+ str += "for(uint t=0; t<"; str += SztToStr(N/64); str += "; t++)\n\t{\n\t\t";
+
if(inIlvd)
{
- str += "R = lwbIn[0];\n\t";
+ str += "R = lwbIn[me + t*64];\n\t\t";
}
else
{
- str += "R.x = lwbInRe[0];\n\t";
- str += "R.y = lwbInIm[0];\n\t";
+ str += "R.x = lwbInRe[me + t*64];\n\t\t";
+ str += "R.y = lwbInIm[me + t*64];\n\t\t";
}
if(outIlvd)
{
- str += "lwbOut[0] = R;\n\n";
+ str += "lwbOut[me + t*64] = R;\n";
}
else
{
- str += "lwbOutRe[0] = R.x;\n\t";
- str += "lwbOutIm[0] = R.y;\n\t";
+ str += "lwbOutRe[me + t*64] = R.x;\n\t\t";
+ str += "lwbOutIm[me + t*64] = R.y;\n";
}
+
+ str += "\t}\n\n";
}
else
{
- if(inIlvd)
+ if(c2h)
{
- str += "R = lwbIn[0];\n\t";
+ if(inIlvd)
+ {
+ str += "R = lwbIn[0];\n\t";
+ }
+ else
+ {
+ str += "R.x = lwbInRe[0];\n\t";
+ str += "R.y = lwbInIm[0];\n\t";
+ }
+
+ if(outIlvd)
+ {
+ str += "lwbOut[0] = R;\n\n";
+ }
+ else
+ {
+ str += "lwbOutRe[0] = R.x;\n\t";
+ str += "lwbOutIm[0] = R.y;\n\t";
+ }
}
else
{
- str += "R.x = lwbInRe[0];\n\t";
- str += "R.y = lwbInIm[0];\n\t";
- }
-
- if(outIlvd)
- {
- str += "lwbOut[0] = R;\n\t";
- str += "R.y = -R.y;\n\t";
- str += "lwbOut2[0] = R;\n\n";
- }
- else
- {
- str += "lwbOutRe[0] = R.x;\n\t";
- str += "lwbOutIm[0] = R.y;\n\t";
- str += "R.y = -R.y;\n\t";
- str += "lwbOutRe2[0] = R.x;\n\t";
- str += "lwbOutIm2[0] = R.y;\n\n";
+ if(inIlvd)
+ {
+ str += "R = lwbIn[0];\n\t";
+ }
+ else
+ {
+ str += "R.x = lwbInRe[0];\n\t";
+ str += "R.y = lwbInIm[0];\n\t";
+ }
+
+ if(outIlvd)
+ {
+ str += "lwbOut[0] = R;\n\t";
+ str += "R.y = -R.y;\n\t";
+ str += "lwbOut2[0] = R;\n\n";
+ }
+ else
+ {
+ str += "lwbOutRe[0] = R.x;\n\t";
+ str += "lwbOutIm[0] = R.y;\n\t";
+ str += "R.y = -R.y;\n\t";
+ str += "lwbOutRe2[0] = R.x;\n\t";
+ str += "lwbOutIm2[0] = R.y;\n\n";
+ }
}
}
@@ -355,58 +359,17 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Copy> (FFTKernelGenKeyParams & params) c
params.fft_outputLayout = this->outputLayout;
- switch (this->inStride.size()) {
- // 1-D array is a 2-D data structure.
- // 1-D unit is a special case of 1-D array.
- case 1:
- ARG_CHECK(this->length .size() > 0);
- ARG_CHECK(this->outStride.size() > 0);
- params.fft_DataDim = 2;
- params.fft_N[0] = this->length[0];
- params.fft_inStride[0] = this->inStride[0];
- params.fft_inStride[1] = this->iDist;
- params.fft_outStride[0] = this->outStride[0];
- params.fft_outStride[1] = this->oDist;
- break;
-
- // 2-D array is a 3-D data structure
- // 2-D unit is a speical case of 2-D array.
- case 2:
- ARG_CHECK(this->length .size() > 1);
- ARG_CHECK(this->outStride.size() > 1);
- params.fft_DataDim = 3;
- params.fft_N[0] = this->length[0];
- params.fft_N[1] = this->length[1];
- params.fft_inStride[0] = this->inStride[0];
- params.fft_inStride[1] = this->inStride[1];
- params.fft_inStride[2] = this->iDist;
- params.fft_outStride[0] = this->outStride[0];
- params.fft_outStride[1] = this->outStride[1];
- params.fft_outStride[2] = this->oDist;
- break;
-
- // 3-D array is a 4-D data structure
- // 3-D unit is a special case of 3-D array.
- case 3:
- ARG_CHECK(this->length .size() > 2);
- ARG_CHECK(this->outStride.size() > 2);
- params.fft_DataDim = 4;
- params.fft_N[0] = this->length[0];
- params.fft_N[1] = this->length[1];
- params.fft_N[2] = this->length[2];
- params.fft_inStride[0] = this->inStride[0];
- params.fft_inStride[1] = this->inStride[1];
- params.fft_inStride[2] = this->inStride[2];
- params.fft_inStride[3] = this->iDist;
- params.fft_outStride[0] = this->outStride[0];
- params.fft_outStride[1] = this->outStride[1];
- params.fft_outStride[2] = this->outStride[2];
- params.fft_outStride[3] = this->oDist;
- break;
-
- default:
- ARG_CHECK (false);
- }
+ params.fft_DataDim = this->length.size() + 1;
+ int i = 0;
+ for(i = 0; i < (params.fft_DataDim - 1); i++)
+ {
+ params.fft_N[i] = this->length[i];
+ params.fft_inStride[i] = this->inStride[i];
+ params.fft_outStride[i] = this->outStride[i];
+
+ }
+ params.fft_inStride[i] = this->iDist;
+ params.fft_outStride[i] = this->oDist;
params.fft_fwdScale = this->forwardScale;
params.fft_backScale = this->backwardScale;
@@ -420,13 +383,33 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Copy> (std::vector<size_t> & globalWS, std:
FFTKernelGenKeyParams fftParams;
OPENCL_V( this->GetKernelGenKeyPvt<Copy>( fftParams ), _T("GetKernelGenKey() failed!") );
+ bool h2c, c2h;
+ h2c = ( (fftParams.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) ||
+ (fftParams.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+ c2h = ( (fftParams.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) ||
+ (fftParams.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+
+ bool general = !(h2c || c2h);
+
size_t count = this->batchsize;
+
switch(fftParams.fft_DataDim)
{
case 5: assert(false);
case 4: count *= fftParams.fft_N[2];
case 3: count *= fftParams.fft_N[1];
- case 2: count *= (1 + fftParams.fft_N[0]/2); break;
+ case 2:
+ {
+ if(general)
+ {
+ count *= 64;
+ }
+ else
+ {
+ count *= (1 + fftParams.fft_N[0]/2);
+ }
+ }
+ break;
case 1: assert(false);
}
@@ -445,11 +428,19 @@ clfftStatus FFTPlan::GetMax1DLengthPvt<Copy> (size_t * longest) const
using namespace CopyGenerator;
template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo, const cl_command_queue commQueueFFT ) const
+clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
{
FFTKernelGenKeyParams params;
OPENCL_V( this->GetKernelGenKeyPvt<Copy> (params), _T("GetKernelGenKey() failed!") );
+ bool h2c, c2h;
+ h2c = ( (params.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) ||
+ (params.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+ c2h = ( (params.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) ||
+ (params.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+
+ bool general = !(h2c || c2h);
+
std::string programCode;
Precision pr = (params.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
switch(pr)
@@ -466,14 +457,25 @@ clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo, const cl_command_
} break;
}
- cl_int status = CL_SUCCESS;
- cl_context QueueContext = NULL;
- status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
+ cl_int status = CL_SUCCESS;
+ cl_device_id Device = NULL;
+ status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
+ OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
- OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
+ cl_context QueueContext = NULL;
+ status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
+ OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
- OPENCL_V( fftRepo.setProgramCode( Copy, params, programCode, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
- OPENCL_V( fftRepo.setProgramEntryPoints( Copy, params, "copy_c2h", "copy_h2c", QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+ OPENCL_V( fftRepo.setProgramCode( Copy, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+
+ if(general)
+ {
+ OPENCL_V( fftRepo.setProgramEntryPoints( Copy, params, "copy_general", "copy_general", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+ }
+ else
+ {
+ OPENCL_V( fftRepo.setProgramEntryPoints( Copy, params, "copy_c2h", "copy_h2c", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+ }
return CLFFT_SUCCESS;
}
diff --git a/src/library/generator.h b/src/library/generator.h
index 2aac983..590f4a6 100644
--- a/src/library/generator.h
+++ b/src/library/generator.h
@@ -22,10 +22,11 @@
// Enum to help provide descriptive names to array indices, when indexing into our various vectors
enum clfftGenerators
{
- Stockham, // Using the Stockham autosort frameworks
- Transpose,
- Copy,
- ENDGENERATORS ///< This value will always be last, and marks the length of clfftGenerators
+ Stockham, // Using the Stockham autosort frameworks
+ Transpose_VLIW,
+ Transpose_GCN,
+ Copy,
+ ENDGENERATORS ///< This value will always be last, and marks the length of clfftGenerators
};
#endif
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index f6f7241..08f173c 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -272,7 +272,6 @@ namespace StockhamGenerator
// Length, WorkGroupSize, NumTransforms, NumPasses, Radices
{ 1024, 128, 1, 4, 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 },
- //{ 128, 64, 1, 7, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0 },
{ 128, 64, 4, 3, 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
{ 8, 64, 16, 3, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
@@ -334,21 +333,17 @@ namespace StockhamGenerator
size_t baseRadixSize = sizeof(baseRadix)/sizeof(baseRadix[0]);
size_t l = length;
- std::map<size_t, size_t> primeFactors;
std::map<size_t, size_t> primeFactorsExpanded;
for(size_t r=0; r<baseRadixSize; r++)
{
size_t rad = baseRadix[r];
- size_t p = 0;
size_t e = 1;
while(!(l%rad))
{
l /= rad;
e *= rad;
- p++;
}
- primeFactors[rad] = p;
primeFactorsExpanded[rad] = e;
}
@@ -515,117 +510,6 @@ namespace StockhamGenerator
};
- // Twiddle factors table for large N
- // used in 3-step algorithm
- class TwiddleTableLarge
- {
- size_t N; // length
- size_t X, Y;
- size_t tableSize;
- double *wc, *ws; // cosine, sine arrays
-
- public:
- TwiddleTableLarge(size_t length) : N(length)
- {
- X = size_t(1) << ARBITRARY::TWIDDLE_DEE;
- Y = DivRoundingUp<size_t> (CeilPo2(N), ARBITRARY::TWIDDLE_DEE);
- tableSize = X * Y;
-
- // Allocate memory for the tables
- wc = new double[tableSize];
- ws = new double[tableSize];
- }
-
- ~TwiddleTableLarge()
- {
- // Free
- delete[] wc;
- delete[] ws;
- }
-
- template <Precision PR>
- void GenerateTwiddleTable(std::string &twStr)
- {
- const double TWO_PI = -6.283185307179586476925286766559;
-
- // Generate the table
- size_t nt = 0;
- double phi = TWO_PI / double (N);
- for (size_t iY = 0; iY < Y; ++iY)
- {
- size_t i = size_t(1) << (iY * ARBITRARY::TWIDDLE_DEE);
- for (size_t iX = 0; iX < X; ++iX)
- {
- size_t j = i * iX;
-
- double c = cos(phi * (double)j);
- double s = sin(phi * (double)j);
-
- //if (fabs(c) < 1.0E-12) c = 0.0;
- //if (fabs(s) < 1.0E-12) s = 0.0;
-
- wc[nt] = c;
- ws[nt++] = s;
- }
- }
-
- std::string sfx = FloatSuffix<PR>();
-
- // Stringize the table
- std::stringstream ss;
- nt = 0;
-
- ss << "\n __constant ";
- ss << RegBaseType<PR>(2);
- ss << " " << TwTableLargeName();
- ss << "[" << Y << "][" << X << "] = {\n";
- for (size_t iY = 0; iY < Y; ++iY)
- {
- ss << "{ ";
- for (size_t iX = 0; iX < X; ++iX)
- {
- char cv[64], sv[64];
- sprintf(cv, "%036.34lf", wc[nt]);
- sprintf(sv, "%036.34lf", ws[nt++]);
- ss << "("; ss << RegBaseType<PR>(2); ss << ")(";
- ss << cv; ss << sfx; ss << ", ";
- ss << sv; ss << sfx; ss << ")";
- ss << ", ";
- }
- ss << " },\n";
- }
- ss << "};\n\n";
-
-
- // Twiddle calc function
- ss << "__attribute__((always_inline)) ";
- ss << RegBaseType<PR>(2);
- ss << "\n" << TwTableLargeFunc() << "(uint u)\n{\n";
-
- ss << "\t" "uint j = u & " << unsigned(X-1) << ";\n";
- ss << "\t" ; ss << RegBaseType<PR>(2); ss << " result = ";
- ss << TwTableLargeName();
- ss << "[0][j];\n";
-
- for (size_t iY = 1; iY < Y; ++iY)
- {
- std::string phasor = TwTableLargeName();
- phasor += "[";
- phasor += SztToStr(iY);
- phasor += "][j]";
-
- stringpair product = ComplexMul((RegBaseType<PR>(2)).c_str(), "result", phasor.c_str());
-
- ss << "\t" "u >>= " << unsigned (ARBITRARY::TWIDDLE_DEE) << ";\n";
- ss << "\t" "j = u & " << unsigned(X-1) << ";\n";
- ss << "\t" "result = " << product.first << "\n";
- ss << "\t" "\t" << product.second <<";\n";
- }
- ss << "\t" "return result;\n}\n\n";
-
- twStr += ss.str();
- }
- };
// A pass inside an FFT kernel
template <Precision PR>
@@ -654,8 +538,9 @@ namespace StockhamGenerator
bool rcFull;
bool rcSimple;
- bool enableGrouping;
- bool linearRegs;
+ bool enableGrouping;
+ bool linearRegs; // scalar registers (non-vectorized registers) to be used
+ bool halfLds; // only half the LDS of a complex length need to be used
Pass<PR> *nextPass;
inline void RegBase(size_t regC, std::string &str) const
@@ -767,7 +652,7 @@ namespace StockhamGenerator
// SweepRegs is to iterate through the registers to do the three basic operations:
// reading, twiddle multiplication, writing
void SweepRegs( size_t flag, bool fwd, bool interleaved, size_t stride, size_t component,
- double scale,
+ double scale, bool frontTwiddle,
const std::string &bufferRe, const std::string &bufferIm, const std::string &offset,
size_t regC, size_t numB, size_t numPrev, std::string &passStr) const
{
@@ -982,10 +867,23 @@ namespace StockhamGenerator
{
passStr += "\n\t{\n\t\t"; passStr += twType; passStr += " W = ";
passStr += tw3StepFunc; passStr += "( ";
- passStr += "(("; passStr += SztToStr(numButterfly); passStr += "*me + ";
- passStr += SztToStr(butterflyIndex);
- passStr += ")%"; passStr += SztToStr(algLS); passStr += " + ";
- passStr += SztToStr(r*algLS); passStr += ") * b "; passStr += ");\n\t\t";
+
+ if(frontTwiddle)
+ {
+ assert(linearRegs);
+ passStr += "("; passStr += "me*"; passStr += SztToStr(numButterfly);
+ passStr += " + "; passStr += SztToStr(i); passStr += " + ";
+ passStr += SztToStr(r*length/radix); passStr += ") * b";
+ }
+ else
+ {
+ passStr += "(("; passStr += SztToStr(numButterfly); passStr += "*me + ";
+ passStr += SztToStr(butterflyIndex);
+ passStr += ")%"; passStr += SztToStr(algLS); passStr += " + ";
+ passStr += SztToStr(r*algLS); passStr += ") * b";
+ }
+
+ passStr += " );\n\t\t";
}
passStr += rType; passStr += " TR, TI;\n\t\t";
@@ -1454,9 +1352,9 @@ namespace StockhamGenerator
public:
Pass( size_t positionVal, size_t lengthVal, size_t radixVal, size_t cnPerWIVal,
- size_t L, size_t LS, size_t R, bool linearRegsVal, bool r2cVal, bool c2rVal, bool rcFullVal, bool rcSimpleVal) :
+ size_t L, size_t LS, size_t R, bool linearRegsVal, bool halfLdsVal, bool r2cVal, bool c2rVal, bool rcFullVal, bool rcSimpleVal) :
position(positionVal), length(lengthVal), radix(radixVal), cnPerWI(cnPerWIVal),
- algL(L), algLS(LS), algR(R), linearRegs(linearRegsVal),
+ algL(L), algLS(LS), algR(R), linearRegs(linearRegsVal), halfLds(halfLdsVal),
r2c(r2cVal), c2r(c2rVal), rcFull(rcFullVal), rcSimple(rcSimpleVal),
enableGrouping(true),
numB1(0), numB2(0), numB4(0),
@@ -1488,6 +1386,10 @@ namespace StockhamGenerator
assert(numButterfly == (numB4*4 + numB2*2 + numB1));
}
+
+ // if only half LDS can be used, we need the passes to share registers
+ // and hence they need to be linear registers
+ if(halfLds) assert(linearRegs);
}
size_t GetNumB1() const { return numB1; }
@@ -1543,13 +1445,10 @@ namespace StockhamGenerator
passStr += "(";
passStr += "uint rw, uint b, uint me, uint inOffset, uint outOffset, ";
- // For now, interleaved support is there for only global buffers
- // TODO : add support for LDS interleaved
- if(inInterleaved) assert(gIn);
- if(outInterleaved) assert(gOut);
-
if(r2c || c2r)
{
+ assert(halfLds);
+
if(gIn)
{
if(inInterleaved)
@@ -1618,8 +1517,15 @@ namespace StockhamGenerator
}
else
{
- passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferInRe; passStr += ", ";
- passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferInIm; passStr += ", ";
+ if(inInterleaved)
+ {
+ passStr += "__local "; passStr += regB2Type; passStr += " *"; passStr += bufferInRe; passStr += ", ";
+ }
+ else
+ {
+ passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferInRe; passStr += ", ";
+ passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferInIm; passStr += ", ";
+ }
}
@@ -1637,8 +1543,15 @@ namespace StockhamGenerator
}
else
{
- passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferOutRe; passStr += ", ";
- passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferOutIm;
+ if(outInterleaved)
+ {
+ passStr += "__local "; passStr += regB2Type; passStr += " *"; passStr += bufferOutRe;
+ }
+ else
+ {
+ passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferOutRe; passStr += ", ";
+ passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferOutIm;
+ }
}
}
@@ -1690,7 +1603,7 @@ namespace StockhamGenerator
if(position == 0)
{
passStr += "\n\tif(rw)\n\t{";
- SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
passStr += "\n\t}\n";
if(rcSimple)
@@ -1702,7 +1615,7 @@ namespace StockhamGenerator
else
{
passStr += "\n\tif(rw > 1)\n\t{";
- SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, bufferInRe2, bufferInIm2, "inOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, bufferInRe2, bufferInIm2, "inOffset", 1, numB1, 0, passStr);
passStr += "\n\t}\n";
passStr += "\telse\n\t{";
@@ -1773,7 +1686,7 @@ namespace StockhamGenerator
}
passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
- SweepRegs(SR_READ, fwd, outInterleaved, processBufStride, SR_COMP_REAL, 1.0f, processBufRe, processBufIm, processBufOffset, 1, numB1, 0, passStr);
+ SweepRegs(SR_READ, fwd, outInterleaved, processBufStride, SR_COMP_REAL, 1.0f, false, processBufRe, processBufIm, processBufOffset, 1, numB1, 0, passStr);
passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
@@ -1823,31 +1736,49 @@ namespace StockhamGenerator
}
passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
- SweepRegs(SR_READ, fwd, outInterleaved, processBufStride, SR_COMP_IMAG, 1.0f, processBufRe, processBufIm, processBufOffset, 1, numB1, 0, passStr);
+ SweepRegs(SR_READ, fwd, outInterleaved, processBufStride, SR_COMP_IMAG, 1.0f, false, processBufRe, processBufIm, processBufOffset, 1, numB1, 0, passStr);
passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
}
}
else
{
- if( (!linearRegs) || (linearRegs && (position == 0)) )
+ if( (!halfLds) || (halfLds && (position == 0)) )
{
passStr += "\n\tif(rw)\n\t{";
- SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
- SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "inOffset", 2, numB2, numB1, passStr);
- SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "inOffset", 4, numB4, 2*numB2 + numB1, passStr);
+ SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 2, numB2, numB1, passStr);
+ SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 4, numB4, 2*numB2 + numB1, passStr);
passStr += "\n\t}\n";
}
}
+ passStr += "\n";
+
+ // 3-step twiddle multiplies done in the front
+ bool tw3Done = false;
+ if(fft_3StepTwiddle && (position == 0))
+ {
+ tw3Done = true;
+ if(linearRegs)
+ {
+ SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, true, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+ }
+ else
+ {
+ SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, true, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+ SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, true, bufferInRe, bufferInIm, "", 2, numB2, numB1, passStr);
+ SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, true, bufferInRe, bufferInIm, "", 4, numB4, 2*numB2 + numB1, passStr);
+ }
+ }
passStr += "\n";
// Twiddle multiply
if( (position > 0) && (radix > 1) )
{
- SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
- SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 2, numB2, numB1, passStr);
- SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 4, numB4, 2*numB2 + numB1, passStr);
+ SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+ SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 2, numB2, numB1, passStr);
+ SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 4, numB4, 2*numB2 + numB1, passStr);
}
// Butterfly calls
@@ -1858,31 +1789,30 @@ namespace StockhamGenerator
if(numB4) CallButterfly(ButterflyName(radix, 4, fwd), 4, numB4, passStr);
}
- passStr += "\n";
if( (position != 0) && (!linearRegs) && (nextPass != NULL) )
passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
- passStr += "\n";
+ passStr += "\n\n";
// 3-step twiddle multiplies
- if(fft_3StepTwiddle)
+ if(fft_3StepTwiddle && !tw3Done)
{
assert(nextPass == NULL);
if(linearRegs)
{
- SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+ SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
}
else
{
- SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
- SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 2, numB2, numB1, passStr);
- SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 4, numB4, 2*numB2 + numB1, passStr);
+ SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+ SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 2, numB2, numB1, passStr);
+ SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 4, numB4, 2*numB2 + numB1, passStr);
}
}
// Write back from registers
- if(linearRegs)
+ if(halfLds)
{
// In this case, we have to write & again read back for the next pass since we are
// using only half the lds. Number of barriers will increase at the cost of halving the lds.
@@ -1893,7 +1823,7 @@ namespace StockhamGenerator
{
if(!singlePass)
{
- SweepRegs(SR_WRITE, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_WRITE, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, false, false, bufferInRe, bufferInIm, "inOffset", passStr);
if(oddp)
@@ -1919,7 +1849,7 @@ namespace StockhamGenerator
passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
- SweepRegs(SR_WRITE, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_WRITE, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, false, false, bufferInRe, bufferInIm, "inOffset", passStr);
if(oddp)
@@ -1972,39 +1902,39 @@ namespace StockhamGenerator
else if(c2r)
{
passStr += "\n\tif(rw)\n\t{";
- SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
passStr += "\n\t}\n";
if(!rcSimple)
{
passStr += "\n\tif(rw > 1)\n\t{";
- SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, bufferOutRe2, bufferOutIm2, "outOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, false, bufferOutRe2, bufferOutIm2, "outOffset", 1, numB1, 0, passStr);
passStr += "\n\t}\n";
}
}
else
{
passStr += "\n\tif(rw)\n\t{";
- SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
passStr += "\n\t}\n";
}
}
else
{
passStr += "\n\tif(rw)\n\t{";
- SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
passStr += "\n\t}\n";
passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
passStr += "\n\tif(rw)\n\t{";
- nextPass->SweepRegs(SR_READ, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, bufferOutRe, bufferOutIm, "outOffset", 1, nextPass->GetNumB1(), 0, passStr);
+ nextPass->SweepRegs(SR_READ, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, nextPass->GetNumB1(), 0, passStr);
passStr += "\n\t}\n";
passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
passStr += "\n\tif(rw)\n\t{";
- SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
passStr += "\n\t}\n";
passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
passStr += "\n\tif(rw)\n\t{";
- nextPass->SweepRegs(SR_READ, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, bufferOutRe, bufferOutIm, "outOffset", 1, nextPass->GetNumB1(), 0, passStr);
+ nextPass->SweepRegs(SR_READ, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, nextPass->GetNumB1(), 0, passStr);
passStr += "\n\t}\n";
passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
}
@@ -2012,9 +1942,9 @@ namespace StockhamGenerator
else
{
passStr += "\n\tif(rw)\n\t{";
- SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
- SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, bufferOutRe, bufferOutIm, "outOffset", 2, numB2, numB1, passStr);
- SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, bufferOutRe, bufferOutIm, "outOffset", 4, numB4, 2*numB2 + numB1, passStr);
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, bufferOutRe, bufferOutIm, "outOffset", 2, numB2, numB1, passStr);
+ SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, bufferOutRe, bufferOutIm, "outOffset", 4, numB4, 2*numB2 + numB1, passStr);
passStr += "\n\t}\n";
}
@@ -2041,6 +1971,8 @@ namespace StockhamGenerator
// for passing intermediate data between the passes, if this is set
// then each pass-function should accept same set of registers
+ bool linearRegs; // scalar registers
+
// Future optimization ideas
// bool limitRegs; // TODO: Incrementally write to LDS, thereby using same set of registers for more than 1 butterflies
// bool combineReadTwMul; // TODO: Combine reading into registers and Twiddle multiply
@@ -2050,6 +1982,11 @@ namespace StockhamGenerator
bool rcFull;
bool rcSimple;
+ bool blockCompute; // When we have to compute FFT in blocks (either read or write is along columns)
+ BlockComputeType blockComputeType;
+ size_t blockWidth, blockWGS, blockLDS;
+
+
const FFTKernelGenKeyParams params; // key params
@@ -2057,7 +1994,7 @@ namespace StockhamGenerator
{
std::string str = "";
- if(halfLds)
+ if(linearRegs)
{
if(initComma) str += ", ";
@@ -2099,6 +2036,38 @@ namespace StockhamGenerator
return possible;
}
+ inline std::string OffsetCalcBlock(const std::string &off, bool input = true)
+ {
+ std::string str;
+
+ const size_t *pStride = input ? params.fft_inStride : params.fft_outStride;
+
+ str += "\t"; str += off; str += " = ";
+ std::string nextBatch = "batch";
+ for(size_t i=(params.fft_DataDim - 1); i>2; i--)
+ {
+ size_t currentLength = 1;
+ for(int j=2; j<i; j++) currentLength *= params.fft_N[j];
+ currentLength *= (params.fft_N[1]/blockWidth);
+
+ str += "("; str += nextBatch; str += "/"; str += SztToStr(currentLength);
+ str += ")*"; str += SztToStr(pStride[i]); str += " + ";
+
+ nextBatch = "(" + nextBatch + "%" + SztToStr(currentLength) + ")";
+ }
+
+ str += "("; str += nextBatch; str += "/"; str += SztToStr(params.fft_N[1]/blockWidth);
+ str += ")*"; str += SztToStr(pStride[2]); str += " + ("; str += nextBatch;
+ str += "%"; str += SztToStr(params.fft_N[1]/blockWidth); str += ")*";
+ if( (input && (blockComputeType == BCT_R2C)) || (!input && (blockComputeType == BCT_C2R)) )
+ str += SztToStr(blockWidth*length);
+ else
+ str += SztToStr(blockWidth);
+ str += ";\n";
+
+ return str;
+ }
+
inline std::string OffsetCalc(const std::string &off, bool input = true, bool rc_second_index = false)
{
std::string str;
@@ -2122,76 +2091,27 @@ namespace StockhamGenerator
batch += " + (me/"; batch += SztToStr(workGroupSizePerTrans); batch += "))"; }
}
- switch(params.fft_DataDim)
+ str += "\t"; str += off; str += " = ";
+ std::string nextBatch = batch;
+ for(size_t i=(params.fft_DataDim - 1); i>1; i--)
{
- case 5:
- {
- str += "\t{\n\tuint ocalc1 = ";
- str += batch; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2] * params.fft_N[3]);
- str += ";\n";
-
- str += "\tuint ocalc0 = ";
- str += "ocalc1"; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2]);
- str += ";\n";
+ size_t currentLength = 1;
+ for(int j=1; j<i; j++) currentLength *= params.fft_N[j];
- str += "\t"; str += off; str += " = ";
- str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2] * params.fft_N[3]);
- str += ")*"; str += SztToStr(pStride[4]); str += " + ";
+ str += "("; str += nextBatch; str += "/"; str += SztToStr(currentLength);
+ str += ")*"; str += SztToStr(pStride[i]); str += " + ";
- str += "(ocalc1"; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2]); str += ")*";
- str += SztToStr(pStride[3]); str += " + ";
-
- str += "(ocalc0"; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
- str += SztToStr(pStride[2]); str += " + ";
- str += "(ocalc0"; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
- str += SztToStr(pStride[1]); str += ";\n";
-
- str += "\t}\n";
- }
- break;
- case 4:
- {
- str += "\t{\n\tuint ocalc0 = ";
- str += batch; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2]);
- str += ";\n";
-
- str += "\t"; str += off; str += " = ";
- str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2]); str += ")*";
- str += SztToStr(pStride[3]); str += " + ";
-
- str += "(ocalc0"; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
- str += SztToStr(pStride[2]); str += " + ";
- str += "(ocalc0"; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
- str += SztToStr(pStride[1]); str += ";\n";
-
- str += "\t}\n";
- }
- break;
- case 3:
- {
- str += "\t"; str += off; str += " = ";
- str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
- str += SztToStr(pStride[2]); str += " + ";
- str += "("; str += batch; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
- str += SztToStr(pStride[1]); str += ";\n";
- }
- break;
- case 2:
- {
- str += "\t"; str += off; str += " = ";
- str += batch; str += "*"; str += SztToStr(pStride[1]); str += ";\n";
- }
- break;
- default:
- assert(false);
+ nextBatch = "(" + nextBatch + "%" + SztToStr(currentLength) + ")";
}
+ str += nextBatch; str += "*"; str += SztToStr(pStride[1]); str += ";\n";
+
return str;
}
public:
Kernel( const FFTKernelGenKeyParams &paramsVal) :
- params(paramsVal), r2c2r(false)
+ params(paramsVal), r2c2r(false)
{
length = params.fft_N[0];
@@ -2222,12 +2142,24 @@ namespace StockhamGenerator
halfLds = ( (params.fft_inputLayout == CLFFT_COMPLEX_INTERLEAVED) &&
(params.fft_outputLayout == CLFFT_COMPLEX_INTERLEAVED) ) ? true : false;
halfLds = halfLds ? ((length & (length-1)) ? false : true) : false;
- //halfLds = false;
// Set half lds for real transforms
halfLds = r2c2r ? true : halfLds;
- bool linearRegs = halfLds ? true : false;
+ linearRegs = halfLds;
+
+ blockCompute = params.blockCompute;
+ blockComputeType = params.blockComputeType;
+ // Make sure we can utilize all Lds if we are going to
+ // use blocked columns to compute FFTs
+ if(blockCompute)
+ {
+ assert(length <= 256); // 256 parameter comes from prototype experiments
+ // largest length at which block column possible given 32KB LDS limit
+ // if LDS limit is different this number need to be changed appropriately
+ halfLds = false;
+ linearRegs = true;
+ }
assert( ((length*numTrans)%workGroupSize) == 0 );
cnPerWI = (numTrans * length) / workGroupSize;
@@ -2258,7 +2190,7 @@ namespace StockhamGenerator
R /= rad;
radices.push_back(rad);
- passes.push_back(Pass<PR>(i, length, rad, cnPerWI, L, LS, R, linearRegs, r2c, c2r, rcFull, rcSimple));
+ passes.push_back(Pass<PR>(i, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple));
LS *= rad;
}
@@ -2271,17 +2203,18 @@ namespace StockhamGenerator
size_t cRad[] = {10,8,6,5,4,3,2,1}; // Must be in descending order
size_t cRadSize = (sizeof(cRad)/sizeof(cRad[0]));
+ // Generate the radix and pass objects
while(true)
{
size_t rad;
assert(cRadSize >= 1);
+
+ // Picks the radices in descending order (biggest radix first)
for(size_t r=0; r<cRadSize; r++)
{
rad = cRad[r];
- if( (rad == 16) && !linearRegs ) continue; // temporary - fix this !!!
-
if((rad > cnPerWI) || (cnPerWI%rad))
continue;
@@ -2295,7 +2228,7 @@ namespace StockhamGenerator
R /= rad;
radices.push_back(rad);
- passes.push_back(Pass<PR>(pid, length, rad, cnPerWI, L, LS, R, linearRegs, r2c, c2r, rcFull, rcSimple));
+ passes.push_back(Pass<PR>(pid, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple));
pid++;
LS *= rad;
@@ -2346,8 +2279,72 @@ namespace StockhamGenerator
for(size_t i=0; i < (numPasses - 1); i++)
passes[i].SetNextPass(&passes[i+1]);
+
+ if(blockCompute)
+ {
+ blockWidth = BlockSizes::BlockWidth(length);
+ blockWGS = BlockSizes::BlockWorkGroupSize(length);
+ blockLDS = BlockSizes::BlockLdsSize(length);
+ }
+ else
+ {
+ blockWidth = blockWGS = blockLDS = 0;
+ }
}
+ class BlockSizes
+ {
+ public:
+ enum ValType
+ {
+ BS_VT_WGS,
+ BS_VT_BWD,
+ BS_VT_LDS,
+ };
+
+ static size_t BlockLdsSize(size_t N) { return GetValue(N, BS_VT_LDS); }
+ static size_t BlockWidth(size_t N) { return GetValue(N, BS_VT_BWD); }
+ static size_t BlockWorkGroupSize(size_t N) { return GetValue(N, BS_VT_WGS); }
+
+ private:
+
+ static size_t GetValue(size_t N, ValType vt)
+ {
+ size_t wgs; // preferred work group size
+ size_t bwd; // block width to be used
+ size_t lds; // LDS size to be used for the block
+
+
+ KernelCoreSpecs<PR> kcs;
+ size_t t_wgs, t_nt;
+ kcs.GetWGSAndNT(N, t_wgs, t_nt);
+
+ switch(N)
+ {
+ case 256: bwd = 8/PrecisionWidth<PR>(); wgs = (bwd > t_nt) ? 256 : t_wgs; break;
+ case 128: bwd = 8/PrecisionWidth<PR>(); wgs = (bwd > t_nt) ? 128 : t_wgs; break;
+ case 64: bwd = 16/PrecisionWidth<PR>(); wgs = (bwd > t_nt) ? 128 : t_wgs; break;
+ case 32: bwd = 32/PrecisionWidth<PR>(); wgs = (bwd > t_nt) ? 64 : t_wgs; break;
+ case 16: bwd = 64/PrecisionWidth<PR>(); wgs = (bwd > t_nt) ? 64 : t_wgs; break;
+ case 8: bwd = 128/PrecisionWidth<PR>(); wgs = (bwd > t_nt) ? 64 : t_wgs; break;
+ default: assert(false);
+ }
+
+ // block width cannot be less than numTrans, math in other parts of code depend on this assumption
+ assert(bwd >= t_nt);
+
+ lds = N*bwd;
+
+ switch(vt)
+ {
+ case BS_VT_WGS: return wgs;
+ case BS_VT_BWD: return bwd;
+ case BS_VT_LDS: return lds;
+ default: assert(false); return 0;
+ }
+ }
+ };
+
void GenerateKernel(std::string &str, cl_device_id Dev_ID)
{
std::string twType = RegBaseType<PR>(2);
@@ -2361,6 +2358,11 @@ namespace StockhamGenerator
outInterleaved = ( (params.fft_outputLayout == CLFFT_COMPLEX_INTERLEAVED) ||
(params.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+ // use interleaved LDS when halfLds constraint absent
+ bool ldsInterleaved = inInterleaved || outInterleaved;
+ ldsInterleaved = halfLds ? false : ldsInterleaved;
+ ldsInterleaved = blockCompute ? true : ldsInterleaved;
+
bool inReal; // Input is real format
bool outReal; // Output is real format
inReal = (params.fft_inputLayout == CLFFT_REAL) ? true : false;
@@ -2410,7 +2412,7 @@ namespace StockhamGenerator
str += "#define C3QB 0.86602540378443864676372317075294"; str += sfx; str += "\n";
str += "\n";
- bool cReg = halfLds ? true : false;
+ bool cReg = linearRegs ? true : false;
// Generate butterflies for all unique radices
std::list<size_t> uradices;
@@ -2455,7 +2457,6 @@ namespace StockhamGenerator
}
double scale = fwd ? params.fft_fwdScale : params.fft_backScale;
- bool tw3Step = false;
for(p = passes.begin(); p != passes.end(); p++)
{
@@ -2464,8 +2465,25 @@ namespace StockhamGenerator
bool gIn = false, gOut = false;
bool inIlvd = false, outIlvd = false;
bool inRl = false, outRl = false;
- if(p == passes.begin()) { inIlvd = inInterleaved; inRl = inReal; gIn = true; ins = params.fft_inStride[0]; }
- if((p+1) == passes.end()) { outIlvd = outInterleaved; outRl = outReal; gOut = true; outs = params.fft_outStride[0]; s = scale; tw3Step = params.fft_3StepTwiddle; }
+ bool tw3Step = false;
+
+
+ if(p == passes.begin() && params.fft_twiddleFront ) { tw3Step = params.fft_3StepTwiddle; }
+ if((p+1) == passes.end()) { s = scale; if(!params.fft_twiddleFront) tw3Step = params.fft_3StepTwiddle; }
+
+ if(blockCompute && !r2c2r)
+ {
+ inIlvd = ldsInterleaved;
+ outIlvd = ldsInterleaved;
+ }
+ else
+ {
+ if(p == passes.begin()) { inIlvd = inInterleaved; inRl = inReal; gIn = true; ins = params.fft_inStride[0]; }
+ if((p+1) == passes.end()) { outIlvd = outInterleaved; outRl = outReal; gOut = true; outs = params.fft_outStride[0]; }
+
+ if(p != passes.begin()) { inIlvd = ldsInterleaved; }
+ if((p+1) != passes.end()) { outIlvd = ldsInterleaved; }
+ }
p->GeneratePass(fwd, str, tw3Step, inIlvd, outIlvd, inRl, outRl, ins, outs, s, gIn, gOut);
}
@@ -2475,6 +2493,8 @@ namespace StockhamGenerator
break;
}
+
+
// TODO : address this kludge
str += " typedef union { uint u; int i; } cb_t;\n\n";
@@ -2494,7 +2514,9 @@ namespace StockhamGenerator
// FFT kernel begin
// Function attribute
str += "__kernel __attribute__((reqd_work_group_size (";
- str += SztToStr(workGroupSize); str += ",1,1)))\nvoid ";
+ if(blockCompute) str += SztToStr(blockWGS);
+ else str += SztToStr(workGroupSize);
+ str += ",1,1)))\nvoid ";
// Function name
if(fwd) str += "fft_fwd";
@@ -2610,13 +2632,25 @@ namespace StockhamGenerator
str += "uint batch = get_group_id(0);";
str += "\n";
+
+
// Allocate LDS
- size_t ldsSize = halfLds ? length*numTrans : 2*length*numTrans;
- if(numPasses > 1)
+ if(blockCompute)
{
- str += "\n\t";
- str += "__local "; str += rType; str += " lds[";
- str += SztToStr(ldsSize); str += "];\n";
+ str += "\n\t"; str += "__local "; str += r2Type; str += " lds[";
+ str += SztToStr(blockLDS); str += "];\n";
+ }
+ else
+ {
+ size_t ldsSize = halfLds ? length*numTrans : 2*length*numTrans;
+ ldsSize = ldsInterleaved ? ldsSize/2 : ldsSize;
+
+ if(numPasses > 1)
+ {
+ str += "\n\t";
+ str += "__local "; str += ldsInterleaved ? r2Type: rType; str += " lds[";
+ str += SztToStr(ldsSize); str += "];\n";
+ }
}
// Declare memory pointers
@@ -2715,7 +2749,7 @@ namespace StockhamGenerator
}
// Setup registers if needed
- if(halfLds)
+ if(linearRegs)
{
str += "\t"; str += RegBaseType<PR>(2);
str += " "; str += IterRegs("", false);
@@ -2730,7 +2764,7 @@ namespace StockhamGenerator
totalBatch += SztToStr(params.fft_N[i+1]); totalBatch += " * ";
i++;
}
- totalBatch += "cb["; totalBatch += SztToStr(i); totalBatch += "].u)";
+ totalBatch += "cb[0].u)";
// Conditional read-write ('rw') for arbitrary batch number
if(r2c2r && !rcSimple)
@@ -2742,16 +2776,20 @@ namespace StockhamGenerator
}
else
{
- if(numTrans > 1)
+ if( (numTrans > 1) && !blockCompute )
{
str += "\tuint rw = (me < ("; str += totalBatch;
str += " - batch*"; str += SztToStr(numTrans); str += ")*";
str += SztToStr(workGroupSizePerTrans); str += ") ? 1 : 0;\n\n";
}
+ else
+ {
+ str += "\tuint rw = 1;\n\n";
+ }
}
// Transform index for 3-step twiddles
- if(params.fft_3StepTwiddle)
+ if(params.fft_3StepTwiddle && !blockCompute)
{
if(numTrans == 1)
{
@@ -2831,7 +2869,10 @@ namespace StockhamGenerator
{
if(params.fft_placeness == CLFFT_INPLACE)
{
- str += OffsetCalc("ioOffset", true);
+ if(blockCompute)
+ str += OffsetCalcBlock("ioOffset", true);
+ else
+ str += OffsetCalc("ioOffset", true);
str += "\t";
if(inInterleaved)
@@ -2846,8 +2887,16 @@ namespace StockhamGenerator
}
else
{
- str += OffsetCalc("iOffset", true);
- str += OffsetCalc("oOffset", false);
+ if(blockCompute)
+ {
+ str += OffsetCalcBlock("iOffset", true);
+ str += OffsetCalcBlock("oOffset", false);
+ }
+ else
+ {
+ str += OffsetCalc("iOffset", true);
+ str += OffsetCalc("oOffset", false);
+ }
str += "\t";
if(inInterleaved)
@@ -2872,6 +2921,55 @@ namespace StockhamGenerator
}
}
+
+ // Read data into LDS for blocked access
+ if(blockCompute)
+ {
+
+ size_t loopCount = (length * blockWidth)/blockWGS;
+
+ str += "\n\tfor(uint t=0; t<"; str += SztToStr(loopCount);
+ str += "; t++)\n\t{\n";
+
+ for(size_t c=0; c<2; c++)
+ {
+ std::string comp = "";
+ std::string readBuf = (params.fft_placeness == CLFFT_INPLACE) ? "lwb" : "lwbIn";
+ if(!inInterleaved) comp = c ? ".y" : ".x";
+ if(!inInterleaved)
+ readBuf = (params.fft_placeness == CLFFT_INPLACE) ? (c ? "lwbIm" : "lwbRe") : (c ? "lwbInIm" : "lwbInRe");
+
+ if( (blockComputeType == BCT_C2C) || (blockComputeType == BCT_C2R) )
+ {
+ str += "\t\tR0"; str+= comp; str+= " = "; str += readBuf; str += "[(me%"; str+= SztToStr(blockWidth); str += ") + ";
+ str += "(me/"; str+= SztToStr(blockWidth); str+= ")*"; str += SztToStr(params.fft_inStride[0]);
+ str += " + t*"; str += SztToStr(params.fft_inStride[0]*blockWGS/blockWidth); str += "];\n";
+ }
+ else
+ {
+ str += "\t\tR0"; str+= comp; str+= " = "; str += readBuf; str += "[me + t*"; str += SztToStr(blockWGS); str += "];\n";
+ }
+
+
+ if(inInterleaved) break;
+ }
+
+ if( (blockComputeType == BCT_C2C) || (blockComputeType == BCT_C2R) )
+ {
+ str += "\t\tlds[t*"; str += SztToStr(blockWGS/blockWidth); str += " + ";
+ str += "(me%"; str+= SztToStr(blockWidth); str+= ")*"; str += SztToStr(length); str += " + ";
+ str += "(me/"; str+= SztToStr(blockWidth); str+= ")] = R0;"; str +="\n";
+ }
+ else
+ {
+ str += "\t\tlds[t*"; str += SztToStr(blockWGS); str += " + me] = R0;"; str +="\n";
+ }
+
+ str += "\t}\n\n";
+ str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n\n";
+ }
+
+
// Set rw and 'me' per transform
// rw string also contains 'b'
std::string rw, me;
@@ -2882,6 +2980,8 @@ namespace StockhamGenerator
if(numTrans > 1) { me += "me%"; me += SztToStr(workGroupSizePerTrans); me += ", "; }
else { me += "me, "; }
+ if(blockCompute) { me = "me%"; me += SztToStr(workGroupSizePerTrans); me += ", "; }
+
// Buffer strings
std::string inBuf, outBuf;
if(r2c2r)
@@ -2917,6 +3017,24 @@ namespace StockhamGenerator
}
}
+
+ if(blockCompute)
+ {
+ str += "\n\tfor(uint t=0; t<"; str += SztToStr(blockWidth/(blockWGS/workGroupSizePerTrans));
+ str += "; t++)\n\t{\n\n";
+
+ inBuf = "lds, ";
+ outBuf = "lds";
+
+ if(params.fft_3StepTwiddle)
+ {
+ str += "\t\tb = (batch%"; str += SztToStr(params.fft_N[1]/blockWidth); str += ")*";
+ str += SztToStr(blockWidth); str += " + t*"; str += SztToStr(blockWGS/workGroupSizePerTrans);
+ str += " + (me/"; str += SztToStr(workGroupSizePerTrans); str += ");\n\n";
+ }
+ }
+
+
// Call passes
if(numPasses == 1)
{
@@ -2932,40 +3050,55 @@ namespace StockhamGenerator
{
for(typename std::vector<Pass<PR> >::const_iterator p = passes.begin(); p != passes.end(); p++)
{
+ std::string exTab = "";
+ if(blockCompute) exTab = "\t";
+
+ str += exTab;
str += "\t";
str += PassName(p->GetPosition(), fwd);
str += "(";
std::string ldsOff;
- if(numTrans > 1)
+ if(blockCompute)
{
- ldsOff += "(me/"; ldsOff += SztToStr(workGroupSizePerTrans);
- ldsOff += ")*"; ldsOff += SztToStr(length);
+ ldsOff += "t*"; ldsOff += SztToStr(length*(blockWGS/workGroupSizePerTrans)); ldsOff += " + (me/";
+ ldsOff += SztToStr(workGroupSizePerTrans); ldsOff += ")*"; ldsOff += SztToStr(length);
}
else
{
- ldsOff += "0";
+ if(numTrans > 1)
+ {
+ ldsOff += "(me/"; ldsOff += SztToStr(workGroupSizePerTrans);
+ ldsOff += ")*"; ldsOff += SztToStr(length);
+ }
+ else
+ {
+ ldsOff += "0";
+ }
}
std::string ldsArgs;
if(halfLds) { ldsArgs += "lds, lds"; }
- else { ldsArgs += "lds, lds + "; ldsArgs += SztToStr(length*numTrans); }
+ else { if(ldsInterleaved) { ldsArgs += "lds"; }
+ else { ldsArgs += "lds, lds + "; ldsArgs += SztToStr(length*numTrans); } }
str += rw; str += me;
if(p == passes.begin()) // beginning pass
{
- str += "0, ";
+ str += blockCompute ? ldsOff : "0";
+ str += ", ";
str += ldsOff;
str += ", ";
str += inBuf;
str += ldsArgs; str += IterRegs("&"); str += ");\n";
- if(!halfLds) str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ if(!halfLds) { str += exTab; str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n"; }
}
else if((p+1) == passes.end()) // ending pass
{
str += ldsOff;
str += ", ";
- str += "0, ";
+ str += blockCompute ? ldsOff : "0";
+ str += ", ";
str += ldsArgs; str += ", ";
str += outBuf;
str += IterRegs("&"); str += ");\n";
@@ -2978,11 +3111,66 @@ namespace StockhamGenerator
str += ", ";
str += ldsArgs; str += ", ";
str += ldsArgs; str += IterRegs("&"); str += ");\n";
- if(!halfLds) str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+ if(!halfLds) { str += exTab; str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n"; }
}
}
}
+
+ if(blockCompute)
+ {
+ str += "\n\t}\n\n";
+ }
+
+
+ // Write data from LDS for blocked access
+ if(blockCompute)
+ {
+
+ size_t loopCount = (length * blockWidth)/blockWGS;
+
+ str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n\n";
+ str += "\n\tfor(uint t=0; t<"; str += SztToStr(loopCount);
+ str += "; t++)\n\t{\n";
+
+ if( (blockComputeType == BCT_C2C) || (blockComputeType == BCT_R2C) )
+ {
+ str += "\t\tR0 = lds[t*"; str += SztToStr(blockWGS/blockWidth); str += " + ";
+ str += "(me%"; str+= SztToStr(blockWidth); str+= ")*"; str += SztToStr(length); str += " + ";
+ str += "(me/"; str+= SztToStr(blockWidth); str+= ")];"; str +="\n";
+ }
+ else
+ {
+ str += "\t\tR0 = lds[t*"; str += SztToStr(blockWGS); str += " + me];"; str +="\n";
+ }
+
+ for(size_t c=0; c<2; c++)
+ {
+ std::string comp = "";
+ std::string writeBuf = (params.fft_placeness == CLFFT_INPLACE) ? "lwb" : "lwbOut";
+ if(!outInterleaved) comp = c ? ".y" : ".x";
+ if(!outInterleaved)
+ writeBuf = (params.fft_placeness == CLFFT_INPLACE) ? (c ? "lwbIm" : "lwbRe") : (c ? "lwbOutIm" : "lwbOutRe");
+
+ if( (blockComputeType == BCT_C2C) || (blockComputeType == BCT_R2C) )
+ {
+ str += "\t\t"; str += writeBuf; str += "[(me%"; str+= SztToStr(blockWidth); str += ") + ";
+ str += "(me/"; str+= SztToStr(blockWidth); str+= ")*"; str += SztToStr(params.fft_outStride[0]);
+ str += " + t*"; str += SztToStr(params.fft_outStride[0]*blockWGS/blockWidth); str += "] = R0"; str+= comp; str += ";\n";
+ }
+ else
+ {
+ str += "\t\t"; str += writeBuf; str += "[me + t*"; str += SztToStr(blockWGS); str += "] = R0"; str+= comp; str += ";\n";
+ }
+
+ if(outInterleaved) break;
+ }
+
+ str += "\t}\n\n";
+ }
+
+
+
str += "}\n\n";
if(r2c2r)
@@ -3011,6 +3199,10 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Stockham> (FFTKernelGenKeyParams & param
params.fft_inputLayout = this->inputLayout;
params.fft_MaxWorkGroupSize = this->envelope.limit_WorkGroupSize;
+ ARG_CHECK(this->length.size() > 0);
+ ARG_CHECK(this->inStride.size() > 0);
+ ARG_CHECK(this->outStride.size() > 0);
+
ARG_CHECK (this->inStride.size() == this->outStride.size())
bool real_transform = ((this->inputLayout == CLFFT_REAL) || (this->outputLayout == CLFFT_REAL));
@@ -3029,93 +3221,26 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Stockham> (FFTKernelGenKeyParams & param
params.fft_outputLayout = this->outputLayout;
}
- switch (this->inStride.size()) {
- // 1-D array is a 2-D data structure.
- // 1-D unit is a special case of 1-D array.
- case 1:
- ARG_CHECK(this->length .size() > 0);
- ARG_CHECK(this->outStride.size() > 0);
- params.fft_DataDim = 2;
- params.fft_N[0] = this->length[0];
- params.fft_inStride[0] = this->inStride[0];
- params.fft_inStride[1] = this->iDist;
- params.fft_outStride[0] = this->outStride[0];
- params.fft_outStride[1] = this->oDist;
- break;
-
- // 2-D array is a 3-D data structure
- // 2-D unit is a speical case of 2-D array.
- case 2:
- ARG_CHECK(this->length .size() > 1);
- ARG_CHECK(this->outStride.size() > 1);
- params.fft_DataDim = 3;
- params.fft_N[0] = this->length[0];
- params.fft_N[1] = this->length[1];
- params.fft_inStride[0] = this->inStride[0];
- params.fft_inStride[1] = this->inStride[1];
- params.fft_inStride[2] = this->iDist;
- params.fft_outStride[0] = this->outStride[0];
- params.fft_outStride[1] = this->outStride[1];
- params.fft_outStride[2] = this->oDist;
- break;
-
- // 3-D array is a 4-D data structure
- // 3-D unit is a special case of 3-D array.
- case 3:
- ARG_CHECK(this->length .size() > 2);
- ARG_CHECK(this->outStride.size() > 2);
- params.fft_DataDim = 4;
- params.fft_N[0] = this->length[0];
- params.fft_N[1] = this->length[1];
- params.fft_N[2] = this->length[2];
- params.fft_inStride[0] = this->inStride[0];
- params.fft_inStride[1] = this->inStride[1];
- params.fft_inStride[2] = this->inStride[2];
- params.fft_inStride[3] = this->iDist;
- params.fft_outStride[0] = this->outStride[0];
- params.fft_outStride[1] = this->outStride[1];
- params.fft_outStride[2] = this->outStride[2];
- params.fft_outStride[3] = this->oDist;
- break;
-
- // 5-D data structure
- // This can occur when a large dimension is split into two for
- // the "3-step" algorithm.
- //
- case 4:
- ARG_CHECK(this->length .size() > 3);
- ARG_CHECK(this->outStride.size() > 3);
- params.fft_DataDim = 5;
- params.fft_N[0] = this->length[0];
- params.fft_N[1] = this->length[1];
- params.fft_N[2] = this->length[2];
- params.fft_N[3] = this->length[3];
- params.fft_inStride[0] = this->inStride[0];
- params.fft_inStride[1] = this->inStride[1];
- params.fft_inStride[2] = this->inStride[2];
- params.fft_inStride[3] = this->inStride[3];
- params.fft_inStride[4] = this->iDist;
- params.fft_outStride[0] = this->outStride[0];
- params.fft_outStride[1] = this->outStride[1];
- params.fft_outStride[2] = this->outStride[2];
- params.fft_outStride[3] = this->outStride[3];
- params.fft_outStride[4] = this->oDist;
- break;
- default:
- ARG_CHECK (false);
- }
+ params.fft_DataDim = this->length.size() + 1;
+ int i = 0;
+ for(i = 0; i < (params.fft_DataDim - 1); i++)
+ {
+ params.fft_N[i] = this->length[i];
+ params.fft_inStride[i] = this->inStride[i];
+ params.fft_outStride[i] = this->outStride[i];
- // TODO: we could simplify the address calculations in the kernel
- // when the input data is contiguous.
- // For example, a 3-D data structure with
- // lengths: [*, 64, *]
- // strides: [*, 1024, 65536]
- // could be reduced to a 2-D data structure.
+ }
+ params.fft_inStride[i] = this->iDist;
+ params.fft_outStride[i] = this->oDist;
- params.fft_LdsComplex = this->bLdsComplex;
params.fft_RCsimple = this->RCsimple;
+ params.blockCompute = this->blockCompute;
+ params.blockComputeType = this->blockComputeType;
+
+ params.fft_twiddleFront = this->twiddleFront;
+
size_t wgs, nt;
#ifdef PARMETERS_TO_BE_READ
ParamRead pr;
@@ -3131,11 +3256,21 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Stockham> (FFTKernelGenKeyParams & param
{
KernelCoreSpecs<P_SINGLE> kcs;
kcs.GetWGSAndNT(params.fft_N[0], t_wgs, t_nt);
+ if(params.blockCompute)
+ {
+ params.blockSIMD = Kernel<P_SINGLE>::BlockSizes::BlockWorkGroupSize(params.fft_N[0]);
+ params.blockLDS = Kernel<P_SINGLE>::BlockSizes::BlockLdsSize(params.fft_N[0]);
+ }
} break;
case P_DOUBLE:
{
KernelCoreSpecs<P_DOUBLE> kcs;
kcs.GetWGSAndNT(params.fft_N[0], t_wgs, t_nt);
+ if(params.blockCompute)
+ {
+ params.blockSIMD = Kernel<P_DOUBLE>::BlockSizes::BlockWorkGroupSize(params.fft_N[0]);
+ params.blockLDS = Kernel<P_DOUBLE>::BlockSizes::BlockLdsSize(params.fft_N[0]);
+ }
} break;
}
@@ -3155,14 +3290,11 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Stockham> (FFTKernelGenKeyParams & param
params.fft_SIMD = wgs;
- params.fft_MaxRadix = params.fft_R;
- params.fft_UseFMA = true;
-
if (this->large1D != 0) {
ARG_CHECK (params.fft_N[0] != 0)
ARG_CHECK ((this->large1D % params.fft_N[0]) == 0)
params.fft_3StepTwiddle = true;
- params.fft_N[1] = this->large1D / params.fft_N[0];
+ ARG_CHECK ( this->large1D == (params.fft_N[1] * params.fft_N[0]) );
}
params.fft_fwdScale = this->forwardScale;
@@ -3182,11 +3314,21 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Stockham> (std::vector<size_t> & globalWS,
}
count *= this->batchsize;
-
FFTKernelGenKeyParams fftParams;
// Translate the user plan into the structure that we use to map plans to clPrograms
OPENCL_V( this->GetKernelGenKeyPvt<Stockham>( fftParams ), _T("GetKernelGenKey() failed!") );
+ if(fftParams.blockCompute)
+ {
+ count = DivRoundingUp<unsigned long long> (count, fftParams.blockLDS);
+ count = count * fftParams.blockSIMD;
+
+ globalWS.push_back( static_cast< size_t >( count ) );
+ localWS.push_back( fftParams.blockSIMD );
+
+ return CLFFT_SUCCESS;
+ }
+
count = DivRoundingUp<unsigned long long> (count, fftParams.fft_R); // count of WorkItems
count = DivRoundingUp<unsigned long long> (count, fftParams.fft_SIMD); // count of WorkGroups
@@ -3229,7 +3371,7 @@ clfftStatus FFTPlan::GetMax1DLengthPvt<Stockham> (size_t * longest) const
}
template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_command_queue commQueueFFT ) const
+clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
{
FFTKernelGenKeyParams params;
OPENCL_V( this->GetKernelGenKeyPvt<Stockham> (params), _T("GetKernelGenKey() failed!") );
@@ -3237,12 +3379,10 @@ clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_comm
cl_int status = CL_SUCCESS;
cl_device_id Device = NULL;
status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
-
OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
cl_context QueueContext = NULL;
status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
-
OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
std::string programCode;
@@ -3265,8 +3405,8 @@ clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_comm
ReadKernelFromFile(programCode);
#endif
- OPENCL_V( fftRepo.setProgramCode( Stockham, params, programCode, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
- OPENCL_V( fftRepo.setProgramEntryPoints( Stockham, params, "fft_fwd", "fft_back", QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+ OPENCL_V( fftRepo.setProgramCode( Stockham, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+ OPENCL_V( fftRepo.setProgramEntryPoints( Stockham, params, "fft_fwd", "fft_back", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
return CLFFT_SUCCESS;
}
diff --git a/src/library/generator.stockham.h b/src/library/generator.stockham.h
index bba7d64..201e4a0 100644
--- a/src/library/generator.stockham.h
+++ b/src/library/generator.stockham.h
@@ -79,6 +79,7 @@ namespace StockhamGenerator
return ss.str();
}
+
// Find the smallest power of 2 that is >= n; return its power of 2 factor
// e.g., CeilPo2 (7) returns 3 : (2^3 >= 7)
inline size_t CeilPo2 (size_t n)
@@ -131,6 +132,7 @@ namespace StockhamGenerator
return result;
}
+
// Register data base types
template <Precision PR>
inline std::string RegBaseType(size_t count)
@@ -209,6 +211,123 @@ namespace StockhamGenerator
return "TW3step";
}
+
+
+ // Twiddle factors table for large N
+ // used in 3-step algorithm
+ class TwiddleTableLarge
+ {
+ size_t N; // length
+ size_t X, Y;
+ size_t tableSize;
+ double *wc, *ws; // cosine, sine arrays
+
+ public:
+ TwiddleTableLarge(size_t length) : N(length)
+ {
+ X = size_t(1) << ARBITRARY::TWIDDLE_DEE;
+ Y = DivRoundingUp<size_t> (CeilPo2(N), ARBITRARY::TWIDDLE_DEE);
+ tableSize = X * Y;
+
+ // Allocate memory for the tables
+ wc = new double[tableSize];
+ ws = new double[tableSize];
+ }
+
+ ~TwiddleTableLarge()
+ {
+ // Free
+ delete[] wc;
+ delete[] ws;
+ }
+
+ template <Precision PR>
+ void GenerateTwiddleTable(std::string &twStr)
+ {
+ const double TWO_PI = -6.283185307179586476925286766559;
+
+ // Generate the table
+ size_t nt = 0;
+ double phi = TWO_PI / double (N);
+ for (size_t iY = 0; iY < Y; ++iY)
+ {
+ size_t i = size_t(1) << (iY * ARBITRARY::TWIDDLE_DEE);
+ for (size_t iX = 0; iX < X; ++iX)
+ {
+ size_t j = i * iX;
+
+ double c = cos(phi * (double)j);
+ double s = sin(phi * (double)j);
+
+ //if (fabs(c) < 1.0E-12) c = 0.0;
+ //if (fabs(s) < 1.0E-12) s = 0.0;
+
+ wc[nt] = c;
+ ws[nt++] = s;
+ }
+ }
+
+ std::string sfx = FloatSuffix<PR>();
+
+ // Stringize the table
+ std::stringstream ss;
+ nt = 0;
+
+ ss << "\n __constant ";
+ ss << RegBaseType<PR>(2);
+ ss << " " << TwTableLargeName();
+ ss << "[" << Y << "][" << X << "] = {\n";
+ for (size_t iY = 0; iY < Y; ++iY)
+ {
+ ss << "{ ";
+ for (size_t iX = 0; iX < X; ++iX)
+ {
+ char cv[64], sv[64];
+ sprintf(cv, "%036.34lf", wc[nt]);
+ sprintf(sv, "%036.34lf", ws[nt++]);
+ ss << "("; ss << RegBaseType<PR>(2); ss << ")(";
+ ss << cv; ss << sfx; ss << ", ";
+ ss << sv; ss << sfx; ss << ")";
+ ss << ", ";
+ }
+ ss << " },\n";
+ }
+ ss << "};\n\n";
+
+
+ // Twiddle calc function
+ ss << "__attribute__((always_inline)) ";
+ ss << RegBaseType<PR>(2);
+ ss << "\n" << TwTableLargeFunc() << "(uint u)\n{\n";
+
+ ss << "\t" "uint j = u & " << unsigned(X-1) << ";\n";
+ ss << "\t" ; ss << RegBaseType<PR>(2); ss << " result = ";
+ ss << TwTableLargeName();
+ ss << "[0][j];\n";
+
+ for (size_t iY = 1; iY < Y; ++iY)
+ {
+ std::string phasor = TwTableLargeName();
+ phasor += "[";
+ phasor += SztToStr(iY);
+ phasor += "][j]";
+
+ stringpair product = ComplexMul((RegBaseType<PR>(2)).c_str(), "result", phasor.c_str());
+
+ ss << "\t" "u >>= " << unsigned (ARBITRARY::TWIDDLE_DEE) << ";\n";
+ ss << "\t" "j = u & " << unsigned(X-1) << ";\n";
+ ss << "\t" "result = " << product.first << "\n";
+ ss << "\t" "\t" << product.second <<";\n";
+ }
+ ss << "\t" "return result;\n}\n\n";
+
+ twStr += ss.str();
+ }
+ };
+
+
+
+
// FFT butterfly
template <Precision PR>
class Butterfly
@@ -1181,165 +1300,6 @@ namespace StockhamGenerator
}
}
} break;
- case 16:
- {
- if(fwd)
- {
- if(cReg)
- {
- bflyStr +=
-
- "(*R1) = (*R0) - (*R1);\n\t"
- "(*R0) = 2.0f * (*R0) - (*R1);\n\t"
- "(*R3) = (*R2) - (*R3);\n\t"
- "(*R2) = 2.0f * (*R2) - (*R3);\n\t"
- "(*R5) = (*R4) - (*R5);\n\t"
- "(*R4) = 2.0f * (*R4) - (*R5);\n\t"
- "(*R7) = (*R6) - (*R7);\n\t"
- "(*R6) = 2.0f * (*R6) - (*R7);\n\t"
- "(*R9) = (*R8) - (*R9);\n\t"
- "(*R8) = 2.0f * (*R8) - (*R9);\n\t"
- "(*R11) = (*R10) - (*R11);\n\t"
- "(*R10) = 2.0f * (*R10) - (*R11);\n\t"
- "(*R13) = (*R12) - (*R13);\n\t"
- "(*R12) = 2.0f * (*R12) - (*R13);\n\t"
- "(*R15) = (*R14) - (*R15);\n\t"
- "(*R14) = 2.0f * (*R14) - (*R15);\n\t"
- "\n\t"
- "(*R2) = (*R0) - (*R2);\n\t"
- "(*R0) = 2.0f * (*R0) - (*R2);\n\t"
- "(*R3) = (*R1) + (fvect2)(-(*R3).y, (*R3).x);\n\t"
- "(*R1) = 2.0f * (*R1) - (*R3);\n\t"
- "(*R6) = (*R4) - (*R6);\n\t"
- "(*R4) = 2.0f * (*R4) - (*R6);\n\t"
- "(*R7) = (*R5) + (fvect2)(-(*R7).y, (*R7).x);\n\t"
- "(*R5) = 2.0f * (*R5) - (*R7);\n\t"
- "(*R10) = (*R8) - (*R10);\n\t"
- "(*R8) = 2.0f * (*R8) - (*R10);\n\t"
- "(*R11) = (*R9) + (fvect2)(-(*R11).y, (*R11).x);\n\t"
- "(*R9) = 2.0f * (*R9) - (*R11);\n\t"
- "(*R14) = (*R12) - (*R14);\n\t"
- "(*R12) = 2.0f * (*R12) - (*R14);\n\t"
- "(*R15) = (*R13) + (fvect2)(-(*R15).y, (*R15).x);\n\t"
- "(*R13) = 2.0f * (*R13) - (*R15);\n\t"
- "\n\t"
- "(*R4) = (*R0) - (*R4);\n\t"
- "(*R0) = 2.0f * (*R0) - (*R4);\n\t"
- "(*R5) = ((*R1) - C8Q * (*R5)) - C8Q * (fvect2)((*R5).y, -(*R5).x);\n\t"
- "(*R1) = 2.0f * (*R1) - (*R5);\n\t"
- "(*R6) = (*R2) + (fvect2)(-(*R6).y, (*R6).x);\n\t"
- "(*R2) = 2.0f * (*R2) - (*R6);\n\t"
- "(*R7) = ((*R3) + C8Q * (*R7)) - C8Q * (fvect2)((*R7).y, -(*R7).x);\n\t"
- "(*R3) = 2.0f * (*R3) - (*R7);\n\t"
- "(*R12) = (*R8) - (*R12);\n\t"
- "(*R8) = 2.0f * (*R8) - (*R12);\n\t"
- "(*R13) = ((*R9) - C8Q * (*R13)) - C8Q * (fvect2)((*R13).y, -(*R13).x);\n\t"
- "(*R9) = 2.0f * (*R9) - (*R13);\n\t"
- "(*R14) = (*R10) + (fvect2)(-(*R14).y, (*R14).x);\n\t"
- "(*R10) = 2.0f * (*R10) - (*R14);\n\t"
- "(*R15) = ((*R11) + C8Q * (*R15)) - C8Q * (fvect2)((*R15).y, -(*R15).x);\n\t"
- "(*R11) = 2.0f * (*R11) - (*R15);\n\t"
- "\n\t"
- "(*R8) = (*R0) - (*R8);\n\t"
- "(*R0) = 2.0f * (*R0) - (*R8);\n\t"
- "(*R9) = ((*R1) - 0.92387953251128675612818318939679 * (*R9)) - 0.3826834323650897717284599840304 * (fvect2)((*R9).y, -(*R9).x);\n\t"
- "(*R1) = 2.0f * (*R1) - (*R9);\n\t"
- "(*R10) = ((*R2) - C8Q * (*R10)) - C8Q * (fvect2)((*R10).y, -(*R10).x);\n\t"
- "(*R2) = 2.0f * (*R2) - (*R10);\n\t"
- "(*R11) = ((*R3) - 0.3826834323650897717284599840304 * (*R11)) - 0.92387953251128675612818318939679 * (fvect2)((*R11).y, -(*R11).x);\n\t"
- "(*R3) = 2.0f * (*R3) - (*R11);\n\t"
- "(*R12) = (*R4) + (fvect2)(-(*R12).y, (*R12).x);\n\t"
- "(*R4) = 2.0f * (*R4) - (*R12);\n\t"
- "(*R13) = ((*R5) + 0.3826834323650897717284599840304 * (*R13)) - 0.92387953251128675612818318939679 * (fvect2)((*R13).y, -(*R13).x);\n\t"
- "(*R5) = 2.0f * (*R5) - (*R13);\n\t"
- "(*R14) = ((*R6) + C8Q * (*R14)) - C8Q * (fvect2)((*R14).y, -(*R14).x);\n\t"
- "(*R6) = 2.0f * (*R6) - (*R14);\n\t"
- "(*R15) = ((*R7) + 0.92387953251128675612818318939679 * (*R15)) - 0.3826834323650897717284599840304 * (fvect2)((*R15).y, -(*R15).x);\n\t"
- "(*R7) = 2.0f * (*R7) - (*R15);\n\t";
-
- }
- else
- assert(false);
- }
- else
- {
- if(cReg)
- {
- bflyStr +=
-
- "(*R1) = (*R0) - (*R1);\n\t"
- "(*R0) = 2.0f * (*R0) - (*R1);\n\t"
- "(*R3) = (*R2) - (*R3);\n\t"
- "(*R2) = 2.0f * (*R2) - (*R3);\n\t"
- "(*R5) = (*R4) - (*R5);\n\t"
- "(*R4) = 2.0f * (*R4) - (*R5);\n\t"
- "(*R7) = (*R6) - (*R7);\n\t"
- "(*R6) = 2.0f * (*R6) - (*R7);\n\t"
- "(*R9) = (*R8) - (*R9);\n\t"
- "(*R8) = 2.0f * (*R8) - (*R9);\n\t"
- "(*R11) = (*R10) - (*R11);\n\t"
- "(*R10) = 2.0f * (*R10) - (*R11);\n\t"
- "(*R13) = (*R12) - (*R13);\n\t"
- "(*R12) = 2.0f * (*R12) - (*R13);\n\t"
- "(*R15) = (*R14) - (*R15);\n\t"
- "(*R14) = 2.0f * (*R14) - (*R15);\n\t"
- "\n\t"
- "(*R2) = (*R0) - (*R2);\n\t"
- "(*R0) = 2.0f * (*R0) - (*R2);\n\t"
- "(*R3) = (*R1) + (fvect2)((*R3).y, -(*R3).x);\n\t"
- "(*R1) = 2.0f * (*R1) - (*R3);\n\t"
- "(*R6) = (*R4) - (*R6);\n\t"
- "(*R4) = 2.0f * (*R4) - (*R6);\n\t"
- "(*R7) = (*R5) + (fvect2)((*R7).y, -(*R7).x);\n\t"
- "(*R5) = 2.0f * (*R5) - (*R7);\n\t"
- "(*R10) = (*R8) - (*R10);\n\t"
- "(*R8) = 2.0f * (*R8) - (*R10);\n\t"
- "(*R11) = (*R9) + (fvect2)((*R11).y, -(*R11).x);\n\t"
- "(*R9) = 2.0f * (*R9) - (*R11);\n\t"
- "(*R14) = (*R12) - (*R14);\n\t"
- "(*R12) = 2.0f * (*R12) - (*R14);\n\t"
- "(*R15) = (*R13) + (fvect2)((*R15).y, -(*R15).x);\n\t"
- "(*R13) = 2.0f * (*R13) - (*R15);\n\t"
- "\n\t"
- "(*R4) = (*R0) - (*R4);\n\t"
- "(*R0) = 2.0f * (*R0) - (*R4);\n\t"
- "(*R5) = ((*R1) - C8Q * (*R5)) + C8Q * (fvect2)((*R5).y, -(*R5).x);\n\t"
- "(*R1) = 2.0f * (*R1) - (*R5);\n\t"
- "(*R6) = (*R2) + (fvect2)((*R6).y, -(*R6).x);\n\t"
- "(*R2) = 2.0f * (*R2) - (*R6);\n\t"
- "(*R7) = ((*R3) + C8Q * (*R7)) + C8Q * (fvect2)((*R7).y, -(*R7).x);\n\t"
- "(*R3) = 2.0f * (*R3) - (*R7);\n\t"
- "(*R12) = (*R8) - (*R12);\n\t"
- "(*R8) = 2.0f * (*R8) - (*R12);\n\t"
- "(*R13) = ((*R9) - C8Q * (*R13)) + C8Q * (fvect2)((*R13).y, -(*R13).x);\n\t"
- "(*R9) = 2.0f * (*R9) - (*R13);\n\t"
- "(*R14) = (*R10) + (fvect2)((*R14).y, -(*R14).x);\n\t"
- "(*R10) = 2.0f * (*R10) - (*R14);\n\t"
- "(*R15) = ((*R11) + C8Q * (*R15)) + C8Q * (fvect2)((*R15).y, -(*R15).x);\n\t"
- "(*R11) = 2.0f * (*R11) - (*R15);\n\t"
- "\n\t"
- "(*R8) = (*R0) - (*R8);\n\t"
- "(*R0) = 2.0f * (*R0) - (*R8);\n\t"
- "(*R9) = ((*R1) - 0.92387953251128675612818318939679 * (*R9)) + 0.3826834323650897717284599840304 * (fvect2)((*R9).y, -(*R9).x);\n\t"
- "(*R1) = 2.0f * (*R1) - (*R9);\n\t"
- "(*R10) = ((*R2) - C8Q * (*R10)) + C8Q * (fvect2)((*R10).y, -(*R10).x);\n\t"
- "(*R2) = 2.0f * (*R2) - (*R10);\n\t"
- "(*R11) = ((*R3) - 0.3826834323650897717284599840304 * (*R11)) + 0.92387953251128675612818318939679 * (fvect2)((*R11).y, -(*R11).x);\n\t"
- "(*R3) = 2.0f * (*R3) - (*R11);\n\t"
- "(*R12) = (*R4) + (fvect2)((*R12).y, -(*R12).x);\n\t"
- "(*R4) = 2.0f * (*R4) - (*R12);\n\t"
- "(*R13) = ((*R5) + 0.3826834323650897717284599840304 * (*R13)) + 0.92387953251128675612818318939679 * (fvect2)((*R13).y, -(*R13).x);\n\t"
- "(*R5) = 2.0f * (*R5) - (*R13);\n\t"
- "(*R14) = ((*R6) + C8Q * (*R14)) + C8Q * (fvect2)((*R14).y, -(*R14).x);\n\t"
- "(*R6) = 2.0f * (*R6) - (*R14);\n\t"
- "(*R15) = ((*R7) + 0.92387953251128675612818318939679 * (*R15)) + 0.3826834323650897717284599840304 * (fvect2)((*R15).y, -(*R15).x);\n\t"
- "(*R7) = 2.0f * (*R7) - (*R15);\n\t";
-
- }
- else
- assert(false);
- }
- } break;
default:
assert(false);
}
diff --git a/src/library/generator.transpose.gcn.cpp b/src/library/generator.transpose.gcn.cpp
new file mode 100644
index 0000000..1d4a46d
--- /dev/null
+++ b/src/library/generator.transpose.gcn.cpp
@@ -0,0 +1,660 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// clfft.generator.Transpose.cpp : Dynamic run-time generator of openCL transpose kernels
+//
+
+// TODO: generalize the kernel to work with any size
+
+#include "stdafx.h"
+
+#include <math.h>
+#include <iomanip>
+
+#include "generator.transpose.gcn.h"
+#include "generator.stockham.h"
+
+// A structure that represents a bounding box or tile; each extent can be addressed by two names (x/col and y/row).
+// Used in this file for the local work size (lwSize) and for the LDS tile dimensions (ldsSize).
+struct tile
+{
+ union // x and col alias the same storage
+ {
+ size_t x;
+ size_t col;
+ };
+
+ union // y and row alias the same storage
+ {
+ size_t y;
+ size_t row;
+ };
+};
+
+inline std::stringstream& clKernWrite( std::stringstream& rhs, const size_t tabIndex )	// indentation helper for generated kernel text
+{
+	rhs.width( static_cast< std::streamsize >( tabIndex ) ); rhs << "";	// the empty string is padded out to tabIndex fill characters
+	return rhs;	// handed back so the caller can stream the rest of the line
+}
+
+static size_t NumBlocksX(size_t N);
+
+// Emits the OpenCL statements that compute a work-group's starting element offset: "iOffset" (using the input
+// strides) when input is true, "oOffset" (using the output strides) otherwise. The emitted code reuses currDimSize and rowSizeinUnits.
+static void OffsetCalc(std::stringstream& transKernel, const FFTKernelGenKeyParams& params, bool input )
+{
+	const size_t *strides = input ? params.fft_inStride : params.fft_outStride;
+	const std::string offsetName = input ? "iOffset" : "oOffset";
+
+	clKernWrite( transKernel, 3 ) << "size_t " << offsetName << " = 0;" << std::endl;
+	clKernWrite( transKernel, 3 ) << "currDimSize = groupIndex.y;" << std::endl;
+
+	// Walk the outer dimensions from highest to lowest, splitting groupIndex.y into one index per dimension
+	size_t dim = params.fft_DataDim - 2;
+	while( dim > 0 )
+	{
+		clKernWrite( transKernel, 3 ) << offsetName << " += (currDimSize/numGroupsY_" << dim << ")*" << strides[dim+1] << ";" << std::endl;
+		clKernWrite( transKernel, 3 ) << "currDimSize = currDimSize % numGroupsY_" << dim << ";" << std::endl;
+		--dim;
+	}
+
+	clKernWrite( transKernel, 3 ) << "rowSizeinUnits = " << strides[1] << ";" << std::endl;
+
+	if( !params.transOutHorizontal )
+	{
+		// Default layout: currDimSize selects the tile row and groupIndex.x the tile column (swapped for the output)
+		if(input)
+		{
+			clKernWrite( transKernel, 3 ) << offsetName << " += rowSizeinUnits * wgTileExtent.y * wgUnroll * currDimSize;" << std::endl;
+			clKernWrite( transKernel, 3 ) << offsetName << " += groupIndex.x * wgTileExtent.x;" << std::endl;
+		}
+		else
+		{
+			clKernWrite( transKernel, 3 ) << offsetName << " += rowSizeinUnits * wgTileExtent.x * groupIndex.x;" << std::endl;
+			clKernWrite( transKernel, 3 ) << offsetName << " += currDimSize * wgTileExtent.y * wgUnroll;" << std::endl;
+		}
+	}
+	else
+	{
+		// Horizontal variant: the Y group index is additionally folded by the number of X blocks (see the emitted expressions)
+		const size_t numBlocksX = NumBlocksX(params.fft_N[ 0 ]);
+		if(input)
+		{
+			clKernWrite( transKernel, 3 ) << offsetName << " += rowSizeinUnits * wgTileExtent.y * wgUnroll * " << "(groupIndex.x + " << numBlocksX << "*(currDimSize%(numGroupsY_1/" << numBlocksX << ")));" << std::endl;
+			clKernWrite( transKernel, 3 ) << offsetName << " += (currDimSize/(numGroupsY_1/" << numBlocksX << ")) * wgTileExtent.x;" << std::endl;
+		}
+		else
+		{
+			clKernWrite( transKernel, 3 ) << offsetName << " += (currDimSize/(numGroupsY_1/" << numBlocksX << ")) * wgTileExtent.x * rowSizeinUnits;" << std::endl;
+			clKernWrite( transKernel, 3 ) << offsetName << " += wgTileExtent.y * wgUnroll * " << "(groupIndex.x + " << numBlocksX << "*(currDimSize%(numGroupsY_1/" << numBlocksX << ")));" << std::endl;
+		}
+	}
+
+	clKernWrite( transKernel, 3 ) << std::endl;
+}
+
+
+
+
+// Emits the statements that multiply the element held in "tmp" by its twiddle factor W (looked up through TW3step), i.e. the
+// twiddle factors applied to the butterflies. Only emitted when the plan asks for the twiddle multiply inside the transpose.
+static clfftStatus genTwiddleMath( const FFTKernelGenKeyParams& params, std::stringstream& transKernel, const std::string& dtComplex, bool fwd )
+{
+	// Forward multiplies by W, backward by its conjugate: only the sign applied to the W.y terms differs
+	const char* const realPart = fwd ? "T.x = ( W.x * tmp.x ) - ( W.y * tmp.y );"
+	                                 : "T.x = ( W.x * tmp.x ) + ( W.y * tmp.y );";
+	const char* const imagPart = fwd ? "T.y = ( W.y * tmp.x ) + ( W.x * tmp.y );"
+	                                 : "T.y = -( W.y * tmp.x ) + ( W.x * tmp.y );";
+
+	// W is looked up with the product of the element's x and y indices (group offset plus in-tile index)
+	clKernWrite( transKernel, 6 ) << dtComplex << " W = TW3step( (groupIndex.x * wgTileExtent.x + xInd) * (currDimSize * wgTileExtent.y * wgUnroll + yInd) );" << std::endl;
+	clKernWrite( transKernel, 6 ) << dtComplex << " T;" << std::endl;
+
+	clKernWrite( transKernel, 6 ) << realPart << std::endl;
+	clKernWrite( transKernel, 6 ) << imagPart << std::endl;
+
+	// Copy the product back so the surrounding generated code keeps working on tmp
+	clKernWrite( transKernel, 6 ) << "tmp.x = T.x;" << std::endl;
+	clKernWrite( transKernel, 6 ) << "tmp.y = T.y;" << std::endl;
+
+	return CLFFT_SUCCESS;
+}
+
+// Names of the buffer parameters in the generated kernel's signature; the generated kernel body refers to the buffers by the same names
+const std::string pmRealIn( "pmRealIn" ); // real plane of a planar input
+const std::string pmImagIn( "pmImagIn" ); // imaginary plane of a planar input
+const std::string pmRealOut( "pmRealOut" ); // real plane of a planar output
+const std::string pmImagOut( "pmImagOut" ); // imaginary plane of a planar output
+const std::string pmComplexIn( "pmComplexIn" ); // interleaved input
+const std::string pmComplexOut( "pmComplexOut" ); // interleaved output
+
+static clfftStatus genTransposePrototype( const FFTKernelGenKeyParams& params, const tile& lwSize, const std::string& dtPlanar, const std::string& dtComplex,
+ const std::string &funcName, std::stringstream& transKernel, std::string& dtInput, std::string& dtOutput )
+{
+ // Writes the kernel signature (attributes, name, buffer parameters, opening brace) to transKernel and reports the element types chosen for input/output through dtInput/dtOutput.
+ // Returns CLFFT_TRANSPOSED_NOTIMPLEMENTED for layouts or placeness values it has no signature for; text already written to transKernel is left in place in that case.
+ clKernWrite( transKernel, 0 ) << "__attribute__(( reqd_work_group_size( " << lwSize.x << ", " << lwSize.y << ", 1 ) ))" << std::endl;
+ clKernWrite( transKernel, 0 ) << "kernel void" << std::endl;
+
+ clKernWrite( transKernel, 0 ) << funcName << "( ";
+
+ switch( params.fft_inputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ dtInput = dtComplex;
+ clKernWrite( transKernel, 0 ) << "global " << dtInput << "* restrict " << pmComplexIn;
+
+ switch( params.fft_placeness )
+ {
+ case CLFFT_INPLACE:
+ dtOutput = dtComplex; // in-place: no separate output parameter is emitted
+ break;
+ case CLFFT_OUTOFPLACE:
+ switch( params.fft_outputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ dtOutput = dtComplex;
+ clKernWrite( transKernel, 0 ) << ", global " << dtOutput << "* restrict " << pmComplexOut;
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+ dtOutput = dtPlanar;
+ clKernWrite( transKernel, 0 ) << ", global " << dtOutput << "* restrict " << pmRealOut
+ << ", global " << dtOutput << "* restrict " << pmImagOut;
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ case CLFFT_REAL:
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+ break;
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+ dtInput = dtPlanar;
+ clKernWrite( transKernel, 0 ) << "global " << dtInput << "* restrict " << pmRealIn << ", global " << dtInput << "* restrict " << pmImagIn;
+
+ switch( params.fft_placeness )
+ {
+ case CLFFT_INPLACE:
+ dtOutput = dtPlanar; // in-place: no separate output parameters are emitted
+ break;
+ case CLFFT_OUTOFPLACE:
+ switch( params.fft_outputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ dtOutput = dtComplex;
+ clKernWrite( transKernel, 0 ) << ", global " << dtOutput << "* restrict " << pmComplexOut;
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+ dtOutput = dtPlanar;
+ clKernWrite( transKernel, 0 ) << ", global " << dtOutput << "* restrict " << pmRealOut
+ << ", global " << dtOutput << "* restrict " << pmImagOut;
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ case CLFFT_REAL:
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+ break;
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ case CLFFT_REAL:
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+
+ // Close the method signature
+ clKernWrite( transKernel, 0 ) << " )\n{" << std::endl;
+
+ return CLFFT_SUCCESS;
+}
+
+static clfftStatus genTransposeKernel( const FFTKernelGenKeyParams& params, std::string& strKernel, const tile& lwSize, const size_t reShapeFactor,
+ const size_t loopCount, const size_t outRowPadding )
+{ // Builds the OpenCL program text for the transpose described by params and stores it in strKernel; returns CLFFT_TRANSPOSED_NOTIMPLEMENTED for unsupported precision, layout or placeness.
+ strKernel.reserve( 4096 );
+ std::stringstream transKernel( std::stringstream::out );
+ // lwSize is the local work size; reShapeFactor and loopCount shape the tile each work-group handles; outRowPadding is accepted but not read below.
+ // These strings represent the various data types we read or write in the kernel, depending on how the plan
+ // is configured
+ std::string dtInput; // The type read as input into kernel
+ std::string dtOutput; // The type written as output from kernel
+ std::string dtPlanar; // Fundamental type for planar arrays
+ std::string dtComplex; // Fundamental type for complex arrays
+
+ // NOTE: Enable only for debug
+ // clKernWrite( transKernel, 0 ) << "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" << std::endl;
+
+ switch( params.fft_precision )
+ {
+ case CLFFT_SINGLE:
+ case CLFFT_SINGLE_FAST:
+ dtPlanar = "float";
+ dtComplex = "float2";
+ break;
+ case CLFFT_DOUBLE:
+ case CLFFT_DOUBLE_FAST:
+ dtPlanar = "double";
+ dtComplex = "double2";
+
+ // Emit code that enables double precision in the kernel
+ clKernWrite( transKernel, 0 ) << "#ifdef cl_khr_fp64" << std::endl;
+ clKernWrite( transKernel, 3 ) << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" << std::endl;
+ clKernWrite( transKernel, 0 ) << "#else" << std::endl;
+ clKernWrite( transKernel, 3 ) << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable" << std::endl;
+ clKernWrite( transKernel, 0 ) << "#endif\n" << std::endl;
+ break;
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ break;
+ }
+
+
+ // If twiddle computation has been requested, generate the lookup function
+ if(params.fft_3StepTwiddle)
+ {
+ std::string str;
+ StockhamGenerator::TwiddleTableLarge twLarge(params.fft_N[0] * params.fft_N[1]);
+ if( (params.fft_precision == CLFFT_SINGLE) || (params.fft_precision == CLFFT_SINGLE_FAST) )
+ twLarge.GenerateTwiddleTable<StockhamGenerator::P_SINGLE>(str);
+ else
+ twLarge.GenerateTwiddleTable<StockhamGenerator::P_DOUBLE>(str);
+ clKernWrite( transKernel, 0 ) << str << std::endl;
+ clKernWrite( transKernel, 0 ) << std::endl;
+ }
+
+
+ clKernWrite( transKernel, 0 ) << "// Local structure to embody/capture tile dimensions" << std::endl;
+ clKernWrite( transKernel, 0 ) << "typedef struct tag_Tile" << std::endl;
+ clKernWrite( transKernel, 0 ) << "{" << std::endl;
+ clKernWrite( transKernel, 3 ) << "size_t x;" << std::endl;
+ clKernWrite( transKernel, 3 ) << "size_t y;" << std::endl;
+ clKernWrite( transKernel, 0 ) << "} Tile;" << std::endl << std::endl;
+
+ // This detects whether the input matrix is square
+ bool notSquare = ( params.fft_N[ 0 ] == params.fft_N[ 1 ] ) ? false : true;
+
+ if( notSquare && (params.fft_placeness == CLFFT_INPLACE) )
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+
+ // With twiddles two kernels are emitted (forward, then backward); otherwise the loop exits after the first pass
+ for(size_t bothDir=0; bothDir<2; bothDir++)
+ {
+ // Generate the kernel entry point and parameter list
+ //
+ bool fwd = bothDir ? false : true;
+
+ std::string funcName;
+ if(params.fft_3StepTwiddle)
+ funcName = fwd ? "transpose_gcn_tw_fwd" : "transpose_gcn_tw_back";
+ else
+ funcName = "transpose_gcn";
+ // Propagate the prototype's status instead of discarding it: it reports layouts/placeness it cannot emit a signature for
+ OPENCL_V( genTransposePrototype( params, lwSize, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput ), _T( "genTransposePrototype() failed!" ) );
+
+ clKernWrite( transKernel, 3 ) << "const Tile localIndex = { get_local_id( 0 ), get_local_id( 1 ) }; " << std::endl;
+ clKernWrite( transKernel, 3 ) << "const Tile localExtent = { get_local_size( 0 ), get_local_size( 1 ) }; " << std::endl;
+ clKernWrite( transKernel, 3 ) << "const Tile groupIndex = { get_group_id( 0 ), get_group_id( 1 ) };" << std::endl;
+ // clKernWrite( transKernel, 3 ) << "const Tile groupExtent = { get_num_groups( 0 ), get_num_groups( 1 ) }; " << std::endl;
+ clKernWrite( transKernel, 3 ) << std::endl;
+
+ // Debug index code to see what indices we receive
+ //clKernWrite( transKernel, 3 ) << "printf( \"localExtent: (%lu, %lu) \", localExtent.x, localExtent.x );" << std::endl;
+ //clKernWrite( transKernel, 3 ) << "printf( \"localIndex.x: %lu \", localIndex.x );" << std::endl;
+ //clKernWrite( transKernel, 3 ) << "printf( \"localIndex.x: %lu \", localIndex.x );" << std::endl;
+ //clKernWrite( transKernel, 3 ) << "if( localIndex.x == 0 && localIndex.y == 0) {\n" << std::endl;
+ //clKernWrite( transKernel, 6 ) << "printf( \"localIndex.x: %lu \", localIndex.x );" << std::endl;
+ //clKernWrite( transKernel, 6 ) << "printf( \"localIndex.y: %lu \", localIndex.y );" << std::endl;
+ //clKernWrite( transKernel, 6 ) << "printf( \"groupIndex.x: %lu \", groupIndex.x );" << std::endl;
+ //clKernWrite( transKernel, 6 ) << "printf( \"groupIndex.y: %lu\\n\", groupIndex.y );" << std::endl;
+ //clKernWrite( transKernel, 3 ) << "}\n" << std::endl;
+
+ // This is an interesting idea in that we might be able to reshape the input 1D array as a 2D array
+ //clKernWrite( transKernel, 3 ) << "global " << dtInput << " (*myTileIn)[ 4096 ] =(global " << dtInput << " (*)[ 4096 ]) " << pmComplexIn << ";" << std::endl;
+
+
+
+ clKernWrite( transKernel, 3 ) << "// Calculate the unit address (in terms of datatype) of the beginning of the Tile for the WG block" << std::endl;
+ clKernWrite( transKernel, 3 ) << "// Transpose of input & output blocks happens with the Offset calculation" << std::endl;
+ clKernWrite( transKernel, 3 ) << "const size_t reShapeFactor = " << reShapeFactor << ";" << std::endl;
+ clKernWrite( transKernel, 3 ) << "const size_t wgUnroll = " << loopCount << ";" << std::endl;
+ clKernWrite( transKernel, 3 ) << "const Tile wgTileExtent = { localExtent.x * reShapeFactor, localExtent.y / reShapeFactor };" << std::endl;
+ clKernWrite( transKernel, 3 ) << "const size_t tileSizeinUnits = wgTileExtent.x * wgTileExtent.y * wgUnroll;" << std::endl << std::endl;
+
+
+ // This is the size of a matrix in the y dimension in units of group size; used to calculate stride[2] indexing
+ //size_t numGroupsY = DivRoundingUp( params.fft_N[ 1 ], lwSize.y / reShapeFactor * loopCount );
+
+ //numGroupY_1 is the number of cumulative work groups up to 1st dimension
+ //numGroupY_2 is the number of cumulative work groups up to 2nd dimension and so forth
+
+ size_t numGroupsTemp = DivRoundingUp( params.fft_N[1], lwSize.y / reShapeFactor * loopCount );
+ clKernWrite( transKernel, 3 ) << "const size_t numGroupsY_1" << " = " << numGroupsTemp << ";" << std::endl;
+ for(int i = 2; i < params.fft_DataDim - 1; i++)
+ {
+ numGroupsTemp *= params.fft_N[i];
+ clKernWrite( transKernel, 3 ) << "const size_t numGroupsY_" << i << " = " << numGroupsTemp << ";" << std::endl;
+ }
+
+
+ // Generate the amount of local data share we need
+ // Assumption: Even for planar data, we will still store values in LDS as interleaved
+ tile ldsSize = { lwSize.x * reShapeFactor, lwSize.y / reShapeFactor * loopCount };
+ switch( params.fft_outputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ case CLFFT_COMPLEX_PLANAR:
+ clKernWrite( transKernel, 3 ) << "// LDS is always complex and allocated transposed: lds[ wgTileExtent.y * wgUnroll ][ wgTileExtent.x ];" << std::endl;
+ clKernWrite( transKernel, 3 ) << "local " << dtComplex << " lds[ " << ldsSize.x << " ][ " << ldsSize.y << " ];" << std::endl << std::endl;
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ case CLFFT_REAL:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+
+
+ clKernWrite( transKernel, 3 ) << "size_t currDimSize;" << std::endl ;
+ clKernWrite( transKernel, 3 ) << "size_t rowSizeinUnits;" << std::endl << std::endl ;
+
+ // Emit the computation of iOffset, the start of this work-group's input tile
+ OffsetCalc(transKernel, params, true);
+
+
+ switch( params.fft_inputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ clKernWrite( transKernel, 3 ) << "global " << dtInput << "* tileIn = " << pmComplexIn << " + iOffset;" << std::endl;
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+ clKernWrite( transKernel, 3 ) << "global " << dtInput << "* realTileIn = " << pmRealIn << " + iOffset;" << std::endl;
+ clKernWrite( transKernel, 3 ) << "global " << dtInput << "* imagTileIn = " << pmImagIn << " + iOffset;" << std::endl;
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ case CLFFT_REAL:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+
+ // This is the loop reading through the Tile
+ clKernWrite( transKernel, 3 ) << dtComplex << " tmp;" << std::endl;
+ clKernWrite( transKernel, 3 ) << "rowSizeinUnits = " << params.fft_inStride[ 1 ] << ";" << std::endl; // get_num_groups( 0 ) * wgTileExtent.x;" << std::endl;
+ clKernWrite( transKernel, 3 ) << "for( uint t=0; t < wgUnroll; t++ )" << std::endl;
+ clKernWrite( transKernel, 3 ) << "{" << std::endl;
+
+ clKernWrite( transKernel, 6 ) << "size_t xInd = localIndex.x + localExtent.x * ( localIndex.y % wgTileExtent.y ); " << std::endl;
+ clKernWrite( transKernel, 6 ) << "size_t yInd = localIndex.y/wgTileExtent.y + t * wgTileExtent.y; " << std::endl;
+
+ // Calculating the index separately enables easier debugging through tools
+ clKernWrite( transKernel, 6 ) << "size_t gInd = xInd + rowSizeinUnits * yInd;" << std::endl;
+
+ switch( params.fft_inputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ clKernWrite( transKernel, 6 ) << "tmp = tileIn[ gInd ];" << std::endl;
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+ clKernWrite( transKernel, 6 ) << "tmp.s0 = realTileIn[ gInd ];" << std::endl;
+ clKernWrite( transKernel, 6 ) << "tmp.s1 = imagTileIn[ gInd ];" << std::endl;
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ case CLFFT_REAL:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+
+ clKernWrite( transKernel, 6 ) << "// Transpose of Tile data happens here" << std::endl;
+
+
+ // If requested, generate the Twiddle math to multiply constant values
+ if( params.fft_3StepTwiddle )
+ genTwiddleMath( params, transKernel, dtComplex, fwd );
+
+ clKernWrite( transKernel, 6 ) << "lds[ xInd ][ yInd ] = tmp; " << std::endl;
+ clKernWrite( transKernel, 3 ) << "}" << std::endl;
+ clKernWrite( transKernel, 3 ) << std::endl;
+ clKernWrite( transKernel, 3 ) << "barrier( CLK_LOCAL_MEM_FENCE );" << std::endl;
+ clKernWrite( transKernel, 3 ) << std::endl;
+ // Emit the computation of oOffset, the start of this work-group's output tile
+ OffsetCalc(transKernel, params, false);
+
+ // NOTE(review): for CLFFT_INPLACE the prototype declares no pm*Out parameters, yet the names below are still emitted - confirm the in-place path builds
+ switch( params.fft_outputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ clKernWrite( transKernel, 3 ) << "global " << dtOutput << "* tileOut = " << pmComplexOut << " + oOffset;" << std::endl << std::endl;
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+ clKernWrite( transKernel, 3 ) << "global " << dtOutput << "* realTileOut = " << pmRealOut << " + oOffset;" << std::endl;
+ clKernWrite( transKernel, 3 ) << "global " << dtOutput << "* imagTileOut = " << pmImagOut << " + oOffset;" << std::endl;
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ case CLFFT_REAL:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+
+ // Write the transposed values from LDS into global memory
+ clKernWrite( transKernel, 3 ) << "rowSizeinUnits = " << params.fft_outStride[ 1 ] << ";" << std::endl; // get_num_groups( 0 ) * wgTileExtent.x;" << std::endl;
+ clKernWrite( transKernel, 3 ) << "const size_t transposeRatio = wgTileExtent.x / ( wgTileExtent.y * wgUnroll );" << std::endl;
+ clKernWrite( transKernel, 3 ) << "const size_t groupingPerY = wgUnroll / wgTileExtent.y;" << std::endl;
+ clKernWrite( transKernel, 3 ) << "for( uint t=0; t < wgUnroll; t++ )" << std::endl;
+ clKernWrite( transKernel, 3 ) << "{" << std::endl;
+ clKernWrite( transKernel, 6 ) << "size_t xInd = localIndex.x + localExtent.x * ( localIndex.y % groupingPerY ); " << std::endl;
+ clKernWrite( transKernel, 6 ) << "size_t yInd = localIndex.y/groupingPerY + t * (wgTileExtent.y * transposeRatio); " << std::endl;
+ clKernWrite( transKernel, 6 ) << "tmp = lds[ yInd ][ xInd ]; " << std::endl;
+ clKernWrite( transKernel, 6 ) << "size_t gInd = xInd + rowSizeinUnits * yInd;" << std::endl;
+
+ switch( params.fft_outputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ clKernWrite( transKernel, 6 ) << "tileOut[ gInd ] = tmp;" << std::endl;
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+ clKernWrite( transKernel, 6 ) << "realTileOut[ gInd ] = tmp.s0;" << std::endl;
+ clKernWrite( transKernel, 6 ) << "imagTileOut[ gInd ] = tmp.s1;" << std::endl;
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ case CLFFT_REAL:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+
+ clKernWrite( transKernel, 3 ) << "}" << std::endl;
+ clKernWrite( transKernel, 3 ) << std::endl;
+
+ clKernWrite( transKernel, 0 ) << "}\n" << std::endl;
+
+ strKernel = transKernel.str( );
+ //std::cout << strKernel;
+
+ if(!params.fft_3StepTwiddle)
+ break;
+ }
+
+ return CLFFT_SUCCESS;
+}
+
+template<>
+clfftStatus FFTPlan::GetKernelGenKeyPvt<Transpose_GCN> (FFTKernelGenKeyParams & params) const
+{ // Fills params (the key used to generate and look up the transpose kernel) from this plan's settings; argument checks fail through the ARG_CHECK macro.
+ ::memset( &params, 0, sizeof( params ) );
+ params.fft_precision = this->precision;
+ params.fft_placeness = this->placeness;
+ params.fft_inputLayout = this->inputLayout;
+ params.fft_outputLayout = this->outputLayout;
+ params.fft_3StepTwiddle = false;
+
+ params.transOutHorizontal = this->transOutHorizontal; // copies the plan's horizontal-write selection into the key
+ // NOTE(review): the original comment here said the twiddle-front flag was being reused for this purpose,
+ // but the assignment above targets a field named transOutHorizontal - confirm which is current
+
+ ARG_CHECK( this->inStride.size( ) == this->outStride.size( ) );
+
+ if( CLFFT_INPLACE == params.fft_placeness )
+ {
+ // If this is an in-place transform the
+ // input and output layout, dimensions and strides
+ // *MUST* be the same.
+ //
+ ARG_CHECK( params.fft_inputLayout == params.fft_outputLayout )
+
+ for( size_t u = this->inStride.size(); u-- > 0; )
+ {
+ ARG_CHECK( this->inStride[u] == this->outStride[u] );
+ }
+ }
+
+ params.fft_DataDim = this->length.size() + 1; // one extra dimension beyond the lengths; its strides are the batch distances set below
+ int i = 0;
+ for(i = 0; i < (params.fft_DataDim - 1); i++)
+ {
+ params.fft_N[i] = this->length[i];
+ params.fft_inStride[i] = this->inStride[i];
+ params.fft_outStride[i] = this->outStride[i];
+
+ }
+ params.fft_inStride[i] = this->iDist;
+ params.fft_outStride[i] = this->oDist;
+
+ if (this->large1D != 0) { // non-zero large1D enables the twiddle multiply; it must equal fft_N[0] * fft_N[1]
+ ARG_CHECK (params.fft_N[0] != 0)
+ ARG_CHECK ((this->large1D % params.fft_N[0]) == 0)
+ params.fft_3StepTwiddle = true;
+ ARG_CHECK ( this->large1D == (params.fft_N[1] * params.fft_N[0]) );
+ }
+
+ // Query the devices in this context for their local memory sizes
+ // How we generate a kernel depends on the *minimum* LDS size for all devices.
+ //
+ const FFTEnvelope * pEnvelope = NULL;
+ OPENCL_V( this->GetEnvelope( &pEnvelope ), _T( "GetEnvelope failed" ) );
+ BUG_CHECK( NULL != pEnvelope );
+
+ // TODO: Since I am going with a 2D workgroup size now, I need a better check than this 1D use
+ // Check: CL_DEVICE_MAX_WORK_GROUP_SIZE/CL_KERNEL_WORK_GROUP_SIZE
+ // CL_DEVICE_MAX_WORK_ITEM_SIZES
+ params.fft_R = 1; // Dont think i'll use
+ params.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
+
+ return CLFFT_SUCCESS;
+}
+
+// Constants that specify the bounding sizes of the block that each workgroup will transpose
+const tile lwSize = { 16, 16 }; // local work size (x, y); also emitted as the kernel's reqd_work_group_size
+const size_t reShapeFactor = 4; // wgTileSize = { lwSize.x * reShapeFactor, lwSize.y / reShapeFactor }
+const size_t outRowPadding = 0; // passed to genTransposeKernel, which does not read it
+
+static size_t NumBlocksX(size_t N)	// number of work-group tiles needed to cover N elements along X
+{
+	return DivRoundingUp( N, reShapeFactor * lwSize.x );	// one tile is lwSize.x * reShapeFactor elements wide; a partial tile rounds up
+}
+
+// This is global, but should be considered part of FFTPlan; GenerateKernelPvt<Transpose_GCN> stores the per-precision unroll here
+size_t loopCount = 0; // NOTE(review): one value shared by every plan in the process - looks unsafe when plans differ in precision; confirm
+
+// Generates the OpenCL source for this plan's GCN transpose and stores it, with its entry-point names, in fftRepo.
+// OpenCL does not take unicode strings as input, so the program text is built as a plain (ASCII) std::string.
+template<>
+clfftStatus FFTPlan::GenerateKernelPvt<Transpose_GCN> ( FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
+{
+	FFTKernelGenKeyParams params;
+	OPENCL_V( this->GetKernelGenKeyPvt<Transpose_GCN>( params ), _T( "GetKernelGenKey() failed!" ) );
+
+	// Choose the unroll factor handed to the generator; doubles get half the unroll of singles
+	const bool singlePrecision = ( params.fft_precision == CLFFT_SINGLE ) || ( params.fft_precision == CLFFT_SINGLE_FAST );
+	const bool doublePrecision = ( params.fft_precision == CLFFT_DOUBLE ) || ( params.fft_precision == CLFFT_DOUBLE_FAST );
+	if( singlePrecision )
+		loopCount = 16;
+	else if( doublePrecision )
+		loopCount = 8;
+	else
+		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+
+	std::string kernelSource;
+	OPENCL_V( genTransposeKernel( params, kernelSource, lwSize, reShapeFactor, loopCount, outRowPadding ),
+		_T( "GenerateTransposeKernel() failed!" ) );
+
+	// The repo calls below take the device and the context behind the caller's queue
+	cl_int queryStatus = CL_SUCCESS;
+	cl_device_id queueDevice = NULL;
+	queryStatus = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &queueDevice, NULL);
+	OPENCL_V( queryStatus, _T( "clGetCommandQueueInfo failed" ) );
+
+	cl_context owningContext = NULL;
+	queryStatus = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &owningContext, NULL);
+	OPENCL_V( queryStatus, _T( "clGetCommandQueueInfo failed" ) );
+
+	OPENCL_V( fftRepo.setProgramCode( Transpose_GCN, params, kernelSource, queueDevice, owningContext ),
+		_T( "fftRepo.setclString() failed!" ) );
+
+	// Entry-point names must match the function names emitted by genTransposeKernel
+	if( !params.fft_3StepTwiddle )
+	{
+		// Without twiddles a single kernel serves both directions
+		OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_GCN, params, "transpose_gcn", "transpose_gcn", queueDevice, owningContext ),
+			_T( "fftRepo.setProgramEntryPoint() failed!" ) );
+	}
+	else
+	{
+		// With twiddles the forward and backward kernels are separate functions
+		OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_GCN, params, "transpose_gcn_tw_fwd", "transpose_gcn_tw_back", queueDevice, owningContext ),
+			_T( "fftRepo.setProgramEntryPoint() failed!" ) );
+	}
+
+	return CLFFT_SUCCESS;
+}
+
+template<>
+clfftStatus FFTPlan::GetWorkSizesPvt<Transpose_GCN>( std::vector< size_t >& globalWS, std::vector< size_t >& localWS ) const
+{
+ FFTKernelGenKeyParams parameters;
+ OPENCL_V( this->GetKernelGenKeyPvt<Transpose_GCN>( parameters ), _T( "GetKernelGenKey() failed!" ) );
+ // Derive the unroll from this plan's precision (same values GenerateKernelPvt uses) instead of reading the file-scope loopCount:
+ // that global only holds what the most recent GenerateKernelPvt call stored (0 before any call), whatever plan made it.
+ const bool isSingle = ( parameters.fft_precision == CLFFT_SINGLE ) || ( parameters.fft_precision == CLFFT_SINGLE_FAST );
+ const bool isDouble = ( parameters.fft_precision == CLFFT_DOUBLE ) || ( parameters.fft_precision == CLFFT_DOUBLE_FAST );
+ if( !isSingle && !isDouble ) return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ const size_t wgUnroll = isSingle ? 16 : 8;
+ // The global work size must be evenly divisible by the local one: count tiles per dimension, rounding up for remainder items
+ size_t numBlocksX = NumBlocksX(parameters.fft_N[ 0 ]);
+ size_t numBlocksY = DivRoundingUp( parameters.fft_N[ 1 ], lwSize.y / reShapeFactor * wgUnroll );
+ size_t numWIX = numBlocksX * lwSize.x;
+ // Batches of matrices are lined up along the Y axis, 1 after the other
+ size_t numWIY = numBlocksY * lwSize.y * this->batchsize;
+ // fft_DataDim has one more dimension than the actual fft data (batch); dims 2 .. fft_DataDim - 2 are lined up along the Y axis
+ for(int i = 2; i < parameters.fft_DataDim - 1; i++)
+ {
+ numWIY *= parameters.fft_N[i];
+ }
+
+ globalWS.clear( );
+ globalWS.push_back( numWIX );
+ globalWS.push_back( numWIY );
+ localWS.clear( );
+ localWS.push_back( lwSize.x );
+ localWS.push_back( lwSize.y );
+ return CLFFT_SUCCESS;
+}
diff --git a/src/library/generator.transpose.h b/src/library/generator.transpose.gcn.h
similarity index 100%
copy from src/library/generator.transpose.h
copy to src/library/generator.transpose.gcn.h
diff --git a/src/library/generator.transpose.cpp b/src/library/generator.transpose.vliw.cpp
similarity index 94%
rename from src/library/generator.transpose.cpp
rename to src/library/generator.transpose.vliw.cpp
index 0615b99..f715c7d 100644
--- a/src/library/generator.transpose.cpp
+++ b/src/library/generator.transpose.vliw.cpp
@@ -22,7 +22,7 @@
#include "stdafx.h"
#include <math.h>
-#include "generator.transpose.h"
+#include "generator.transpose.vliw.h"
#define QUOTEMARK(x) #x
@@ -737,7 +737,7 @@ static clfftStatus GenerateTransposeKernel (FFTKernelGenKeyParams & params,
}
template<>
-clfftStatus FFTPlan::GetKernelGenKeyPvt<Transpose> (FFTKernelGenKeyParams & params) const
+clfftStatus FFTPlan::GetKernelGenKeyPvt<Transpose_VLIW> (FFTKernelGenKeyParams & params) const
{
// Query the devices in this context for their local memory sizes
@@ -797,12 +797,12 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Transpose> (FFTKernelGenKeyParams & para
}
template<>
-clfftStatus FFTPlan::GetWorkSizesPvt<Transpose> (std::vector<size_t> & globalWS, std::vector<size_t> & localWS) const
+clfftStatus FFTPlan::GetWorkSizesPvt<Transpose_VLIW> (std::vector<size_t> & globalWS, std::vector<size_t> & localWS) const
{
// How many numbers per workitem in the generated kernel?
FFTKernelGenKeyParams fftParams;
// Translate the user plan into the structure that we use to map plans to clPrograms
- OPENCL_V( this->GetKernelGenKeyPvt<Transpose>( fftParams ), _T("GetKernelGenKey() failed!") );
+ OPENCL_V( this->GetKernelGenKeyPvt<Transpose_VLIW>( fftParams ), _T("GetKernelGenKey() failed!") );
unsigned long long count, count0, count1;
count0 = DivRoundingUp<unsigned long long> (this->length[0], fftParams.fft_R);
@@ -822,22 +822,25 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Transpose> (std::vector<size_t> & globalWS,
// OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
// Feed this generator the FFTPlan, and it returns the generated program as a string
template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Transpose> ( FFTRepo& fftRepo, const cl_command_queue commQueueFFT ) const
+clfftStatus FFTPlan::GenerateKernelPvt<Transpose_VLIW> ( FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
{
FFTKernelGenKeyParams params;
- OPENCL_V( this->GetKernelGenKeyPvt<Transpose> (params), _T("GetKernelGenKey() failed!") );
+ OPENCL_V( this->GetKernelGenKeyPvt<Transpose_VLIW> (params), _T("GetKernelGenKey() failed!") );
std::string programCode;
OPENCL_V( GenerateTransposeKernel( params, programCode ), _T( "GenerateTransposeKernel() failed!" ) );
- cl_int status = CL_SUCCESS;
- cl_context QueueContext = NULL;
- status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
+ cl_int status = CL_SUCCESS;
+ cl_device_id Device = NULL;
+ status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
+ OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
- OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
+ cl_context QueueContext = NULL;
+ status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
+ OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
- OPENCL_V( fftRepo.setProgramCode( Transpose, params, programCode, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
- OPENCL_V( fftRepo.setProgramEntryPoints( Transpose, params, "fft_trans", "fft_trans",QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+ OPENCL_V( fftRepo.setProgramCode( Transpose_VLIW, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+ OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_VLIW, params, "fft_trans", "fft_trans", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
return CLFFT_SUCCESS;
}
diff --git a/src/library/generator.transpose.h b/src/library/generator.transpose.vliw.h
similarity index 100%
rename from src/library/generator.transpose.h
rename to src/library/generator.transpose.vliw.h
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 5a750d1..60389ad 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -44,6 +44,26 @@ bool operator<( const FFTKernelGenKeyParams& lhs, const FFTKernelGenKeyParams& r
return false;
}
+// Returns CLFFT_SUCCESS if the extension named by ext (e.g. an fp64 extension) is listed by the device, CLFFT_DEVICE_NO_DOUBLE if it is not found.
+clfftStatus checkDevExt( std::string ext, const cl_device_id &device )
+{
+ size_t deviceExtSize = 0;
+ OPENCL_V( ::clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
+ "Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
+
+ // An empty extension string cannot contain ext, and indexing element 0 of an empty vector below would be undefined
+ if( deviceExtSize == 0 ) return CLFFT_DEVICE_NO_DOUBLE;
+ std::vector< char > szDeviceExt( deviceExtSize );
+ OPENCL_V( ::clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
+ "Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
+
+ std::string strDeviceExt = &szDeviceExt[ 0 ]; // copies the extension list up to its terminating NUL
+ if( strDeviceExt.find( ext.c_str( ), 0 ) == std::string::npos ) // plain substring search over the extension list
+ return CLFFT_DEVICE_NO_DOUBLE;
+
+ return CLFFT_SUCCESS;
+}
+
clfftStatus clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context context, const clfftDim dim,
const size_t* clLengths )
{
@@ -119,6 +139,7 @@ clfftStatus clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context contex
fftPlan->forwardScale = 1.0;
fftPlan->backwardScale = 1.0 / static_cast< double >( lenX * lenY * lenZ );
fftPlan->batchsize = 1;
+ fftPlan->userPlan = true;
fftPlan->gen = Stockham; //default setting
@@ -126,6 +147,7 @@ clfftStatus clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context contex
clRetainContext( fftPlan->context );
+#if 0
/////////////////////////////////////////////////////////////////
// Detect OpenCL devices
/////////////////////////////////////////////////////////////////
@@ -140,6 +162,7 @@ clfftStatus clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context contex
/* Now, get the device list data */
OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, &fftPlan->devices[ 0 ], NULL ),
"Getting device array ( ::clGetContextInfo() )" );
+#endif
// Need to devise a way to generate better names
tstringstream tstream;
@@ -193,39 +216,7 @@ clfftStatus clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context contex
return CLFFT_SUCCESS;
}
-// Read the kernels that this plan uses from file, and store into the plan
-clfftStatus WriteKernel( const clfftPlanHandle plHandle, const clfftGenerators gen, const FFTKernelGenKeyParams& fftParams, const cl_context& context )
-{
- FFTRepo& fftRepo = FFTRepo::getInstance( );
-
- // Logic to define a sensible filename
- const std::string kernelPrefix( "clfft.kernel." );
- std::string generatorName;
- std::stringstream kernelPath;
-
- switch( gen )
- {
- case Stockham: generatorName = "Stockham"; break;
- case Transpose: generatorName = "Transpose"; break;
- }
-
- kernelPath << kernelPrefix << generatorName << plHandle << ".cl";
- // Logic to write string contents out to file
- tofstreamRAII< std::ofstream, std::string > kernelFile( kernelPath.str( ) );
- if( !kernelFile.get( ) )
- {
- std::cerr << "Failed to open kernel file for writing: " << kernelPath.str( ) << std::endl;
- return CLFFT_FILE_CREATE_FAILURE;
- }
-
- std::string kernel;
- OPENCL_V( fftRepo.getProgramCode( gen, fftParams, kernel, context ), _T( "fftRepo.getProgramCode failed." ) );
-
- kernelFile.get( ) << kernel << std::endl;
-
- return CLFFT_SUCCESS;
-}
// **************** TODO TODO TODO ***********************
// Making CompileKernels function take in command queue parameter so we can build for 1 particular device only;
@@ -243,23 +234,18 @@ clfftStatus CompileKernels( const cl_command_queue commQueueFFT, const clfftPlan
// create a cl program executable for the device associated with command queue
// Get the device
- cl_device_id q_device;
- clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &q_device, NULL);
+ cl_device_id &q_device = fftPlan->bakeDevice;
+ //clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &q_device, NULL);
FFTKernelGenKeyParams fftParams;
OPENCL_V( fftPlan->GetKernelGenKey( fftParams ), _T("GetKernelGenKey() failed!") );
cl_program program;
- if( fftRepo.getclProgram( gen, fftParams, program, fftPlan->context ) == CLFFT_INVALID_PROGRAM )
+ if( fftRepo.getclProgram( gen, fftParams, program, q_device, fftPlan->context ) == CLFFT_INVALID_PROGRAM )
{
- // If the user wishes us to write the kernels out to disk, we do so
- if( fftRepo.setupData.debugFlags & CLFFT_DUMP_PROGRAMS )
- {
- OPENCL_V( WriteKernel( plHandle, gen, fftParams, fftPlan->context ), _T( "WriteKernel failed." ) );
- }
std::string programCode;
- OPENCL_V( fftRepo.getProgramCode( gen, fftParams, programCode, fftPlan->context ), _T( "fftRepo.getProgramCode failed." ) );
+ OPENCL_V( fftRepo.getProgramCode( gen, fftParams, programCode, q_device, fftPlan->context ), _T( "fftRepo.getProgramCode failed." ) );
const char* source = programCode.c_str();
program = clCreateProgramWithSource( fftPlan->context, 1, &source, NULL, &status );
@@ -301,23 +287,24 @@ clfftStatus CompileKernels( const cl_command_queue commQueueFFT, const clfftPlan
OPENCL_V( status, _T( "clBuildProgram failed" ) );
}
- fftRepo.setclProgram( gen, fftParams, program );
+ fftRepo.setclProgram( gen, fftParams, program, q_device, fftPlan->context );
// For real transforms we comppile either forward or backward kernel
bool r2c_transform = (fftParams.fft_inputLayout == CLFFT_REAL);
bool c2r_transform = (fftParams.fft_outputLayout == CLFFT_REAL);
- bool real_transform = (gen == Copy) ? true : (r2c_transform || c2r_transform);
bool h2c = (gen == Copy) && ((fftParams.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) || (fftParams.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED));
bool c2h = (gen == Copy) && ((fftParams.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) || (fftParams.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED));
+ bool generalCopy = !(h2c || c2h) && (gen == Copy);
+ bool complexTransform = ( !(r2c_transform || c2r_transform) && (gen != Copy) );
// get a kernel object handle for a kernel with the given name
cl_kernel kernel;
- if( (!real_transform) || r2c_transform || c2h )
+ if( complexTransform || r2c_transform || c2h || generalCopy)
{
if( fftRepo.getclKernel( program, CLFFT_FORWARD, kernel ) == CLFFT_INVALID_KERNEL )
{
std::string entryPoint;
- OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_FORWARD, entryPoint, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
+ OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_FORWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
OPENCL_V( status, _T( "clCreateKernel failed" ) );
@@ -326,12 +313,12 @@ clfftStatus CompileKernels( const cl_command_queue commQueueFFT, const clfftPlan
}
}
- if( (!real_transform) || c2r_transform || h2c )
+ if( complexTransform || c2r_transform || h2c || generalCopy)
{
if( fftRepo.getclKernel( program, CLFFT_BACKWARD, kernel ) == CLFFT_INVALID_KERNEL )
{
std::string entryPoint;
- OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_BACKWARD, entryPoint, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
+ OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_BACKWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
OPENCL_V( status, _T( "clCreateKernel failed" ) );
@@ -341,130 +328,22 @@ clfftStatus CompileKernels( const cl_command_queue commQueueFFT, const clfftPlan
}
}
-//TODO caching kernel binaries for later reload
-#if 0
- // figure out number of devices and the sizes of the binary for each device.
- OPENCL_V( clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, sizeof(fftPlan->number_of_devices), &(fftPlan->number_of_devices), NULL ), _T("CompileKernels(): error getting number of devices") );
-
-
- // get the sizes of the different binaries
- fftPlan->ResetBinarySizes();
- OPENCL_V( clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * fftPlan->number_of_devices, fftPlan->binary_sizes.get(), NULL ), _T("CompileKernels(): error getting binary sizes") );
-
- // we need a list of naked pointers to all of the binaries for OpenCL
- std::unique_ptr<char*[]> naked_binary_pointers( new char*[fftPlan->number_of_devices] );
-
- // make space for all of the generated binaries
- for( int i = 0; i < fftPlan->number_of_devices; i++ )
- {
- // this is our permanent storage place for the binaries
- fftPlan->binaries.push_back( std::unique_ptr<char[]>(new char[fftPlan->binary_sizes[i]] ) );
- // and we need this second copy of it for OpenCL
- naked_binary_pointers[i] = fftPlan->binaries[i].get();
- }
-
- // copy all of the generated binaries over
- OPENCL_V( clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(char*) * fftPlan->number_of_devices, naked_binary_pointers.get(), NULL ), _T("CompileKernels(): error getting program binaries") );
-#endif
return CLFFT_SUCCESS;
}
-//TODO caching kernel binaries for later reload
-#if 0
-// Compile the kernels that this plan uses, and store into the plan
-clfftStatus LoadCompiledKernels( const clfftPlanHandle plHandle, const clfftGenerators gen, FFTPlan* plan )
-{
- // if there are no devices, there are not any kernels to load
- if( plan->number_of_devices == 0 )
- return CLFFT_SUCCESS;
- FFTRepo& repo = FFTRepo::getInstance( );
- FFTKernelGenKeyParams fftParams;
- OPENCL_V( plan->GetKernelGenKey( fftParams ), _T("GetKernelGenKey() failed!") );
- cl_program program;
- if( repo.getclProgram( gen, fftParams, program ) == CLFFT_INVALID_PROGRAM )
+inline size_t PrecisionWidth(clfftPrecision pr)
+{
+ switch(pr)
{
- //if( repo.setupData.debugFlags & CLFFT_DUMP_PROGRAMS )
- //{
- // OPENCL_V( WriteKernel( plHandle, gen, fftParams ), _T( "WriteKernel failed." ) );
- // //TODO there's no source to spit out, but we should consider giving the user a helpful message
- // // such as "there's no source to output -- kernel binaries loaded from file"
- //}
-
- std::unique_ptr<cl_int[]> binary_status( new cl_int[plan->number_of_devices] );
- cl_int error_code;
-
- std::unique_ptr<const unsigned char*[]> binaries( new const unsigned char*[plan->number_of_devices] );
- for( int i = 0; i < plan->number_of_devices; i++ )
- {
- binaries[i] = reinterpret_cast<const unsigned char*>(plan->binaries[0].get());
- }
-
- if( plan->number_of_devices > 0 )
- {
- program = clCreateProgramWithBinary( plan->context,
- (cl_uint)plan->number_of_devices, &plan->devices[0], &plan->binary_sizes[0], &binaries[0],
- binary_status.get(), &error_code);
-
- cl_int status = 0;
- // create a cl program executable for all the devices specified
- status = clBuildProgram( program, 1, &plan->devices[0], NULL, NULL, NULL);
-
- if( status != CL_SUCCESS )
- {
- if( status == CL_BUILD_PROGRAM_FAILURE )
- {
- size_t buildLogSize = 0;
- OPENCL_V( clGetProgramBuildInfo( program, plan->devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
- _T( "clGetProgramBuildInfo failed" ) );
-
- vector< char > buildLog( buildLogSize );
- ::memset( &buildLog[ 0 ], 0x0, buildLogSize );
-
- OPENCL_V( clGetProgramBuildInfo( program, plan->devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
- _T( "clGetProgramBuildInfo failed" ) );
-
- std::cerr << " \n\t\t\tBUILD LOG\n";
- std::cerr << " ************************************************\n";
- std::cerr << &buildLog[ 0 ] << std::endl;
- std::cerr << " ************************************************\n";
- }
-
- OPENCL_V( status, _T( "clBuildProgram failed" ) );
- }
-
- repo.setclProgram( gen, fftParams, program );
-
- // get a kernel object handle for a kernel with the given name
- cl_kernel kernel;
- if( repo.getclKernel( program, CLFFT_FORWARD, kernel ) == CLFFT_INVALID_KERNEL )
- {
- kernel = clCreateKernel( program, "fft_fwd", &status );
- OPENCL_V( status, _T( "clCreateKernel failed" ) );
-
- repo.setclKernel( program, CLFFT_FORWARD, kernel );
- }
-
- if( repo.getclKernel( program, CLFFT_BACKWARD, kernel ) == CLFFT_INVALID_KERNEL )
- {
- kernel = clCreateKernel( program, "fft_back", &status );
- OPENCL_V( status, _T( "clCreateKernel failed" ) );
-
- repo.setclKernel( program, CLFFT_BACKWARD, kernel );
- }
-
- FFTKernelGenKeyParams params;
- plan->GetKernelGenKey( params );
- OPENCL_V( repo.setProgramEntryPoints( Stockham, params, "fft_fwd", "fft_back" ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
- }
+ case CLFFT_SINGLE: return 1;
+ case CLFFT_DOUBLE: return 2;
+ default: assert(false); return 1;
}
-
- return CLFFT_SUCCESS;
}
-#endif
clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_command_queue* commQueueFFT,
void (CL_CALLBACK *pfn_notify)( clfftPlanHandle plHandle, void *user_data ), void* user_data )
@@ -506,11 +385,13 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
case CLFFT_1D: pLength *= fftPlan->length[DimX];
}
+ const bool rc = (fftPlan->inputLayout == CLFFT_REAL) || (fftPlan->outputLayout == CLFFT_REAL);
+
// upper bounds on transfrom lengths - address this in the next release
size_t SP_MAX_LEN = 1 << 24;
size_t DP_MAX_LEN = 1 << 22;
- if((fftPlan->precision == CLFFT_SINGLE) && (pLength > SP_MAX_LEN)) return CLFFT_NOTIMPLEMENTED;
- if((fftPlan->precision == CLFFT_DOUBLE) && (pLength > DP_MAX_LEN)) return CLFFT_NOTIMPLEMENTED;
+ if((fftPlan->precision == CLFFT_SINGLE) && (pLength > SP_MAX_LEN) && rc) return CLFFT_NOTIMPLEMENTED;
+ if((fftPlan->precision == CLFFT_DOUBLE) && (pLength > DP_MAX_LEN) && rc) return CLFFT_NOTIMPLEMENTED;
// release buffers, as these will be created only in EnqueueTransform
@@ -519,7 +400,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
if( NULL != fftPlan->intBufferC2R ) { OPENCL_V( clReleaseMemObject( fftPlan->intBufferC2R ), _T( "Failed to release internal temporary buffer" ) ); fftPlan->intBufferC2R = NULL; }
- if(fftPlan->dim == fftPlan->length.size() && fftPlan->gen != Transpose && fftPlan->gen != Copy) // confirm it is top-level plan (user plan)
+ if( fftPlan->dim == fftPlan->length.size( ) && ( fftPlan->gen != Transpose_VLIW ) && ( fftPlan->gen != Transpose_GCN ) && ( fftPlan->gen != Copy ) ) // confirm it is top-level plan (user plan)
{
if(fftPlan->placeness == CLFFT_INPLACE)
{
@@ -552,42 +433,26 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
return CLFFT_SUCCESS;
}
-//TODO caching kernel binaries for later reload
-#if 0
- if( fftPlan->readFromFile == true )
- {
- OPENCL_V( LoadCompiledKernels( plHandle, fftPlan->gen, fftPlan ), _T( "LoadCompiledKernels() failed" ) );
-
- // all of the plan compressing and subplan making should be done already,
- // but we still need to make constant buffers
- OPENCL_V( fftPlan->AllocateBuffers(), _T("AllocateBuffers() failed"));
- fftPlan->ConstructAndEnqueueConstantBuffers( commQueueFFT );
-
- if( fftPlan->planX )
- {
- OPENCL_V( clfftBakePlan( fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), "clfftBakePlan failed (planX)" );
- }
-
- if( fftPlan->planY )
- {
- OPENCL_V( clfftBakePlan( fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), "clfftBakePlan failed (planY)" );
- }
- if( fftPlan->planZ )
+ if( fftPlan->userPlan )
+ {
+ // If the user specifies double precision, check that the device supports double precision first
+ if( fftPlan->precision == CLFFT_DOUBLE || fftPlan->precision == CLFFT_DOUBLE_FAST )
{
- OPENCL_V( clfftBakePlan( fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), "clfftBakePlan failed (planZ)" );
+ clfftStatus retAmdFp64 = checkDevExt( "cl_amd_fp64", fftPlan->bakeDevice );
+ if( retAmdFp64 != CLFFT_SUCCESS )
+ {
+ // If AMD's extension is not supported, check for Khronos extension
+ clfftStatus retKhrFp64 = checkDevExt( "cl_khr_fp64", fftPlan->bakeDevice );
+ if( retKhrFp64 != CLFFT_SUCCESS )
+ return retKhrFp64;
+ }
}
-
- fftPlan->baked = true;
- return CLFFT_SUCCESS;
}
-#endif
-
- bool rc = (fftPlan->inputLayout == CLFFT_REAL) || (fftPlan->outputLayout == CLFFT_REAL);
// Compress the plan by discarding length '1' dimensions
// decision to pick generator
- if(fftPlan->dim == fftPlan->length.size() && fftPlan->gen != Transpose && !rc) // confirm it is top-level plan (user plan)
+ if( fftPlan->userPlan && !rc ) // confirm it is top-level plan (user plan)
{
size_t dmnsn = fftPlan->dim;
bool pow2flag = true;
@@ -651,39 +516,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
//
size_t Large1DThreshold = 0;
- //First time check or see if LDS paramters are set-up.
- if (fftPlan->uLdsFraction == 0)
- {
- switch( fftPlan->dim )
- {
- case CLFFT_1D:
- {
- if (fftPlan->length[0] < 32768 || fftPlan->length[0] > 1048576)
- fftPlan->uLdsFraction = 8;
- else
- fftPlan->uLdsFraction = 4;
- if (fftPlan->length[0] < 1024 )
- fftPlan->bLdsComplex = true;
- else
- fftPlan->bLdsComplex = false;
- }
- break;
- case CLFFT_2D:
- {
- fftPlan->uLdsFraction = 4;
- fftPlan->bLdsComplex = false;
- }
- break;
- case CLFFT_3D:
- {
- //for case 128*128*128 and 1024*128*128, fraction = 8 is faster.
- fftPlan->uLdsFraction = 4;
- fftPlan->bLdsComplex = false;
- }
- break;
- }
- }
OPENCL_V(fftPlan->GetMax1DLength (&Large1DThreshold), _T("GetMax1DLength failed"));
BUG_CHECK (Large1DThreshold > 1);
@@ -698,17 +531,59 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
size_t in_1d, in_x, count;
BUG_CHECK (IsPo2 (Large1DThreshold))
- //ARG_CHECK (IsPo2 (fftPlan->length[0]))
- // see whether large1D_Xfactor are fixed or not
- if (fftPlan->large1D_Xfactor == 0 )
+
+ if( IsPo2(fftPlan->length[0]) )
+ {
+ // Enable block compute under these conditions
+ if( (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1) && !rc
+ && (fftPlan->length[0] <= 1048576/PrecisionWidth(fftPlan->precision)) )
+ {
+ fftPlan->blockCompute = true;
+
+ if(1 == PrecisionWidth(fftPlan->precision))
+ {
+ switch(fftPlan->length[0])
+ {
+ case 8192: clLengths[1] = 64; break;
+ case 16384: clLengths[1] = 64; break;
+ case 32768: clLengths[1] = 128; break;
+ case 65536: clLengths[1] = 256; break;
+ case 131072: clLengths[1] = 64; break;
+ case 262144: clLengths[1] = 64; break;
+ case 524288: clLengths[1] = 256; break;
+ case 1048576: clLengths[1] = 256; break;
+ default: assert(false);
+ }
+ }
+ else
+ {
+ switch(fftPlan->length[0])
+ {
+ case 4096: clLengths[1] = 64; break;
+ case 8192: clLengths[1] = 64; break;
+ case 16384: clLengths[1] = 64; break;
+ case 32768: clLengths[1] = 128; break;
+ case 65536: clLengths[1] = 64; break;
+ case 131072: clLengths[1] = 64; break;
+ case 262144: clLengths[1] = 128; break;
+ case 524288: clLengths[1] = 256; break;
+ default: assert(false);
+ }
+ }
+ }
+ else
{
- if( IsPo2(fftPlan->length[0]) )
+ if(fftPlan->length[0] > (Large1DThreshold * Large1DThreshold) )
+ {
+ clLengths[1] = fftPlan->length[0] / Large1DThreshold;
+ }
+ else
{
in_1d = BitScanF (Large1DThreshold); // this is log2(LARGE1D_THRESHOLD)
in_x = BitScanF (fftPlan->length[0]); // this is log2(length)
BUG_CHECK (in_1d > 0)
- count = in_x/in_1d;
+ count = in_x/in_1d;
if (count*in_1d < in_x)
{
count++;
@@ -716,74 +591,570 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
if (in_1d * count < in_x) in_1d++;
}
clLengths[1] = (size_t)1 << in_1d;
+ }
+ }
+ }
+ else
+ {
+ // This array must be kept sorted in the ascending order
+ size_t supported[] = { 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 27, 30, 32, 36, 40,
+ 45, 48, 50, 54, 60, 64, 72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135,
+ 144, 150, 160, 162, 180, 192, 200, 216, 225, 240, 243, 250, 256, 270, 288,
+ 300, 320, 324, 360, 375, 384, 400, 405, 432, 450, 480, 486, 500, 512, 540,
+ 576, 600, 625, 640, 648, 675, 720, 729, 750, 768, 800, 810, 864, 900, 960,
+ 972, 1000, 1024, 1080, 1125, 1152, 1200, 1215, 1250, 1280, 1296, 1350, 1440,
+ 1458, 1500, 1536, 1600, 1620, 1728, 1800, 1875, 1920, 1944, 2000, 2025, 2048,
+ 2160, 2187, 2250, 2304, 2400, 2430, 2500, 2560, 2592, 2700, 2880, 2916, 3000,
+ 3072, 3125, 3200, 3240, 3375, 3456, 3600, 3645, 3750, 3840, 3888, 4000, 4050, 4096 };
+
+ size_t lenSupported = sizeof(supported)/sizeof(supported[0]);
+ size_t maxFactoredLength = (supported[lenSupported-1] < Large1DThreshold) ? supported[lenSupported-1] : Large1DThreshold;
+
+ size_t halfPowerLength = (size_t)1 << ( (StockhamGenerator::CeilPo2(fftPlan->length[0]) + 1) / 2 );
+ size_t factoredLengthStart = (halfPowerLength < maxFactoredLength) ? halfPowerLength : maxFactoredLength;
+
+ size_t indexStart = 0;
+ while(supported[indexStart] < factoredLengthStart) indexStart++;
+
+ for(size_t i = indexStart; i >= 1; i--)
+ {
+ if( fftPlan->length[0] % supported[i] == 0 )
+ {
+ clLengths[1] = supported[i];
+ break;
+ }
+ }
+ }
+
+ clLengths[0] = fftPlan->length[0]/clLengths[1];
+
+
+ // Start of block where transposes are generated; 1D FFT
+ while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
+ {
+ if (!IsPo2(fftPlan->length[0])) break;
+
+ //TBD, only one dimension?
+ if (fftPlan->length.size() > 1) break;
+ if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break;
+
+ if (fftPlan->length[0] <= 1048576/PrecisionWidth(fftPlan->precision)) break;
+
+ ARG_CHECK(clLengths[0] <= Large1DThreshold);
+ ARG_CHECK(clLengths[0]>=32 && clLengths[1]>=32);
+
+ size_t padding = 64;
+ size_t biggerDim = clLengths[0] > clLengths[1] ? clLengths[0] : clLengths[1];
+ size_t smallerDim = biggerDim == clLengths[0] ? clLengths[1] : clLengths[0];
+
+ if (fftPlan->tmpBufSize==0 )
+ {
+ fftPlan->tmpBufSize = (smallerDim + padding) * biggerDim *
+ fftPlan->batchsize * fftPlan->ElementSize();
+ }
+
+ //Transpose
+ //Input --> tmp buffer
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
+ _T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
+
+ FFTPlan* trans1Plan = NULL;
+ lockRAII* trans1Lock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
+
+ trans1Plan->placeness = CLFFT_OUTOFPLACE;
+ trans1Plan->precision = fftPlan->precision;
+ trans1Plan->tmpBufSize = 0;
+ trans1Plan->batchsize = fftPlan->batchsize;
+ trans1Plan->envelope = fftPlan->envelope;
+ trans1Plan->inputLayout = fftPlan->inputLayout;
+ trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ trans1Plan->inStride[0] = fftPlan->inStride[0];
+ trans1Plan->inStride[1] = clLengths[0];
+ trans1Plan->outStride[0] = 1;
+ trans1Plan->outStride[1] = clLengths[1] + padding;
+ trans1Plan->iDist = fftPlan->iDist;
+ trans1Plan->oDist = clLengths[0] * trans1Plan->outStride[1];
+ trans1Plan->gen = Transpose_GCN;
+ trans1Plan->transflag = true;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan large1d trans1 plan failed" ) );
+
+ //Row transform
+ //tmp->output
+ //size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
+ _T( "CreateDefaultPlan Large1d column failed" ) );
+
+ FFTPlan* row1Plan = NULL;
+ lockRAII* row1Lock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, row1Plan, row1Lock ), _T( "fftRepo.getPlan failed" ) );
+
+ row1Plan->placeness = CLFFT_OUTOFPLACE;
+ row1Plan->precision = fftPlan->precision;
+ row1Plan->forwardScale = 1.0f;
+ row1Plan->backwardScale = 1.0f;
+ row1Plan->tmpBufSize = 0;
+ row1Plan->batchsize = fftPlan->batchsize;
+
+ row1Plan->gen = fftPlan->gen;
+ row1Plan->envelope = fftPlan->envelope;
+
+ // twiddling is done in row2
+ row1Plan->large1D = 0;
+
+ row1Plan->length.push_back(clLengths[0]);
+ row1Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ row1Plan->outputLayout = fftPlan->outputLayout;
+ row1Plan->inStride[0] = 1;
+ row1Plan->outStride[0] = fftPlan->outStride[0];
+ row1Plan->inStride.push_back(clLengths[1]+padding);
+ row1Plan->outStride.push_back(clLengths[1]);
+ row1Plan->iDist = clLengths[0] * row1Plan->inStride[1];
+ row1Plan->oDist = fftPlan->oDist;
+
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan large1d first row plan failed" ) );
+
+ //Transpose 2
+ //Output --> tmp buffer
+ clLengths[2] = clLengths[0];
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTY, fftPlan->context, CLFFT_2D, &clLengths[1] ),
+ _T( "CreateDefaultPlan Large1d transpose 2 failed" ) );
+
+ FFTPlan* trans2Plan = NULL;
+ lockRAII* trans2Lock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
+
+ trans2Plan->placeness = CLFFT_OUTOFPLACE;
+ trans2Plan->precision = fftPlan->precision;
+ trans2Plan->tmpBufSize = 0;
+ trans2Plan->batchsize = fftPlan->batchsize;
+ trans2Plan->envelope = fftPlan->envelope;
+ trans2Plan->inputLayout = fftPlan->outputLayout;
+ trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ trans2Plan->inStride[0] = fftPlan->outStride[0];
+ trans2Plan->inStride[1] = clLengths[1];
+ trans2Plan->outStride[0] = 1;
+ trans2Plan->outStride[1] = clLengths[0] + padding;
+ trans2Plan->iDist = fftPlan->oDist;
+ trans2Plan->oDist = clLengths[1] * trans2Plan->outStride[1];
+ trans2Plan->gen = Transpose_GCN;
+ trans2Plan->large1D = fftPlan->length[0];
+ trans2Plan->transflag = true;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan large1d trans2 plan failed" ) );
+
+ //Row transform 2
+ //tmp->tmp
+ //size clLengths[0], batch clLengths[1]
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
+ _T( "CreateDefaultPlan Large1d second row plan failed" ) );
+
+ FFTPlan* row2Plan = NULL;
+ lockRAII* row2Lock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planY, row2Plan, row2Lock ), _T( "fftRepo.getPlan failed" ) );
+
+ row2Plan->placeness = CLFFT_INPLACE;
+ row2Plan->precision = fftPlan->precision;
+ row2Plan->forwardScale = fftPlan->forwardScale;
+ row2Plan->backwardScale = fftPlan->backwardScale;
+ row2Plan->tmpBufSize = 0;
+ row2Plan->batchsize = fftPlan->batchsize;
+
+ row2Plan->gen = fftPlan->gen;
+ row2Plan->envelope = fftPlan->envelope;
+
+
+ row2Plan->length.push_back(clLengths[1]);
+ row2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ row2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ row2Plan->inStride[0] = 1;
+ row2Plan->outStride[0] = 1;
+ row2Plan->inStride.push_back(clLengths[0] + padding);
+ row2Plan->outStride.push_back(clLengths[0] + padding);
+ row2Plan->iDist = clLengths[1] * row2Plan->inStride[1];
+ row2Plan->oDist = clLengths[1] * row2Plan->outStride[1];
+
+
+ OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan large1d second row plan failed" ) );
+
+ //Transpose 3
+ //tmp --> output
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
+ _T( "CreateDefaultPlan Large1d transpose 3 failed" ) );
+
+ FFTPlan* trans3Plan = NULL;
+ lockRAII* trans3Lock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
+
+ trans3Plan->placeness = CLFFT_OUTOFPLACE;
+ trans3Plan->precision = fftPlan->precision;
+ trans3Plan->tmpBufSize = 0;
+ trans3Plan->batchsize = fftPlan->batchsize;
+ trans3Plan->envelope = fftPlan->envelope;
+ trans3Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ trans3Plan->outputLayout = fftPlan->outputLayout;
+ trans3Plan->inStride[0] = 1;
+ trans3Plan->inStride[1] = clLengths[0] + padding;
+ trans3Plan->outStride[0] = fftPlan->outStride[0];
+ trans3Plan->outStride[1] = clLengths[1];
+ trans3Plan->iDist = clLengths[1] * trans3Plan->inStride[1];
+ trans3Plan->oDist = fftPlan->oDist;
+ trans3Plan->gen = Transpose_GCN;
+ trans3Plan->transflag = true;
+ trans3Plan->transOutHorizontal = true;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan large1d trans3 plan failed" ) );
+
+ fftPlan->transflag = true;
+ fftPlan->baked = true;
+ return CLFFT_SUCCESS;
+ }
+
+ size_t length0 = clLengths[0];
+ size_t length1 = clLengths[1];
+
+ if(fftPlan->inputLayout == CLFFT_REAL)
+ {
+ if (fftPlan->tmpBufSizeRC==0 )
+ {
+ fftPlan->tmpBufSizeRC = length0 * length1 *
+ fftPlan->batchsize * fftPlan->ElementSize();
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ fftPlan->tmpBufSizeRC *= fftPlan->length[index];
}
- else
+ }
+
+ // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
+ // transposed output
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
+ _T( "CreateDefaultPlan Large1d column failed" ) );
+
+ FFTPlan* colTPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // current plan is to create intermediate buffer, packed and interleave
+ // This is a column FFT, the first elements distance between each FFT is the distance of the first two
+ // elements in the original buffer. Like a transpose of the matrix
+ // we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
+
+ //this part is common to both passes
+ colTPlan->placeness = CLFFT_OUTOFPLACE;
+ colTPlan->precision = fftPlan->precision;
+ colTPlan->forwardScale = 1.0f;
+ colTPlan->backwardScale = 1.0f;
+ colTPlan->tmpBufSize = 0;
+ colTPlan->batchsize = fftPlan->batchsize;
+
+ colTPlan->gen = fftPlan->gen;
+ colTPlan->envelope = fftPlan->envelope;
+
+ //Pass large1D flag to confirm we need multiply twiddle factor
+ colTPlan->large1D = fftPlan->length[0];
+ colTPlan->RCsimple = true;
+
+ colTPlan->length.push_back(clLengths[0]);
+
+ // first Pass
+ colTPlan->inputLayout = fftPlan->inputLayout;
+ colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colTPlan->inStride[0] = fftPlan->inStride[0] * clLengths[0];
+ colTPlan->outStride[0] = 1;
+ colTPlan->iDist = fftPlan->iDist;
+ colTPlan->oDist = length0 * length1;//fftPlan->length[0];
+ colTPlan->inStride.push_back(fftPlan->inStride[0]);
+ colTPlan->outStride.push_back(length1);//clLengths[1]);
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ colTPlan->length.push_back(fftPlan->length[index]);
+ colTPlan->inStride.push_back(fftPlan->inStride[index]);
+ // tmp buffer is tightly packed
+ colTPlan->outStride.push_back(colTPlan->oDist);
+ colTPlan->oDist *= fftPlan->length[index];
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
+
+ //another column FFT, size clLengths[0], batch clLengths[1], output without transpose
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
+ _T( "CreateDefaultPlan large1D row failed" ) );
+
+ FFTPlan* col2Plan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // This is second column fft, intermediate buffer is packed and interleaved
+ // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+
+ // common part for both passes
+ col2Plan->placeness = CLFFT_INPLACE;
+ col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+
+ col2Plan->precision = fftPlan->precision;
+ col2Plan->forwardScale = fftPlan->forwardScale;
+ col2Plan->backwardScale = fftPlan->backwardScale;
+ col2Plan->tmpBufSize = 0;
+ col2Plan->batchsize = fftPlan->batchsize;
+
+ col2Plan->gen = fftPlan->gen;
+ col2Plan->envelope = fftPlan->envelope;
+
+ col2Plan->length.push_back(length1);
+
+ col2Plan->inStride[0] = length1;
+ col2Plan->inStride.push_back(1);
+ col2Plan->iDist = length0 * length1;
+
+ col2Plan->outStride[0] = length1;
+ col2Plan->outStride.push_back(1);
+ col2Plan->oDist = length0 * length1;
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ col2Plan->length.push_back(fftPlan->length[index]);
+ col2Plan->inStride.push_back(col2Plan->iDist);
+ col2Plan->outStride.push_back(col2Plan->oDist);
+ col2Plan->iDist *= fftPlan->length[index];
+ col2Plan->oDist *= fftPlan->length[index];
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
+
+
+ // copy plan to get back to hermitian
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0] ),
+ _T( "CreateDefaultPlan RC copy failed" ) );
+
+ FFTPlan* copyPlan = NULL;
+ lockRAII* copyLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // This is the copy pass (gen = Copy): it reads the packed, interleaved intermediate buffer
+ // and writes it out using the parent plan's output layout, output stride and output distance
+
+ // common part for both passes
+ copyPlan->placeness = CLFFT_OUTOFPLACE;
+ copyPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ copyPlan->outputLayout = fftPlan->outputLayout;
+
+ copyPlan->precision = fftPlan->precision;
+ copyPlan->forwardScale = 1.0f;
+ copyPlan->backwardScale = 1.0f;
+ copyPlan->tmpBufSize = 0;
+ copyPlan->batchsize = fftPlan->batchsize;
+
+ copyPlan->gen = Copy;
+ copyPlan->envelope = fftPlan->envelope;
+
+
+ copyPlan->inStride[0] = 1;
+ copyPlan->iDist = fftPlan->length[0];
+
+ copyPlan->outStride[0] = fftPlan->outStride[0];
+ copyPlan->oDist = fftPlan->oDist;
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ copyPlan->length.push_back(fftPlan->length[index]);
+ copyPlan->inStride.push_back(copyPlan->inStride[index-1] * fftPlan->length[index-1]);
+ copyPlan->iDist *= fftPlan->length[index];
+ copyPlan->outStride.push_back(fftPlan->outStride[index]);
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
+
+ }
+ else if(fftPlan->outputLayout == CLFFT_REAL)
+ {
+ if (fftPlan->tmpBufSizeRC==0 )
+ {
+ fftPlan->tmpBufSizeRC = length0 * length1 *
+ fftPlan->batchsize * fftPlan->ElementSize();
+ for (size_t index=1; index < fftPlan->length.size(); index++)
{
- // This array must be kept sorted in the ascending order
- size_t supported[] = { 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 27, 30, 32, 36, 40,
- 45, 48, 50, 54, 60, 64, 72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135,
- 144, 150, 160, 162, 180, 192, 200, 216, 225, 240, 243, 250, 256, 270, 288,
- 300, 320, 324, 360, 375, 384, 400, 405, 432, 450, 480, 486, 500, 512, 540,
- 576, 600, 625, 640, 648, 675, 720, 729, 750, 768, 800, 810, 864, 900, 960,
- 972, 1000, 1024, 1080, 1125, 1152, 1200, 1215, 1250, 1280, 1296, 1350, 1440,
- 1458, 1500, 1536, 1600, 1620, 1728, 1800, 1875, 1920, 1944, 2000, 2025, 2048,
- 2160, 2187, 2250, 2304, 2400, 2430, 2500, 2560, 2592, 2700, 2880, 2916, 3000,
- 3072, 3125, 3200, 3240, 3375, 3456, 3600, 3645, 3750, 3840, 3888, 4000, 4050, 4096 };
-
- size_t lenSupported = sizeof(supported)/sizeof(supported[0]);
- size_t maxFactoredLength = (supported[lenSupported-1] < Large1DThreshold) ? supported[lenSupported-1] : Large1DThreshold;
-
- size_t halfPowerLength = (size_t)1 << ( (StockhamGenerator::CeilPo2(fftPlan->length[0]) + 1) / 2 );
- size_t factoredLengthStart = (halfPowerLength < maxFactoredLength) ? halfPowerLength : maxFactoredLength;
-
- size_t indexStart = 0;
- while(supported[indexStart] < factoredLengthStart) indexStart++;
-
- for(size_t i = indexStart; i >= 1; i--)
- {
- if( fftPlan->length[0] % supported[i] == 0 )
- {
- clLengths[1] = supported[i];
- break;
- }
- }
+ fftPlan->tmpBufSizeRC *= fftPlan->length[index];
}
+ }
+
+ // copy plan to from hermitian to full complex
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0] ),
+ _T( "CreateDefaultPlan RC copy failed" ) );
+
+ FFTPlan* copyPlan = NULL;
+ lockRAII* copyLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
- clLengths[0] = fftPlan->length[0]/clLengths[1];
+ // This is second column fft, intermediate buffer is packed and interleaved
+ // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+
+ // common part for both passes
+ copyPlan->placeness = CLFFT_OUTOFPLACE;
+ copyPlan->inputLayout = fftPlan->inputLayout;
+ copyPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+
+ copyPlan->precision = fftPlan->precision;
+ copyPlan->forwardScale = 1.0f;
+ copyPlan->backwardScale = 1.0f;
+ copyPlan->tmpBufSize = 0;
+ copyPlan->batchsize = fftPlan->batchsize;
+
+ copyPlan->gen = Copy;
+ copyPlan->envelope = fftPlan->envelope;
+
+ copyPlan->inStride[0] = fftPlan->inStride[0];
+ copyPlan->iDist = fftPlan->iDist;
+
+ copyPlan->outStride[0] = 1;
+ copyPlan->oDist = fftPlan->length[0];
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ copyPlan->length.push_back(fftPlan->length[index]);
+ copyPlan->outStride.push_back(copyPlan->outStride[index-1] * fftPlan->length[index-1]);
+ copyPlan->oDist *= fftPlan->length[index];
+ copyPlan->inStride.push_back(fftPlan->inStride[index]);
}
- else
+
+ OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
+
+ // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
+ // transposed output
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
+ _T( "CreateDefaultPlan Large1d column failed" ) );
+
+ FFTPlan* colTPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // current plan is to create intermediate buffer, packed and interleave
+ // This is a column FFT, the first elements distance between each FFT is the distance of the first two
+ // elements in the original buffer. Like a transpose of the matrix
+ // we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
+
+ //this part are common for both passes
+ colTPlan->placeness = CLFFT_INPLACE;
+ colTPlan->precision = fftPlan->precision;
+ colTPlan->forwardScale = 1.0f;
+ colTPlan->backwardScale = 1.0f;
+ colTPlan->tmpBufSize = 0;
+ colTPlan->batchsize = fftPlan->batchsize;
+
+ colTPlan->gen = fftPlan->gen;
+ colTPlan->envelope = fftPlan->envelope;
+
+ //Pass large1D flag to confirm we need multiply twiddle factor
+ colTPlan->large1D = fftPlan->length[0];
+
+ colTPlan->length.push_back(clLengths[0]);
+
+ // first Pass
+ colTPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+
+
+ colTPlan->inStride[0] = length0;
+ colTPlan->inStride.push_back(1);
+ colTPlan->iDist = length0 * length1;
+
+ colTPlan->outStride[0] = length0;
+ colTPlan->outStride.push_back(1);
+ colTPlan->oDist = length0 * length1;
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
{
- //large1D_Xfactor will not pass to the second level of call
- clLengths[0] = fftPlan->large1D_Xfactor;
- clLengths[1] = fftPlan->length[0]/clLengths[0];
- ARG_CHECK (fftPlan->length[0] == clLengths[0] * clLengths[1]);
+ colTPlan->length.push_back(fftPlan->length[index]);
+ colTPlan->inStride.push_back(colTPlan->iDist);
+ colTPlan->outStride.push_back(colTPlan->oDist);
+ colTPlan->iDist *= fftPlan->length[index];
+ colTPlan->oDist *= fftPlan->length[index];
}
- while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
+
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
+
+ //another column FFT, size clLengths[0], batch clLengths[1], output without transpose
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
+ _T( "CreateDefaultPlan large1D row failed" ) );
+
+ FFTPlan* col2Plan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+ // This is second column fft, intermediate buffer is packed and interleaved
+ // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+
+ // common part for both passes
+ col2Plan->placeness = CLFFT_OUTOFPLACE;
+ col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ col2Plan->outputLayout = fftPlan->outputLayout;
+
+ col2Plan->precision = fftPlan->precision;
+ col2Plan->forwardScale = fftPlan->forwardScale;
+ col2Plan->backwardScale = fftPlan->backwardScale;
+ col2Plan->tmpBufSize = 0;
+ col2Plan->batchsize = fftPlan->batchsize;
+
+ col2Plan->gen = fftPlan->gen;
+ col2Plan->envelope = fftPlan->envelope;
+
+ col2Plan->RCsimple = true;
+ col2Plan->length.push_back(length1);
+
+ col2Plan->inStride[0] = 1;
+ col2Plan->inStride.push_back(length0);
+ col2Plan->iDist = length0 * length1;
+
+ col2Plan->outStride[0] = length1 * fftPlan->outStride[0];
+ col2Plan->outStride.push_back(fftPlan->outStride[0]);
+ col2Plan->oDist = fftPlan->oDist;
+
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ col2Plan->length.push_back(fftPlan->length[index]);
+ col2Plan->inStride.push_back(col2Plan->iDist);
+ col2Plan->iDist *= fftPlan->length[index];
+ col2Plan->outStride.push_back(fftPlan->outStride[index]);
+ }
+
+ OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
+ }
+ else
+ {
+
+ if( (fftPlan->length[0] > 262144/PrecisionWidth(fftPlan->precision)) && fftPlan->blockCompute )
{
- if (!IsPo2(fftPlan->length[0])) break;
- //if (fftPlan->precision != CLFFT_SINGLE) break;
- //TBD, only one dimension?
- if (fftPlan->length.size() > 1) break;
- if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break;
- //This length is good for using transpose
- if (fftPlan->length[0] < 131072) break;
-
- //first version not support huge1D, TBD
- if (clLengths[0] > Large1DThreshold) break;
- ARG_CHECK(clLengths[0]>=32 && clLengths[1]>=32);
+ assert(fftPlan->length[0] <= 1048576);
+
+ size_t padding = 64;
if (fftPlan->tmpBufSize==0 )
{
- fftPlan->tmpBufSize = clLengths[0] * clLengths[1] *
- fftPlan->batchsize * fftPlan->ElementSize();
+ fftPlan->tmpBufSize = (length1 + padding) * length0 *
+ fftPlan->batchsize * fftPlan->ElementSize();
+ for (size_t index=1; index < fftPlan->length.size(); index++)
+ {
+ fftPlan->tmpBufSize *= fftPlan->length[index];
+ }
}
- //Transpose
- //Input --> tmp buffer
- OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
- _T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
+ // Algorithm in this case is
+ // T(with pad, out_of_place), R (in_place), C(in_place), Unpad(out_of_place)
+
+ size_t len[3] = { clLengths[1], clLengths[0], 1 };
+
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTX, fftPlan->context, CLFFT_2D, len ),
+ _T( "CreateDefaultPlan Large1d trans1 failed" ) );
FFTPlan* trans1Plan = NULL;
lockRAII* trans1Lock = NULL;
@@ -797,260 +1168,99 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans1Plan->inputLayout = fftPlan->inputLayout;
trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
trans1Plan->inStride[0] = fftPlan->inStride[0];
- trans1Plan->inStride[1] = clLengths[0];
+ trans1Plan->inStride[1] = length1;
trans1Plan->outStride[0] = 1;
- trans1Plan->outStride[1] = clLengths[1];
+ trans1Plan->outStride[1] = length0 + padding;
trans1Plan->iDist = fftPlan->iDist;
- trans1Plan->oDist = fftPlan->length[0];
- trans1Plan->gen = Transpose;
+ trans1Plan->oDist = length1 * trans1Plan->outStride[1];
+ trans1Plan->gen = Transpose_GCN;
trans1Plan->transflag = true;
OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
_T( "BakePlan large1d trans1 plan failed" ) );
- //Row transform
- //tmp->output
- //size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
- OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
- _T( "CreateDefaultPlan Large1d column failed" ) );
- FFTPlan* row1Plan = NULL;
- lockRAII* row1Lock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planX, row1Plan, row1Lock ), _T( "fftRepo.getPlan failed" ) );
-
- row1Plan->placeness = CLFFT_OUTOFPLACE;
- row1Plan->precision = fftPlan->precision;
- row1Plan->forwardScale = 1.0f;
- row1Plan->backwardScale = 1.0f;
- row1Plan->tmpBufSize = 0;
- row1Plan->batchsize = fftPlan->batchsize;
- row1Plan->bLdsComplex = fftPlan->bLdsComplex;
- row1Plan->uLdsFraction = fftPlan->uLdsFraction;
- row1Plan->ldsPadding = fftPlan->ldsPadding;
- row1Plan->gen = fftPlan->gen;
- row1Plan->envelope = fftPlan->envelope;
-
- //Pass large1D flag to confirm we need multiply twiddle factor
- row1Plan->large1D = fftPlan->length[0];
-
- row1Plan->length.push_back(clLengths[0]);
- row1Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
- row1Plan->outputLayout = fftPlan->outputLayout;
- row1Plan->inStride[0] = 1;
- row1Plan->outStride[0] = fftPlan->outStride[0];
- row1Plan->iDist = fftPlan->length[0];
- row1Plan->oDist = fftPlan->oDist;
- row1Plan->inStride.push_back(clLengths[1]);
- row1Plan->outStride.push_back(clLengths[1]);
-
- OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
- _T( "BakePlan large1d first row plan failed" ) );
-
- //Transpose 2
- //Output --> tmp buffer
- clLengths[2] = clLengths[0];
- OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTY, fftPlan->context, CLFFT_2D, &clLengths[1] ),
- _T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
-
- FFTPlan* trans2Plan = NULL;
- lockRAII* trans2Lock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
-
- trans2Plan->placeness = CLFFT_OUTOFPLACE;
- trans2Plan->precision = fftPlan->precision;
- trans2Plan->tmpBufSize = 0;
- trans2Plan->batchsize = fftPlan->batchsize;
- trans2Plan->envelope = fftPlan->envelope;
- trans2Plan->inputLayout = fftPlan->outputLayout;
- trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
- trans2Plan->inStride[0] = fftPlan->outStride[0];
- trans2Plan->inStride[1] = clLengths[1];
- trans2Plan->outStride[0] = 1;
- trans2Plan->outStride[1] = clLengths[0];
- trans2Plan->iDist = fftPlan->oDist;
- trans2Plan->oDist = fftPlan->length[0];
- trans2Plan->gen = Transpose;
- trans2Plan->transflag = true;
-
- OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
- _T( "BakePlan large1d trans2 plan failed" ) );
-
- //Row transform 2
- //tmp->tmp
- //size clLengths[0], batch clLengths[1]
- OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
+ // row FFT
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[0] ),
_T( "CreateDefaultPlan Large1d column failed" ) );
- FFTPlan* row2Plan = NULL;
- lockRAII* row2Lock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planY, row2Plan, row2Lock ), _T( "fftRepo.getPlan failed" ) );
-
- row2Plan->placeness = CLFFT_INPLACE;
- row2Plan->precision = fftPlan->precision;
- row2Plan->forwardScale = fftPlan->forwardScale;
- row2Plan->backwardScale = fftPlan->backwardScale;
- row2Plan->tmpBufSize = 0;
- row2Plan->batchsize = fftPlan->batchsize;
- row2Plan->bLdsComplex = fftPlan->bLdsComplex;
- row2Plan->uLdsFraction = fftPlan->uLdsFraction;
- row2Plan->ldsPadding = fftPlan->ldsPadding;
- row2Plan->gen = fftPlan->gen;
- row2Plan->envelope = fftPlan->envelope;
-
- //No twiddle factor is needed.
- row2Plan->large1D = 0;
-
- row2Plan->length.push_back(clLengths[1]);
- row2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
- row2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
- row2Plan->inStride[0] = 1;
- row2Plan->outStride[0] = 1;
- row2Plan->iDist = fftPlan->length[0];
- row2Plan->oDist = fftPlan->length[0];
- row2Plan->inStride.push_back(clLengths[0]);
- row2Plan->outStride.push_back(clLengths[0]);
-
- OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
- _T( "BakePlan large1d first row plan failed" ) );
-
- //Transpose 3
- //tmp --> output
- OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
- _T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
-
- FFTPlan* trans3Plan = NULL;
- lockRAII* trans3Lock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
-
- trans3Plan->placeness = CLFFT_OUTOFPLACE;
- trans3Plan->precision = fftPlan->precision;
- trans3Plan->tmpBufSize = 0;
- trans3Plan->batchsize = fftPlan->batchsize;
- trans3Plan->envelope = fftPlan->envelope;
- trans3Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
- trans3Plan->outputLayout = fftPlan->outputLayout;
- trans3Plan->inStride[0] = 1;
- trans3Plan->inStride[1] = clLengths[0];
- trans3Plan->outStride[0] = fftPlan->outStride[0];
- trans3Plan->outStride[1] = clLengths[1];
- trans3Plan->iDist = fftPlan->length[0];
- trans3Plan->oDist = fftPlan->oDist;
- trans3Plan->gen = Transpose;
- trans3Plan->transflag = true;
-
- OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
- _T( "BakePlan large1d trans3 plan failed" ) );
-
- fftPlan->transflag = true;
- fftPlan->baked = true;
- return CLFFT_SUCCESS;
- }
+ FFTPlan* rowPlan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
- size_t length0 = clLengths[0];
- size_t length1 = clLengths[1];
+ assert(fftPlan->large1D == 0);
- if(fftPlan->inputLayout == CLFFT_REAL)
- {
- if (fftPlan->tmpBufSizeRC==0 )
- {
- fftPlan->tmpBufSizeRC = length0 * length1 *
- fftPlan->batchsize * fftPlan->ElementSize();
- for (size_t index=1; index < fftPlan->length.size(); index++)
- {
- fftPlan->tmpBufSizeRC *= fftPlan->length[index];
- }
- }
+ rowPlan->placeness = CLFFT_INPLACE;
+ rowPlan->precision = fftPlan->precision;
+ rowPlan->forwardScale = 1.0f;
+ rowPlan->backwardScale = 1.0f;
+ rowPlan->tmpBufSize = 0;
+ rowPlan->batchsize = fftPlan->batchsize;
- // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
- // transposed output
- OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
- _T( "CreateDefaultPlan Large1d column failed" ) );
+ rowPlan->gen = fftPlan->gen;
+ rowPlan->envelope = fftPlan->envelope;
- FFTPlan* colTPlan = NULL;
- lockRAII* colLock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+ rowPlan->length.push_back(length1);
- // current plan is to create intermediate buffer, packed and interleave
- // This is a column FFT, the first elements distance between each FFT is the distance of the first two
- // elements in the original buffer. Like a transpose of the matrix
- // we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
- //this part are common for both passes
- colTPlan->placeness = CLFFT_OUTOFPLACE;
- colTPlan->precision = fftPlan->precision;
- colTPlan->forwardScale = 1.0f;
- colTPlan->backwardScale = 1.0f;
- colTPlan->tmpBufSize = 0;
- colTPlan->batchsize = fftPlan->batchsize;
- colTPlan->bLdsComplex = fftPlan->bLdsComplex;
- colTPlan->uLdsFraction = fftPlan->uLdsFraction;
- colTPlan->ldsPadding = fftPlan->ldsPadding;
- colTPlan->gen = fftPlan->gen;
- colTPlan->envelope = fftPlan->envelope;
-
- //Pass large1D flag to confirm we need multiply twiddle factor
- colTPlan->large1D = fftPlan->length[0];
- colTPlan->RCsimple = true;
-
- colTPlan->length.push_back(clLengths[0]);
-
- // first Pass
- colTPlan->inputLayout = fftPlan->inputLayout;
- colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
- colTPlan->inStride[0] = fftPlan->inStride[0] * clLengths[0];
- colTPlan->outStride[0] = 1;
- colTPlan->iDist = fftPlan->iDist;
- colTPlan->oDist = length0 * length1;//fftPlan->length[0];
- colTPlan->inStride.push_back(fftPlan->inStride[0]);
- colTPlan->outStride.push_back(length1);//clLengths[1]);
+ rowPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ rowPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ rowPlan->inStride[0] = 1;
+ rowPlan->outStride[0] = 1;
+ rowPlan->inStride.push_back(length0+padding);
+ rowPlan->outStride.push_back(length0+padding);
+ rowPlan->iDist = (length0+padding)*length1;
+ rowPlan->oDist = (length0+padding)*length1;
for (size_t index=1; index < fftPlan->length.size(); index++)
{
- colTPlan->length.push_back(fftPlan->length[index]);
- colTPlan->inStride.push_back(fftPlan->inStride[index]);
- // tmp buffer is tightly packed
- colTPlan->outStride.push_back(colTPlan->oDist);
- colTPlan->oDist *= fftPlan->length[index];
+ rowPlan->length.push_back(fftPlan->length[index]);
+ rowPlan->inStride.push_back(rowPlan->iDist);
+ rowPlan->iDist *= fftPlan->length[index];
+ rowPlan->outStride.push_back(rowPlan->oDist);
+ rowPlan->oDist *= fftPlan->length[index];
}
- OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
- //another column FFT, size clLengths[0], batch clLengths[1], output without transpose
- OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
- _T( "CreateDefaultPlan large1D row failed" ) );
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first row plan failed" ) );
- FFTPlan* col2Plan = NULL;
- lockRAII* rowLock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+ //column FFT
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[1] ),
+ _T( "CreateDefaultPlan large1D column failed" ) );
- // This is second column fft, intermediate buffer is packed and interleaved
- // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+ FFTPlan* col2Plan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, colLock ), _T( "fftRepo.getPlan failed" ) );
- // common part for both passes
col2Plan->placeness = CLFFT_INPLACE;
col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
-
col2Plan->precision = fftPlan->precision;
col2Plan->forwardScale = fftPlan->forwardScale;
col2Plan->backwardScale = fftPlan->backwardScale;
col2Plan->tmpBufSize = 0;
col2Plan->batchsize = fftPlan->batchsize;
- col2Plan->bLdsComplex = fftPlan->bLdsComplex;
- col2Plan->uLdsFraction = fftPlan->uLdsFraction;
- col2Plan->ldsPadding = fftPlan->ldsPadding;
+
col2Plan->gen = fftPlan->gen;
- col2Plan->envelope = fftPlan->envelope;
+ col2Plan->envelope = fftPlan->envelope;
- col2Plan->length.push_back(length1);
+ col2Plan->large1D = fftPlan->length[0];
+ col2Plan->twiddleFront = true;
+
+ col2Plan->length.push_back(clLengths[0]);
- col2Plan->inStride[0] = length1;
- col2Plan->inStride.push_back(1);
- col2Plan->iDist = length0 * length1;
- col2Plan->outStride[0] = length1;
+
+ col2Plan->blockCompute = true;
+ col2Plan->blockComputeType = BCT_C2C;
+
+ col2Plan->inStride[0] = length0+padding;
+ col2Plan->outStride[0] = length0+padding;
+ col2Plan->iDist = (length0+padding) * length1;
+ col2Plan->oDist = (length0+padding) * length1;
+ col2Plan->inStride.push_back(1);
col2Plan->outStride.push_back(1);
- col2Plan->oDist = length0 * length1;
+
for (size_t index=1; index < fftPlan->length.size(); index++)
{
@@ -1061,21 +1271,19 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
col2Plan->oDist *= fftPlan->length[index];
}
+
OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
- // copy plan to get back to hermitian
- OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0] ),
- _T( "CreateDefaultPlan RC copy failed" ) );
+ // copy plan to get results back to packed output
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planCopy, fftPlan->context, CLFFT_1D, &clLengths[0] ),
+ _T( "CreateDefaultPlan Copy failed" ) );
FFTPlan* copyPlan = NULL;
lockRAII* copyLock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
+ OPENCL_V( fftRepo.getPlan( fftPlan->planCopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
- // This is second column fft, intermediate buffer is packed and interleaved
- // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
- // common part for both passes
copyPlan->placeness = CLFFT_OUTOFPLACE;
copyPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
copyPlan->outputLayout = fftPlan->outputLayout;
@@ -1085,85 +1293,43 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
copyPlan->backwardScale = 1.0f;
copyPlan->tmpBufSize = 0;
copyPlan->batchsize = fftPlan->batchsize;
- copyPlan->bLdsComplex = fftPlan->bLdsComplex;
- copyPlan->uLdsFraction = fftPlan->uLdsFraction;
- copyPlan->ldsPadding = fftPlan->ldsPadding;
+
copyPlan->gen = Copy;
copyPlan->envelope = fftPlan->envelope;
+ copyPlan->length.push_back(length1);
copyPlan->inStride[0] = 1;
- copyPlan->iDist = fftPlan->length[0];
+ copyPlan->inStride.push_back(length0+padding);
+ copyPlan->iDist = length1*(length0+padding);
copyPlan->outStride[0] = fftPlan->outStride[0];
+ copyPlan->outStride.push_back(length0);
copyPlan->oDist = fftPlan->oDist;
for (size_t index=1; index < fftPlan->length.size(); index++)
{
copyPlan->length.push_back(fftPlan->length[index]);
- copyPlan->inStride.push_back(copyPlan->inStride[index-1] * fftPlan->length[index-1]);
+ copyPlan->inStride.push_back(copyPlan->inStride[index] * copyPlan->length[index]);
copyPlan->iDist *= fftPlan->length[index];
copyPlan->outStride.push_back(fftPlan->outStride[index]);
}
- OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
-
+ OPENCL_V(clfftBakePlan(fftPlan->planCopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d copy plan failed" ) );
}
- else if(fftPlan->outputLayout == CLFFT_REAL)
+ else
{
- if (fftPlan->tmpBufSizeRC==0 )
+
+ if (fftPlan->tmpBufSize==0 )
{
- fftPlan->tmpBufSizeRC = length0 * length1 *
+ fftPlan->tmpBufSize = length0 * length1 *
fftPlan->batchsize * fftPlan->ElementSize();
for (size_t index=1; index < fftPlan->length.size(); index++)
{
- fftPlan->tmpBufSizeRC *= fftPlan->length[index];
+ fftPlan->tmpBufSize *= fftPlan->length[index];
}
}
- // copy plan to from hermitian to full complex
- OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0] ),
- _T( "CreateDefaultPlan RC copy failed" ) );
-
- FFTPlan* copyPlan = NULL;
- lockRAII* copyLock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
-
- // This is second column fft, intermediate buffer is packed and interleaved
- // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
-
- // common part for both passes
- copyPlan->placeness = CLFFT_OUTOFPLACE;
- copyPlan->inputLayout = fftPlan->inputLayout;
- copyPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
-
- copyPlan->precision = fftPlan->precision;
- copyPlan->forwardScale = 1.0f;
- copyPlan->backwardScale = 1.0f;
- copyPlan->tmpBufSize = 0;
- copyPlan->batchsize = fftPlan->batchsize;
- copyPlan->bLdsComplex = fftPlan->bLdsComplex;
- copyPlan->uLdsFraction = fftPlan->uLdsFraction;
- copyPlan->ldsPadding = fftPlan->ldsPadding;
- copyPlan->gen = Copy;
- copyPlan->envelope = fftPlan->envelope;
-
- copyPlan->inStride[0] = fftPlan->inStride[0];
- copyPlan->iDist = fftPlan->iDist;
-
- copyPlan->outStride[0] = 1;
- copyPlan->oDist = fftPlan->length[0];
-
- for (size_t index=1; index < fftPlan->length.size(); index++)
- {
- copyPlan->length.push_back(fftPlan->length[index]);
- copyPlan->outStride.push_back(copyPlan->outStride[index-1] * fftPlan->length[index-1]);
- copyPlan->oDist *= fftPlan->length[index];
- copyPlan->inStride.push_back(fftPlan->inStride[index]);
- }
-
- OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
-
// column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
// transposed output
OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
@@ -1173,49 +1339,53 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
lockRAII* colLock = NULL;
OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+ assert(fftPlan->large1D == 0);
+
// current plan is to create intermediate buffer, packed and interleave
// This is a column FFT, the first elements distance between each FFT is the distance of the first two
// elements in the original buffer. Like a transpose of the matrix
// we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
//this part are common for both passes
- colTPlan->placeness = CLFFT_INPLACE;
+ colTPlan->placeness = CLFFT_OUTOFPLACE;
colTPlan->precision = fftPlan->precision;
colTPlan->forwardScale = 1.0f;
colTPlan->backwardScale = 1.0f;
colTPlan->tmpBufSize = 0;
colTPlan->batchsize = fftPlan->batchsize;
- colTPlan->bLdsComplex = fftPlan->bLdsComplex;
- colTPlan->uLdsFraction = fftPlan->uLdsFraction;
- colTPlan->ldsPadding = fftPlan->ldsPadding;
+
colTPlan->gen = fftPlan->gen;
colTPlan->envelope = fftPlan->envelope;
//Pass large1D flag to confirm we need multiply twiddle factor
colTPlan->large1D = fftPlan->length[0];
- colTPlan->length.push_back(clLengths[0]);
-
- // first Pass
- colTPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
- colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colTPlan->length.push_back(length0);
- colTPlan->inStride[0] = length0;
- colTPlan->inStride.push_back(1);
- colTPlan->iDist = length0 * length1;
-
- colTPlan->outStride[0] = length0;
- colTPlan->outStride.push_back(1);
+ colTPlan->inputLayout = fftPlan->inputLayout;
+ colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colTPlan->inStride[0] = fftPlan->inStride[0] * length0;
+ colTPlan->outStride[0] = length0;
+ colTPlan->iDist = fftPlan->iDist;
colTPlan->oDist = length0 * length1;
+ colTPlan->inStride.push_back(fftPlan->inStride[0]);
+ colTPlan->outStride.push_back(1);
+
+ // Enabling block column compute
+ if( (colTPlan->inStride[0] == length0) && IsPo2(fftPlan->length[0]) && (fftPlan->length[0] < 524288) )
+ {
+ colTPlan->blockCompute = true;
+ colTPlan->blockComputeType = BCT_C2C;
+ }
for (size_t index=1; index < fftPlan->length.size(); index++)
{
colTPlan->length.push_back(fftPlan->length[index]);
- colTPlan->inStride.push_back(colTPlan->iDist);
+ colTPlan->inStride.push_back(fftPlan->inStride[index]);
+ // tmp buffer is tightly packed
colTPlan->outStride.push_back(colTPlan->oDist);
- colTPlan->iDist *= fftPlan->length[index];
- colTPlan->oDist *= fftPlan->length[index];
+ colTPlan->oDist *= fftPlan->length[index];
}
@@ -1233,229 +1403,122 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
// we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
// common part for both passes
- col2Plan->placeness = CLFFT_OUTOFPLACE;
- col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
col2Plan->outputLayout = fftPlan->outputLayout;
-
col2Plan->precision = fftPlan->precision;
col2Plan->forwardScale = fftPlan->forwardScale;
col2Plan->backwardScale = fftPlan->backwardScale;
col2Plan->tmpBufSize = 0;
col2Plan->batchsize = fftPlan->batchsize;
- col2Plan->bLdsComplex = fftPlan->bLdsComplex;
- col2Plan->uLdsFraction = fftPlan->uLdsFraction;
- col2Plan->ldsPadding = fftPlan->ldsPadding;
- col2Plan->gen = fftPlan->gen;
- col2Plan->envelope = fftPlan->envelope;
-
- col2Plan->RCsimple = true;
- col2Plan->length.push_back(length1);
-
- col2Plan->inStride[0] = 1;
- col2Plan->inStride.push_back(length0);
- col2Plan->iDist = length0 * length1;
-
- col2Plan->outStride[0] = length1 * fftPlan->outStride[0];
- col2Plan->outStride.push_back(fftPlan->outStride[0]);
col2Plan->oDist = fftPlan->oDist;
- for (size_t index=1; index < fftPlan->length.size(); index++)
- {
- col2Plan->length.push_back(fftPlan->length[index]);
- col2Plan->inStride.push_back(col2Plan->iDist);
- col2Plan->iDist *= fftPlan->length[index];
- col2Plan->outStride.push_back(fftPlan->outStride[index]);
- }
-
- OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
- }
- else
- {
- if (fftPlan->cacheSize) {
- length0 += fftPlan->cacheSize & 0xFF;
- length1 += (fftPlan->cacheSize >> 8) & 0xFF;
- if (length0 * length1 > 2 * fftPlan->length[0])
- {
- length0 = clLengths[0];
- length1 = clLengths[1];
- }
- }
- else
- {
- if (fftPlan->length[0] == 131072) length1 += 1; //x0=0, y0=1 good for Cayman card
- else if (fftPlan->length[0] == 65536) length1 += 8; //x0=0, y0=8 good for Cypress card
- }
-
- if (clLengths[0] > Large1DThreshold)
- {//make no change for Huge 1D case
- length0 = clLengths[0];
- length1 = clLengths[1];
- }
-
- if (fftPlan->tmpBufSize==0 )
- {
- fftPlan->tmpBufSize = length0 * length1 *
- fftPlan->batchsize * fftPlan->ElementSize();
- for (size_t index=1; index < fftPlan->length.size(); index++)
- {
- fftPlan->tmpBufSize *= fftPlan->length[index];
- }
- }
- else
- {//make no change for cases passed from higher dimension
- length0 = clLengths[0];
- length1 = clLengths[1];
- }
-
- // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
- // transposed output
- OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
- _T( "CreateDefaultPlan Large1d column failed" ) );
-
- FFTPlan* colTPlan = NULL;
- lockRAII* colLock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+ col2Plan->gen = fftPlan->gen;
+ col2Plan->envelope = fftPlan->envelope;
- // current plan is to create intermediate buffer, packed and interleave
- // This is a column FFT, the first elements distance between each FFT is the distance of the first two
- // elements in the original buffer. Like a transpose of the matrix
- // we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
- //this part are common for both passes
- colTPlan->placeness = CLFFT_OUTOFPLACE;
- colTPlan->precision = fftPlan->precision;
- colTPlan->forwardScale = 1.0f;
- colTPlan->backwardScale = 1.0f;
- colTPlan->tmpBufSize = 0;
- colTPlan->batchsize = fftPlan->batchsize;
- colTPlan->bLdsComplex = fftPlan->bLdsComplex;
- colTPlan->uLdsFraction = fftPlan->uLdsFraction;
- colTPlan->ldsPadding = fftPlan->ldsPadding;
- colTPlan->gen = fftPlan->gen;
- colTPlan->envelope = fftPlan->envelope;
+ col2Plan->length.push_back(clLengths[1]);
- //Pass large1D flag to confirm we need multiply twiddle factor
- colTPlan->large1D = fftPlan->length[0];
+ bool integratedTranposes = true;
- colTPlan->length.push_back(clLengths[0]);
- if (fftPlan->large1D == 0)
+ if( colTPlan->blockCompute && (fftPlan->outStride[0] == 1) && clLengths[0] <= 256)
{
- // first Pass
- colTPlan->inputLayout = fftPlan->inputLayout;
- colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
- colTPlan->inStride[0] = fftPlan->inStride[0] * clLengths[0];
- colTPlan->outStride[0] = 1;
- colTPlan->iDist = fftPlan->iDist;
- colTPlan->oDist = length0 * length1;//fftPlan->length[0];
- colTPlan->inStride.push_back(fftPlan->inStride[0]);
- colTPlan->outStride.push_back(length1);//clLengths[1]);
+ col2Plan->blockCompute = true;
+ col2Plan->blockComputeType = BCT_R2C;
- for (size_t index=1; index < fftPlan->length.size(); index++)
- {
- colTPlan->length.push_back(fftPlan->length[index]);
- colTPlan->inStride.push_back(fftPlan->inStride[index]);
- // tmp buffer is tightly packed
- colTPlan->outStride.push_back(colTPlan->oDist);
- colTPlan->oDist *= fftPlan->length[index];
- }
+ col2Plan->placeness = CLFFT_OUTOFPLACE;
+ col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ col2Plan->inStride[0] = 1;
+ col2Plan->outStride[0] = length1;
+ col2Plan->iDist = length0 * length1;
+ col2Plan->inStride.push_back(length0);
+ col2Plan->outStride.push_back(1);
}
- else
+ else if( colTPlan->blockCompute && (fftPlan->outStride[0] == 1) )
{
- // second pass for huge 1D
- colTPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
- colTPlan->outputLayout = fftPlan->outputLayout;
- colTPlan->inStride[0] = fftPlan->length[1]*clLengths[0];
- colTPlan->outStride[0] = fftPlan->outStride[0];
- colTPlan->iDist = fftPlan->length[0];
- colTPlan->oDist = fftPlan->oDist;
- colTPlan->inStride.push_back(fftPlan->length[1]);
- colTPlan->outStride.push_back(fftPlan->outStride[0]*clLengths[1]);
+ integratedTranposes = false;
- for (size_t index=1; index < fftPlan->length.size(); index++)
- {
- colTPlan->length.push_back(fftPlan->length[index]);
- colTPlan->inStride.push_back(fftPlan->inStride[index]);
- colTPlan->outStride.push_back(fftPlan->outStride[index]);
- colTPlan->iDist *= fftPlan->length[index];
- }
+ col2Plan->placeness = CLFFT_INPLACE;
+ col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ col2Plan->inStride[0] = 1;
+ col2Plan->outStride[0] = 1;
+ col2Plan->iDist = length0 * length1;
+ col2Plan->oDist = length0 * length1;
+ col2Plan->inStride.push_back(length0);
+ col2Plan->outStride.push_back(length0);
}
-
- OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
-
- //another column FFT, size clLengths[0], batch clLengths[1], output without transpose
- OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
- _T( "CreateDefaultPlan large1D row failed" ) );
-
- FFTPlan* col2Plan = NULL;
- lockRAII* rowLock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
-
- // This is second column fft, intermediate buffer is packed and interleaved
- // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
-
- // common part for both passes
- col2Plan->outputLayout = fftPlan->outputLayout;
- col2Plan->precision = fftPlan->precision;
- col2Plan->forwardScale = fftPlan->forwardScale;
- col2Plan->backwardScale = fftPlan->backwardScale;
- col2Plan->tmpBufSize = 0;
- col2Plan->batchsize = fftPlan->batchsize;
- col2Plan->oDist = fftPlan->oDist;
- col2Plan->bLdsComplex = fftPlan->bLdsComplex;
- col2Plan->uLdsFraction = fftPlan->uLdsFraction;
- col2Plan->ldsPadding = fftPlan->ldsPadding;
- col2Plan->gen = fftPlan->gen;
- col2Plan->envelope = fftPlan->envelope;
-
- if (clLengths[0] > Large1DThreshold)
- //prepare for huge 1D
- col2Plan->large1D = fftPlan->length[0];
-
- col2Plan->length.push_back(clLengths[1]);
- col2Plan->outStride.push_back(fftPlan->outStride[0]);
-
- if (fftPlan->large1D == 0)
+ else
{
//first layer, large 1D from tmp buffer to output buffer
col2Plan->placeness = CLFFT_OUTOFPLACE;
col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
- col2Plan->inStride[0] = length1;//clLengths[1];
+ col2Plan->inStride[0] = 1;
col2Plan->outStride[0] = fftPlan->outStride[0] * clLengths[1];
col2Plan->iDist = length0 * length1; //fftPlan->length[0];
- col2Plan->inStride.push_back(1);
+ col2Plan->inStride.push_back(length0);
+ col2Plan->outStride.push_back(fftPlan->outStride[0]);
+ }
+ if(!integratedTranposes)
+ {
for (size_t index=1; index < fftPlan->length.size(); index++)
{
col2Plan->length.push_back(fftPlan->length[index]);
col2Plan->inStride.push_back(col2Plan->iDist);
- col2Plan->outStride.push_back(fftPlan->outStride[index]);
- col2Plan->iDist *= fftPlan->length[index];
+ col2Plan->outStride.push_back(col2Plan->oDist);
+ col2Plan->iDist *= fftPlan->length[index];
+ col2Plan->oDist *= fftPlan->length[index];
}
}
else
{
- //second layer, huge 1D from output buffer to output buffer
- col2Plan->placeness = CLFFT_INPLACE;
- col2Plan->inputLayout = fftPlan->outputLayout;
- col2Plan->inStride[0] = fftPlan->outStride[0] * clLengths[1];
- col2Plan->outStride[0] = col2Plan->inStride[0];
- col2Plan->iDist = fftPlan->oDist;
- col2Plan->inStride.push_back(fftPlan->outStride[0]);
-
for (size_t index=1; index < fftPlan->length.size(); index++)
{
col2Plan->length.push_back(fftPlan->length[index]);
- col2Plan->inStride.push_back(fftPlan->outStride[index]);
+ col2Plan->inStride.push_back(col2Plan->iDist);
col2Plan->outStride.push_back(fftPlan->outStride[index]);
+ col2Plan->iDist *= fftPlan->length[index];
}
}
+
OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
+
+ if(!integratedTranposes)
+ {
+ //Transpose
+ //tmp --> output
+ OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
+ _T( "CreateDefaultPlan Large1d transpose failed" ) );
+
+ FFTPlan* trans3Plan = NULL;
+ lockRAII* trans3Lock = NULL;
+ OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
+
+ trans3Plan->placeness = CLFFT_OUTOFPLACE;
+ trans3Plan->precision = fftPlan->precision;
+ trans3Plan->tmpBufSize = 0;
+ trans3Plan->batchsize = fftPlan->batchsize;
+ trans3Plan->envelope = fftPlan->envelope;
+ trans3Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ trans3Plan->outputLayout = fftPlan->outputLayout;
+ trans3Plan->inStride[0] = 1;
+ trans3Plan->inStride[1] = clLengths[0];
+ trans3Plan->outStride[0] = fftPlan->outStride[0];
+ trans3Plan->outStride[1] = clLengths[1] * fftPlan->outStride[0];
+ trans3Plan->iDist = fftPlan->length[0];
+ trans3Plan->oDist = fftPlan->oDist;
+ trans3Plan->gen = Transpose_GCN;
+ trans3Plan->transflag = true;
+
+ OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
+ _T( "BakePlan large1d trans plan failed" ) );
+ }
}
+ }
- fftPlan->baked = true;
- return CLFFT_SUCCESS;
+ fftPlan->baked = true;
+ return CLFFT_SUCCESS;
}
}
break;
@@ -1465,40 +1528,28 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
size_t length1 = fftPlan->length[1];
- if (fftPlan->cacheSize)
+ if (fftPlan->length[0]==256 && fftPlan->length[1]==256)
{
- length0 += fftPlan->cacheSize & 0xFF;
- length1 += (fftPlan->cacheSize >> 8) & 0xFF;
- if (length0 * length1 > 2 * fftPlan->length[0] * fftPlan->length[1])
- {
- length0 = fftPlan->length[0];
- length1 = fftPlan->length[1];
- }
+ length0 += 8;
+ length1 += 1;
}
- else
+ else if (fftPlan->length[0]==512 && fftPlan->length[1]==512)
{
- if (fftPlan->length[0]==256 && fftPlan->length[1]==256)
- {
- length0 += 8;
- length1 += 1;
- }
- else if (fftPlan->length[0]==512 && fftPlan->length[1]==512)
- {
- length0 += 1;
- length1 += 1;//length1 += 0;
- }
- else if (fftPlan->length[0]==1024 && fftPlan->length[1]==512)
- {
- length0 += 2;
- length1 += 2;//length1 += 0;
- }
- else if (fftPlan->length[0]==1024 && fftPlan->length[1]==1024)
- {
- length0 += 1;
- length1 += 1;//length1 += 0;
- }
+ length0 += 1;
+ length1 += 1;//length1 += 0;
+ }
+ else if (fftPlan->length[0]==1024 && fftPlan->length[1]==512)
+ {
+ length0 += 2;
+ length1 += 2;//length1 += 0;
+ }
+ else if (fftPlan->length[0]==1024 && fftPlan->length[1]==1024)
+ {
+ length0 += 1;
+ length1 += 1;//length1 += 0;
}
+
if (fftPlan->length[0] > Large1DThreshold ||
fftPlan->length[1] > Large1DThreshold)
fftPlan->large2D = true;
@@ -1571,9 +1622,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
rowPlan->forwardScale = 1.0f;
rowPlan->backwardScale = 1.0f;
rowPlan->tmpBufSize = 0;
- rowPlan->bLdsComplex = fftPlan->bLdsComplex;
- rowPlan->uLdsFraction = fftPlan->uLdsFraction;
- rowPlan->ldsPadding = fftPlan->ldsPadding;
+
rowPlan->gen = fftPlan->gen;
rowPlan->envelope = fftPlan->envelope;
rowPlan->batchsize = fftPlan->batchsize;
@@ -1610,7 +1659,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
transPlanX->inputLayout = fftPlan->outputLayout;
transPlanX->precision = fftPlan->precision;
transPlanX->tmpBufSize = 0;
- transPlanX->gen = Transpose;
+ transPlanX->gen = Transpose_VLIW;
transPlanX->envelope = fftPlan->envelope;
transPlanX->batchsize = fftPlan->batchsize;
transPlanX->inStride[0] = fftPlan->outStride[0];
@@ -1689,9 +1738,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
colPlan->forwardScale = fftPlan->forwardScale;
colPlan->backwardScale = fftPlan->backwardScale;
colPlan->tmpBufSize = 0;
- colPlan->bLdsComplex = fftPlan->bLdsComplex;
- colPlan->uLdsFraction = fftPlan->uLdsFraction;
- colPlan->ldsPadding = fftPlan->ldsPadding;
+
colPlan->gen = fftPlan->gen;
colPlan->envelope = fftPlan->envelope;
colPlan->batchsize = fftPlan->batchsize;
@@ -1739,7 +1786,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
transPlanY->oDist = fftPlan->oDist;
transPlanY->precision = fftPlan->precision;
transPlanY->tmpBufSize = 0;
- transPlanY->gen = Transpose;
+ transPlanY->gen = Transpose_VLIW;
transPlanY->envelope = fftPlan->envelope;
transPlanY->batchsize = fftPlan->batchsize;
transPlanY->transflag = true;
@@ -1793,9 +1840,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
rowPlan->forwardScale = 1.0f;
rowPlan->backwardScale = 1.0f;
rowPlan->tmpBufSize = fftPlan->tmpBufSize;
- rowPlan->bLdsComplex = fftPlan->bLdsComplex;
- rowPlan->uLdsFraction = fftPlan->uLdsFraction;
- rowPlan->ldsPadding = fftPlan->ldsPadding;
+
rowPlan->gen = fftPlan->gen;
rowPlan->envelope = fftPlan->envelope;
@@ -1855,9 +1900,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
colPlan->forwardScale = fftPlan->forwardScale;
colPlan->backwardScale = fftPlan->backwardScale;
colPlan->tmpBufSize = fftPlan->tmpBufSize;
- colPlan->bLdsComplex = fftPlan->bLdsComplex;
- colPlan->uLdsFraction = fftPlan->uLdsFraction;
- colPlan->ldsPadding = fftPlan->ldsPadding;
+
colPlan->gen = fftPlan->gen;
colPlan->envelope = fftPlan->envelope;
@@ -1951,9 +1994,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
colPlan->forwardScale = 1.0f;
colPlan->backwardScale = 1.0f;
colPlan->tmpBufSize = fftPlan->tmpBufSize;
- colPlan->bLdsComplex = fftPlan->bLdsComplex;
- colPlan->uLdsFraction = fftPlan->uLdsFraction;
- colPlan->ldsPadding = fftPlan->ldsPadding;
+
colPlan->gen = fftPlan->gen;
colPlan->envelope = fftPlan->envelope;
@@ -1992,9 +2033,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
rowPlan->forwardScale = fftPlan->forwardScale;
rowPlan->backwardScale = fftPlan->backwardScale;
rowPlan->tmpBufSize = fftPlan->tmpBufSize;
- rowPlan->bLdsComplex = fftPlan->bLdsComplex;
- rowPlan->uLdsFraction = fftPlan->uLdsFraction;
- rowPlan->ldsPadding = fftPlan->ldsPadding;
+
rowPlan->gen = fftPlan->gen;
rowPlan->envelope = fftPlan->envelope;
@@ -2052,9 +2091,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
rowPlan->forwardScale = 1.0f;
rowPlan->backwardScale = 1.0f;
rowPlan->tmpBufSize = fftPlan->tmpBufSize;
- rowPlan->bLdsComplex = fftPlan->bLdsComplex;
- rowPlan->uLdsFraction = fftPlan->uLdsFraction;
- rowPlan->ldsPadding = fftPlan->ldsPadding;
+
rowPlan->gen = fftPlan->gen;
rowPlan->envelope = fftPlan->envelope;
@@ -2110,9 +2147,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
colPlan->forwardScale = fftPlan->forwardScale;
colPlan->backwardScale = fftPlan->backwardScale;
colPlan->tmpBufSize = fftPlan->tmpBufSize;
- colPlan->bLdsComplex = fftPlan->bLdsComplex;
- colPlan->uLdsFraction = fftPlan->uLdsFraction;
- colPlan->ldsPadding = fftPlan->ldsPadding;
+
colPlan->gen = fftPlan->gen;
colPlan->envelope = fftPlan->envelope;
@@ -2164,9 +2199,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
xyPlan->forwardScale = 1.0f;
xyPlan->backwardScale = 1.0f;
xyPlan->tmpBufSize = fftPlan->tmpBufSize;
- xyPlan->bLdsComplex = fftPlan->bLdsComplex;
- xyPlan->uLdsFraction = fftPlan->uLdsFraction;
- xyPlan->ldsPadding = fftPlan->ldsPadding;
+
xyPlan->gen = fftPlan->gen;
xyPlan->envelope = fftPlan->envelope;
@@ -2219,9 +2252,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
colPlan->forwardScale = fftPlan->forwardScale;
colPlan->backwardScale = fftPlan->backwardScale;
colPlan->tmpBufSize = fftPlan->tmpBufSize;
- colPlan->bLdsComplex = fftPlan->bLdsComplex;
- colPlan->uLdsFraction = fftPlan->uLdsFraction;
- colPlan->ldsPadding = fftPlan->ldsPadding;
+
colPlan->gen = fftPlan->gen;
colPlan->envelope = fftPlan->envelope;
@@ -2287,9 +2318,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
colPlan->forwardScale = 1.0f;
colPlan->backwardScale = 1.0f;
colPlan->tmpBufSize = fftPlan->tmpBufSize;
- colPlan->bLdsComplex = fftPlan->bLdsComplex;
- colPlan->uLdsFraction = fftPlan->uLdsFraction;
- colPlan->ldsPadding = fftPlan->ldsPadding;
+
colPlan->gen = fftPlan->gen;
colPlan->envelope = fftPlan->envelope;
@@ -2337,9 +2366,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
xyPlan->forwardScale = fftPlan->forwardScale;
xyPlan->backwardScale = fftPlan->backwardScale;
xyPlan->tmpBufSize = fftPlan->tmpBufSize;
- xyPlan->bLdsComplex = fftPlan->bLdsComplex;
- xyPlan->uLdsFraction = fftPlan->uLdsFraction;
- xyPlan->ldsPadding = fftPlan->ldsPadding;
+
xyPlan->gen = fftPlan->gen;
xyPlan->envelope = fftPlan->envelope;
@@ -2391,9 +2418,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
xyPlan->forwardScale = 1.0f;
xyPlan->backwardScale = 1.0f;
xyPlan->tmpBufSize = fftPlan->tmpBufSize;
- xyPlan->bLdsComplex = fftPlan->bLdsComplex;
- xyPlan->uLdsFraction = fftPlan->uLdsFraction;
- xyPlan->ldsPadding = fftPlan->ldsPadding;
+
xyPlan->gen = fftPlan->gen;
xyPlan->envelope = fftPlan->envelope;
@@ -2431,9 +2456,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
colPlan->forwardScale = fftPlan->forwardScale;
colPlan->backwardScale = fftPlan->backwardScale;
colPlan->tmpBufSize = fftPlan->tmpBufSize;
- colPlan->bLdsComplex = fftPlan->bLdsComplex;
- colPlan->uLdsFraction = fftPlan->uLdsFraction;
- colPlan->ldsPadding = fftPlan->ldsPadding;
+
colPlan->gen = fftPlan->gen;
colPlan->envelope = fftPlan->envelope;
@@ -2521,97 +2544,8 @@ clfftStatus FFTPlan::ConstructAndEnqueueConstantBuffers( cl_command_queue* commQ
cb_t ConstantBufferParams [CLFFT_CB_SIZE];
memset (& ConstantBufferParams, 0, sizeof (ConstantBufferParams));
- cl_uint nY = 1;
- cl_uint nZ = 0;
- cl_uint nW = 0;
- cl_uint n5 = 0;
-
- switch( /*fftPlan->*/length.size() )
- {
- case 1:
- nY = std::max<cl_uint> (1, cl_uint (/*fftPlan->*/batchsize));
- break;
-
- case 2:
- nY = (cl_uint)/*fftPlan->*/length[DimY];
- nZ = std::max<cl_uint> (1, cl_uint (/*fftPlan->*/batchsize));
- break;
-
- case 3:
- nY = (cl_uint)/*fftPlan->*/length[DimY];
- nZ = (cl_uint)/*fftPlan->*/length[DimZ];
- nW = std::max<cl_uint> (1, cl_uint (/*fftPlan->*/batchsize));
- break;
-
- case 4:
- nY = (cl_uint)/*fftPlan->*/length[DimY];
- nZ = (cl_uint)/*fftPlan->*/length[DimZ];
- nW = (cl_uint)/*fftPlan->*/length[DimW];
- n5 = std::max<cl_uint> (1, cl_uint (/*fftPlan->*/batchsize));
- break;
- }
- ConstantBufferParams[CLFFT_CB_NY ].u = nY;
- ConstantBufferParams[CLFFT_CB_NZ ].u = nZ;
- ConstantBufferParams[CLFFT_CB_NW ].u = nW;
- ConstantBufferParams[CLFFT_CB_N5 ].u = n5;
-
- assert (/*fftPlan->*/inStride.size() == /*fftPlan->*/outStride.size());
-
- switch (/*fftPlan->*/inStride.size()) {
- case 1:
- ConstantBufferParams[CLFFT_CB_ISX].u = cl_uint (/*fftPlan->*/inStride[0]);
- ConstantBufferParams[CLFFT_CB_ISY].u = cl_uint (/*fftPlan->*/iDist);
- break;
-
- case 2:
- ConstantBufferParams[CLFFT_CB_ISX].u = cl_uint (/*fftPlan->*/inStride[0]);
- ConstantBufferParams[CLFFT_CB_ISY].u = cl_uint (/*fftPlan->*/inStride[1]);
- ConstantBufferParams[CLFFT_CB_ISZ].u = cl_uint (/*fftPlan->*/iDist);
- break;
-
- case 3:
- ConstantBufferParams[CLFFT_CB_ISX].u = cl_uint (/*fftPlan->*/inStride[0]);
- ConstantBufferParams[CLFFT_CB_ISY].u = cl_uint (/*fftPlan->*/inStride[1]);
- ConstantBufferParams[CLFFT_CB_ISZ].u = cl_uint (/*fftPlan->*/inStride[2]);
- ConstantBufferParams[CLFFT_CB_ISW].u = cl_uint (/*fftPlan->*/iDist);
- break;
-
- case 4:
- ConstantBufferParams[CLFFT_CB_ISX].u = cl_uint (/*fftPlan->*/inStride[0]);
- ConstantBufferParams[CLFFT_CB_ISY].u = cl_uint (/*fftPlan->*/inStride[1]);
- ConstantBufferParams[CLFFT_CB_ISZ].u = cl_uint (/*fftPlan->*/inStride[2]);
- ConstantBufferParams[CLFFT_CB_ISW].u = cl_uint (/*fftPlan->*/inStride[3]);
- ConstantBufferParams[CLFFT_CB_IS5].u = cl_uint (/*fftPlan->*/iDist);
- break;
- }
+ ConstantBufferParams[0].u = std::max<cl_uint> (1, cl_uint (/*fftPlan->*/batchsize));
- switch (/*fftPlan->*/outStride.size()) {
- case 1:
- ConstantBufferParams[CLFFT_CB_OSX].u = cl_uint (/*fftPlan->*/outStride[0]);
- ConstantBufferParams[CLFFT_CB_OSY].u = cl_uint (/*fftPlan->*/oDist);
- break;
-
- case 2:
- ConstantBufferParams[CLFFT_CB_OSX].u = cl_uint (/*fftPlan->*/outStride[0]);
- ConstantBufferParams[CLFFT_CB_OSY].u = cl_uint (/*fftPlan->*/outStride[1]);
- ConstantBufferParams[CLFFT_CB_OSZ].u = cl_uint (/*fftPlan->*/oDist);
- break;
-
- case 3:
- ConstantBufferParams[CLFFT_CB_OSX].u = cl_uint (/*fftPlan->*/outStride[0]);
- ConstantBufferParams[CLFFT_CB_OSY].u = cl_uint (/*fftPlan->*/outStride[1]);
- ConstantBufferParams[CLFFT_CB_OSZ].u = cl_uint (/*fftPlan->*/outStride[2]);
- ConstantBufferParams[CLFFT_CB_OSW].u = cl_uint (/*fftPlan->*/oDist);
- break;
-
- case 4:
- ConstantBufferParams[CLFFT_CB_OSX].u = cl_uint (/*fftPlan->*/outStride[0]);
- ConstantBufferParams[CLFFT_CB_OSY].u = cl_uint (/*fftPlan->*/outStride[1]);
- ConstantBufferParams[CLFFT_CB_OSZ].u = cl_uint (/*fftPlan->*/outStride[2]);
- ConstantBufferParams[CLFFT_CB_OSW].u = cl_uint (/*fftPlan->*/outStride[3]);
- ConstantBufferParams[CLFFT_CB_OS5].u = cl_uint (/*fftPlan->*/oDist);
- break;
- }
OPENCL_V(clEnqueueWriteBuffer( *commQueueFFT,
/*fftPlan->*/const_buffer,
@@ -2720,11 +2654,6 @@ clfftStatus clfftWritePlanToDisk( clfftPlanHandle plan_handle, const char* filen
// clfftReadPlanFromDisk will read the hex back in as float
planfile << " gen " << plan->gen;
- planfile << " bLdsComplex " << plan->bLdsComplex;
- planfile << " ldsPadding " << plan->ldsPadding;
- planfile << " uLdsFraction " << plan->uLdsFraction;
- planfile << " large1D_Xfactor " << plan->large1D_Xfactor;
- planfile << " cacheSize " << plan->cacheSize;
planfile << " tmpBufSize " << plan->tmpBufSize;
planfile << " large1D " << plan->large1D;
planfile << " large2D " << plan->large2D;
@@ -2994,26 +2923,6 @@ clfftStatus clfftReadPlanFromDisk( clfftPlanHandle plan_handle, const char* file
planfile >> gen_read;
plan->gen = static_cast<clfftGenerators>(gen_read);
}
- else if( next_word == "bLdsComplex" )
- {
- planfile >> plan->bLdsComplex;
- }
- else if( next_word == "ldsPadding" )
- {
- planfile >> plan->ldsPadding;
- }
- else if( next_word == "uLdsFraction" )
- {
- planfile >> plan->uLdsFraction;
- }
- else if( next_word == "large1D_Xfactor" )
- {
- planfile >> plan->large1D_Xfactor;
- }
- else if( next_word == "cacheSize" )
- {
- planfile >> plan->cacheSize;
- }
else if( next_word == "tmpBufSize" )
{
planfile >> plan->tmpBufSize;
@@ -3097,6 +3006,8 @@ clfftStatus clfftDestroyPlan( clfftPlanHandle* plHandle )
clfftDestroyPlan( &fftPlan->planTZ );
if( fftPlan->planRCcopy )
clfftDestroyPlan( &fftPlan->planRCcopy );
+ if( fftPlan->planCopy )
+ clfftDestroyPlan( &fftPlan->planCopy );
fftRepo.deletePlan( plHandle );
@@ -3241,7 +3152,7 @@ clfftStatus FFTPlan::ReleaseBuffers ()
result = tmp;
}
- if( NULL != intBuffer )
+ if( (NULL != intBuffer) && libCreatedIntBuffer )
{
tmp = static_cast< clfftStatus >( clReleaseMemObject( intBuffer ) );
intBuffer = NULL;
@@ -3256,6 +3167,14 @@ clfftStatus FFTPlan::ReleaseBuffers ()
if( CLFFT_SUCCESS == result )
result = tmp;
}
+
+ if( NULL != intBufferC2R )
+ {
+ tmp = static_cast< clfftStatus >( clReleaseMemObject( intBufferC2R ) );
+ intBufferC2R = NULL;
+ if( CLFFT_SUCCESS == result )
+ result = tmp;
+ }
return result;
}
@@ -3264,10 +3183,11 @@ clfftStatus FFTPlan::GetWorkSizes (std::vector<size_t> & globalws, std::vector<
{
switch(gen)
{
- case Stockham: return GetWorkSizesPvt<Stockham>(globalws, localws);
- case Transpose: return GetWorkSizesPvt<Transpose>(globalws, localws);
- case Copy: return GetWorkSizesPvt<Copy>(globalws, localws);
- default: assert(false); return CLFFT_NOTIMPLEMENTED;
+ case Stockham: return GetWorkSizesPvt<Stockham>( globalws, localws );
+ case Transpose_VLIW: return GetWorkSizesPvt<Transpose_VLIW>( globalws, localws );
+ case Transpose_GCN: return GetWorkSizesPvt<Transpose_GCN>( globalws, localws );
+ case Copy: return GetWorkSizesPvt<Copy>( globalws, localws );
+ default: assert( false ); return CLFFT_NOTIMPLEMENTED;
}
}
@@ -3276,8 +3196,9 @@ clfftStatus FFTPlan::GetKernelGenKey (FFTKernelGenKeyParams & params) const
switch(gen)
{
case Stockham: return GetKernelGenKeyPvt<Stockham>(params);
- case Transpose: return GetKernelGenKeyPvt<Transpose>(params);
- case Copy: return GetKernelGenKeyPvt<Copy>(params);
+ case Transpose_VLIW: return GetKernelGenKeyPvt<Transpose_VLIW>(params);
+ case Transpose_GCN: return GetKernelGenKeyPvt<Transpose_GCN>( params );
+ case Copy: return GetKernelGenKeyPvt<Copy>( params );
default: assert(false); return CLFFT_NOTIMPLEMENTED;
}
}
@@ -3287,8 +3208,9 @@ clfftStatus FFTPlan::GenerateKernel (FFTRepo & fftRepo, const cl_command_queue
switch(gen)
{
case Stockham: return GenerateKernelPvt<Stockham>(fftRepo, commQueueFFT);
- case Transpose: return GenerateKernelPvt<Transpose>(fftRepo, commQueueFFT);
- case Copy: return GenerateKernelPvt<Copy>(fftRepo, commQueueFFT);
+ case Transpose_VLIW: return GenerateKernelPvt<Transpose_VLIW>(fftRepo, commQueueFFT);
+ case Transpose_GCN: return GenerateKernelPvt<Transpose_GCN>( fftRepo, commQueueFFT );
+ case Copy: return GenerateKernelPvt<Copy>( fftRepo, commQueueFFT );
default: assert(false); return CLFFT_NOTIMPLEMENTED;
}
}
@@ -3298,16 +3220,22 @@ clfftStatus FFTPlan::GetMax1DLength (size_t *longest ) const
switch(gen)
{
case Stockham: return GetMax1DLengthPvt<Stockham>(longest);
- //No restriction for transpose kernel
- case Transpose: *longest = 4096; return CLFFT_SUCCESS;
- case Copy: *longest = 4096; return CLFFT_SUCCESS;
+ //No restriction for Transpose_VLIW kernel
+ case Transpose_VLIW: *longest = 4096; return CLFFT_SUCCESS;
+ case Transpose_GCN: *longest = 4096; return CLFFT_SUCCESS;
+ case Copy: *longest = 4096; return CLFFT_SUCCESS;
default: assert(false); return CLFFT_NOTIMPLEMENTED;
}
}
clfftStatus FFTPlan::GetEnvelope (const FFTEnvelope ** ppEnvelope) const
{
- if(&envelope == NULL) assert(false);
+ if( &envelope == NULL )
+ {
+ assert( false );
+ return CLFFT_NOTIMPLEMENTED;
+ }
+
*ppEnvelope = &envelope;
return CLFFT_SUCCESS;
}
diff --git a/src/library/plan.h b/src/library/plan.h
index acafa07..56f5df4 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -70,57 +70,25 @@ namespace ARBITRARY {
// The latter uses half as much LDS space, so twice as many wavefronts can be run
// in parallel.
- TWIDDLE_DEE = 4,
- // 4 bits per row of matrix.
+ TWIDDLE_DEE = 8,
+ // number of bits per row of matrix.
};
+
};
-enum eConstantBuffer {
- /* Layout of a constant buffer passed to the generated kernel
- * This needs to be know by the kernel generator and by the
- * framework code that creates the buffer and fills it at execution time.
- */
-
- // [0] uint NY This is the batchsize for a 1D Array,
- // or the 2nd (Y dimension) for a 2D.
- // [1] uint NZ This is the batchsize for a 2D Array,
- // or the 3rd (Z dimension) for a 3D.
- // [2] uint NW This is the batchsize for a 3D Array,
- // or the 4th (W dimension) for a 4D.
- // [3] uint N5 This is the batchsize for a 4D Array,
- //
- CLFFT_CB_NY = 0,
- CLFFT_CB_NZ,
- CLFFT_CB_NW,
- CLFFT_CB_N5,
-
- // [4] uint ISX Input data X stride (== 1 for row-major compact data)
- // [5] uint ISY Input data Y stride (== X for row-major compact data)
- // [6] uint ISZ Input data Z stride (== X*Y for row-major compact data)
- // [7] uint ISW Input data W stride (== X*Y*Z for row-major compact data)
- // [8] uint IS5 Input data 5th stride
- //
- CLFFT_CB_ISX,
- CLFFT_CB_ISY,
- CLFFT_CB_ISZ,
- CLFFT_CB_ISW,
- CLFFT_CB_IS5,
-
- // [9] uint OSX Output data X stride
- // [10] uint OSY Output data Y stride
- // [11] uint OSZ Output data Z stride
- // [12] uint OSW Output data W stride
- // [13] uint OS5 Output data 5th stride
- //
- CLFFT_CB_OSX,
- CLFFT_CB_OSY,
- CLFFT_CB_OSZ,
- CLFFT_CB_OSW,
- CLFFT_CB_OS5,
- CLFFT_CB_SIZE = 32,
+enum BlockComputeType
+{
+ BCT_C2C, // Column to column
+ BCT_C2R, // Column to row
+ BCT_R2C, // Row to column
};
+
+
+#define CLFFT_CB_SIZE 32
+#define CLFFT_MAX_INTERNAL_DIM 16
+
struct FFTKernelGenKeyParams {
/*
* This structure distills a subset of the fftPlan data,
@@ -129,10 +97,10 @@ struct FFTKernelGenKeyParams {
* been compiled.
*/
size_t fft_DataDim; // Dimensionality of the data
- size_t fft_N[5]; // [0] is FFT size, e.g. 1024
+ size_t fft_N[CLFFT_MAX_INTERNAL_DIM]; // [0] is FFT size, e.g. 1024
// This must be <= size of LDS!
- size_t fft_inStride [5]; // input strides
- size_t fft_outStride[5]; // output strides
+ size_t fft_inStride [CLFFT_MAX_INTERNAL_DIM]; // input strides
+ size_t fft_outStride[CLFFT_MAX_INTERNAL_DIM]; // output strides
clfftResultLocation fft_placeness;
clfftLayout fft_inputLayout;
@@ -145,18 +113,55 @@ struct FFTKernelGenKeyParams {
size_t fft_LDSsize; // Limit the use of LDS to this many bytes.
size_t fft_R; // # of complex values to keep in working registers
// SIMD size * R must be <= size of LDS!
- size_t fft_MaxRadix; // Limit the radix to this value.
+
size_t fft_MaxWorkGroupSize; // Limit for work group size
- bool fft_LdsComplex; // If true, store complex values in LDS memory
- // If false, store scalare values in LDS.
- // Generally, false will provide more efficient kernels,
- // but not always.
- // see FFTPlan::bLdsComplex and ARBITRARY::LDS_COMPLEX
- bool fft_ldsPadding; // default padding is false
+
bool fft_3StepTwiddle; // This is one pass of the "3-step" algorithm;
// so extra twiddles are applied on output.
- bool fft_UseFMA; // *** TODO
+ bool fft_twiddleFront; // do twiddle scaling at the beginning pass
+
+
bool fft_RCsimple;
+
+ bool transOutHorizontal; // tiles traverse the output buffer in horizontal direction
+
+ bool blockCompute;
+ BlockComputeType blockComputeType;
+ size_t blockSIMD;
+ size_t blockLDS;
+
+
+ // Default constructor
+ FFTKernelGenKeyParams()
+ {
+ fft_DataDim = 0;
+ for(int i=0; i<CLFFT_MAX_INTERNAL_DIM; i++)
+ {
+ fft_N[i] = 0;
+ fft_inStride[i] = 0;
+ fft_outStride[i] = 0;
+ }
+
+ fft_placeness = CLFFT_OUTOFPLACE;
+ fft_inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ fft_outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ fft_precision = CLFFT_SINGLE;
+ fft_fwdScale = fft_backScale = 0.0;
+ fft_SIMD = 0;
+ fft_LDSsize = 0;
+ fft_R = 0;
+ fft_MaxWorkGroupSize = 0;
+ fft_3StepTwiddle = false;
+ fft_twiddleFront = false;
+
+ transOutHorizontal = false;
+
+ fft_RCsimple = false;
+ blockCompute = false;
+ blockComputeType = BCT_C2C;
+ blockSIMD = 0;
+ blockLDS = 0;
+ }
};
@@ -185,7 +190,7 @@ struct FFTEnvelope {
, limit_Dimensions (0)
, limit_WorkGroupSize (0)
{
- ::memset (& limit_Size, 0, sizeof (limit_Size));
+ ::memset( &limit_Size, 0, sizeof( limit_Size ) );
}
};
@@ -202,14 +207,14 @@ class FFTPlan
clfftStatus GetKernelGenKeyPvt (FFTKernelGenKeyParams & params) const;
template <clfftGenerators G>
- clfftStatus GenerateKernelPvt (FFTRepo& fftRepo, const cl_command_queue commQueueFFT ) const;
+ clfftStatus GenerateKernelPvt (FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const;
template <clfftGenerators G>
clfftStatus GetMax1DLengthPvt (size_t *longest ) const;
public:
+
bool baked;
- bool readFromFile;
// Properties provided by the user.
clfftDim dim;
@@ -227,8 +232,9 @@ public:
// TODO, change this logic for handling multiple GPUs/devices
cl_device_id bakeDevice;
+ // Disabling devices member, plan has 1-on-1 mapping with single device as identified by bakeDevice
// Devices that the user specified in the context passed to the create function
- std::vector< cl_device_id > devices;
+ // std::vector< cl_device_id > devices;
// Length of the FFT in each dimension
std::vector< size_t > length;
@@ -239,14 +245,11 @@ public:
// Hardware Limits
FFTEnvelope envelope;
- // Performance Tuning parameters
- bool bLdsComplex; // see ARBITRARY::LDS_COMPLEX
- bool ldsPadding; // see ARBITRARY::LDS_PADDING
- unsigned uLdsFraction; // see ARBITRARY::LDS_FRACTION_IDEAL
// Reserved copy for large 1d, 2d, and 3d plan
size_t tmpBufSize;
cl_mem intBuffer;
+ bool libCreatedIntBuffer;
// for RC copies
size_t tmpBufSizeRC;
@@ -256,21 +259,23 @@ public:
size_t tmpBufSizeC2R;
cl_mem intBufferC2R;
- //extra cache size for 2d and 3d
- size_t cacheSize;
+
size_t large1D;
bool large2D;
- size_t large1D_Xfactor;
+ bool twiddleFront;
+
clfftPlanHandle planX;
clfftPlanHandle planY;
clfftPlanHandle planZ;
bool transflag;
+ bool transOutHorizontal;
clfftPlanHandle planTX;
clfftPlanHandle planTY;
clfftPlanHandle planTZ; //reserve for 3D transpose
clfftPlanHandle planRCcopy;
+ clfftPlanHandle planCopy;
// Plan resources
//
@@ -279,23 +284,25 @@ public:
// Generator type
clfftGenerators gen;
- // stored binaries
- size_t number_of_devices;
-
-//TODO caching kernel binaries for later reload
-#if 0
- std::unique_ptr<size_t[]> binary_sizes;
- std::vector< std::unique_ptr<char[]> > binaries;
-#endif
// Real-Complex simple flag
// if this is set we do real to-and-from full complex using simple algorithm
// where imaginary of input is set to zero in forward and imaginary not written in backward
bool RCsimple;
+
+ // User created plan
+ bool userPlan;
+
+ // A flag to say that blocked FFTs are going to be performed
+ // It can only be one of these: column to row, row to column or column to column
+ // row to row is just the normal case where blocking is not needed
+ bool blockCompute;
+ BlockComputeType blockComputeType;
+
+
FFTPlan ()
: baked (false)
- , readFromFile (false)
, dim (CLFFT_1D)
, inputLayout (CLFFT_COMPLEX_INTERLEAVED)
, outputLayout (CLFFT_COMPLEX_INTERLEAVED)
@@ -309,28 +316,29 @@ public:
, batchsize (1)
, tmpBufSize (0)
, intBuffer( NULL )
+ , libCreatedIntBuffer(false)
, tmpBufSizeRC (0)
, intBufferRC( NULL )
, tmpBufSizeC2R (0)
, intBufferC2R( NULL )
, large1D(0)
, large2D(false)
+ , twiddleFront(false)
, planX( 0 )
, planY( 0 )
, planZ( 0 )
, transflag(false)
+ , transOutHorizontal(false)
, RCsimple(false)
+ , userPlan(false)
+ , blockCompute(false)
+ , blockComputeType(BCT_C2C)
, planTX( 0 )
, planTY( 0 )
, planTZ( 0 )
, planRCcopy(0)
+ , planCopy(0)
, const_buffer( NULL )
- , bLdsComplex (ARBITRARY::LDS_COMPLEX)
- , ldsPadding (ARBITRARY::LDS_PADDING)
- , uLdsFraction (0/*ARBITRARY::LDS_FRACTION_IDEAL*/)
- , large1D_Xfactor(0)
- , cacheSize(0)
- , number_of_devices(0)
, gen(Stockham)
{};
diff --git a/src/library/private.h b/src/library/private.h
index 7c00ca3..000ab65 100644
--- a/src/library/private.h
+++ b/src/library/private.h
@@ -270,8 +270,6 @@ inline tstring clfftErrorStatusAsString( const cl_int& status )
// This is used to either wrap an OpenCL function call, or to explicitly check a variable for an OpenCL error condition.
// If an error occurs, we issue a return statement to exit the calling function.
-#if defined( _DEBUG )
-
#define OPENCL_V( fn, msg ) \
{ \
clfftStatus vclStatus = static_cast< clfftStatus >( fn ); \
@@ -290,23 +288,6 @@ inline tstring clfftErrorStatusAsString( const cl_int& status )
} \
}
-#else
-
-#define OPENCL_V( fn, msg ) \
-{ \
- clfftStatus vclStatus = static_cast< clfftStatus >( fn ); \
- switch( vclStatus ) \
- { \
- case CL_SUCCESS: /**< No error */ \
- break; \
- default: \
- { \
- return vclStatus; \
- } \
- } \
-}
-#endif
-
static inline bool IsPo2 (size_t u) {
return (u != 0) && (0 == (u & (u-1)));
}
@@ -353,8 +334,7 @@ CLFFTAPI clfftStatus clfftWritePlanToDisk( clfftPlanHandle plHandle, const char*
*/
CLFFTAPI clfftStatus clfftReadPlanFromDisk( clfftPlanHandle plHandle, const char* filename );
-/* internal api to set up some plan paramters */
-CLFFTAPI clfftStatus clfftSetInternal( const clfftPlanHandle plHandle, void* dataInternal );
+
#ifdef __cplusplus
}
diff --git a/src/library/repo.cpp b/src/library/repo.cpp
index 0b6e532..6d44985 100644
--- a/src/library/repo.cpp
+++ b/src/library/repo.cpp
@@ -91,11 +91,12 @@ clfftStatus FFTRepo::releaseResources( )
return CLFFT_SUCCESS;
}
-clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const std::string& kernel, const cl_context& context )
+clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const std::string& kernel, const cl_device_id &device, const cl_context& planContext )
{
scopedLock sLock( lockRepo, _T( "setProgramCode" ) );
- std::pair<FFTKernelGenKeyParams, cl_context> Params = std::make_pair(fftParam, context);
+ ClPair clPair = std::make_pair(planContext, device);
+ std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
fftRepoKey key = std::make_pair( gen, Params );
@@ -126,11 +127,12 @@ clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelG
return CLFFT_SUCCESS;
}
-clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, std::string& kernel, const cl_context& context )
+clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, std::string& kernel, const cl_device_id &device, const cl_context& planContext )
{
scopedLock sLock( lockRepo, _T( "getProgramCode" ) );
- std::pair<FFTKernelGenKeyParams, cl_context> Params = std::make_pair(fftParam, context);
+ ClPair clPair = std::make_pair(planContext, device);
+ std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
fftRepoKey key = std::make_pair( gen, Params );
fftRepo_iterator pos = mapFFTs.find( key);
@@ -142,11 +144,12 @@ clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelG
}
clfftStatus FFTRepo::setProgramEntryPoints( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam,
- const char * kernel_fwd, const char * kernel_back, const cl_context& context )
+ const char * kernel_fwd, const char * kernel_back, const cl_device_id &device, const cl_context& planContext )
{
scopedLock sLock( lockRepo, _T( "setProgramEntryPoints" ) );
- std::pair<FFTKernelGenKeyParams, cl_context> Params = std::make_pair(fftParam, context);
+ ClPair clPair = std::make_pair(planContext, device);
+ std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
fftRepoKey key = std::make_pair( gen, Params );
fftRepoValue& fft = mapFFTs[ key ];
@@ -157,11 +160,12 @@ clfftStatus FFTRepo::setProgramEntryPoints( const clfftGenerators gen, const FFT
}
clfftStatus FFTRepo::getProgramEntryPoint( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam,
- clfftDirection dir, std::string& kernel, const cl_context& context )
+ clfftDirection dir, std::string& kernel, const cl_device_id &device, const cl_context& planContext )
{
scopedLock sLock( lockRepo, _T( "getProgramEntryPoint" ) );
- std::pair<FFTKernelGenKeyParams, cl_context> Params = std::make_pair(fftParam, context);
+ ClPair clPair = std::make_pair(planContext, device);
+ std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
fftRepoKey key = std::make_pair( gen, Params );
fftRepo_iterator pos = mapFFTs.find( key );
@@ -186,7 +190,7 @@ clfftStatus FFTRepo::getProgramEntryPoint( const clfftGenerators gen, const FFTK
return CLFFT_SUCCESS;
}
-clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& prog )
+clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& prog, const cl_device_id &device, const cl_context& planContext )
{
scopedLock sLock( lockRepo, _T( "setclProgram" ) );
@@ -196,7 +200,8 @@ clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGen
OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
- std::pair<FFTKernelGenKeyParams, cl_context> Params = std::make_pair(fftParam, ProgramContext);
+ ClPair clPair = std::make_pair(planContext, device);
+ std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
fftRepoKey key = std::make_pair( gen, Params );
fftRepo_iterator pos = mapFFTs.find( key );
@@ -213,11 +218,12 @@ clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGen
return CLFFT_SUCCESS;
}
-clfftStatus FFTRepo::getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& prog, const cl_context& PlanContext )
+clfftStatus FFTRepo::getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& prog, const cl_device_id &device, const cl_context& planContext )
{
scopedLock sLock( lockRepo, _T( "getclProgram" ) );
- std::pair<FFTKernelGenKeyParams, cl_context> Params = std::make_pair(fftParam, PlanContext);
+ ClPair clPair = std::make_pair(planContext, device);
+ std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
fftRepoKey key = std::make_pair( gen, Params );
fftRepo_iterator pos = mapFFTs.find( key );
@@ -227,9 +233,9 @@ clfftStatus FFTRepo::getclProgram( const clfftGenerators gen, const FFTKernelGen
if (NULL == prog)
return CLFFT_INVALID_PROGRAM;
- cl_context ProgContext;
- clGetProgramInfo(prog, CL_PROGRAM_CONTEXT, sizeof(cl_context), &ProgContext, NULL);
- if (PlanContext!=ProgContext)
+ cl_context progContext;
+ clGetProgramInfo(prog, CL_PROGRAM_CONTEXT, sizeof(cl_context), &progContext, NULL);
+ if (planContext!=progContext)
return CLFFT_INVALID_PROGRAM;
return CLFFT_SUCCESS;
diff --git a/src/library/repo.h b/src/library/repo.h
index f2619e7..9adc349 100644
--- a/src/library/repo.h
+++ b/src/library/repo.h
@@ -51,7 +51,8 @@ class FFTRepo
// has created
//typedef std::pair< clfftGenerators, FFTKernelGenKeyParams > fftRepoKey;
- typedef std::pair< clfftGenerators, std::pair<FFTKernelGenKeyParams, cl_context> > fftRepoKey;
+ typedef std::pair< cl_context, cl_device_id > ClPair;
+ typedef std::pair< clfftGenerators, std::pair<FFTKernelGenKeyParams, ClPair> > fftRepoKey;
typedef std::map< fftRepoKey, fftRepoValue > fftRepoType;
typedef fftRepoType::iterator fftRepo_iterator;
@@ -139,15 +140,14 @@ public:
clfftStatus releaseResources( );
- clfftStatus setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams&, const std::string& kernel, const cl_context& context);
- clfftStatus getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams&, std::string& kernel, const cl_context& context );
+ clfftStatus setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams&, const std::string& kernel, const cl_device_id &device, const cl_context& planContext );
+ clfftStatus getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams&, std::string& kernel, const cl_device_id &device, const cl_context& planContext );
- clfftStatus setProgramEntryPoints( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam,
- const char * kernel_fwd, const char * kernel_back, const cl_context& context );
- clfftStatus getProgramEntryPoint( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, clfftDirection dir, std::string& kernel , const cl_context& context);
+ clfftStatus setProgramEntryPoints( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const char * kernel_fwd, const char * kernel_back, const cl_device_id &device, const cl_context& planContext );
+ clfftStatus getProgramEntryPoint( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, clfftDirection dir, std::string& kernel , const cl_device_id &device, const cl_context& planContext );
- clfftStatus setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& kernel );
- clfftStatus getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& kernel, const cl_context& PlanContext );
+ clfftStatus setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& prog, const cl_device_id &device, const cl_context& planContext );
+ clfftStatus getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& prog, const cl_device_id &device, const cl_context& planContext );
clfftStatus setclKernel ( cl_program prog, clfftDirection dir, const cl_kernel& kernel );
clfftStatus getclKernel ( cl_program prog, clfftDirection dir, cl_kernel& kernel );
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index 56a50dd..8ab7efa 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -89,6 +89,7 @@ clfftStatus clfftEnqueueTransform(
fftPlan->intBuffer = clCreateBuffer( fftPlan->context, CL_MEM_READ_WRITE,
fftPlan->tmpBufSize, 0, &status);
OPENCL_V( status, _T("Creating the intermediate buffer for large1D Failed") );
+ fftPlan->libCreatedIntBuffer = true;
#if defined(DEBUGGING)
std::cout << "One intermediate buffer is created" << std::endl;
@@ -155,8 +156,6 @@ clfftStatus clfftEnqueueTransform(
_T("clfftEnqueueTransform large1D RC copy failed"));
clReleaseEvent(copyInEvents);
- return CLFFT_SUCCESS;
-
}
else if( fftPlan->outputLayout == CLFFT_REAL )
{
@@ -184,8 +183,6 @@ clfftStatus clfftEnqueueTransform(
_T("clfftEnqueueTransform large1D second column failed"));
clReleaseEvent(colOutEvents);
-
- return CLFFT_SUCCESS;
}
else
{
@@ -207,7 +204,7 @@ clfftStatus clfftEnqueueTransform(
//First time usage, we can initialize tmp buffer
OPENCL_V(clEnqueueWriteBuffer( *commQueues,
localIntBuffer,
- 1, // blocking write
+ CL_TRUE, // blocking write
0,
buffSizeBytes_complex,
&temp[0],
@@ -232,6 +229,15 @@ clfftStatus clfftEnqueueTransform(
else
mybuffers = clOutputBuffers;
+#if defined(DEBUGGING)
+ // For debugging interleave data only,
+ // read the input buffer back into memory.
+ clFinish(*commQueues);
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+ NULL, NULL ),
+ _T("Reading the result buffer failed") );
+#endif
+
//First Row
//tmp->output
cl_event rowXOutEvents = NULL;
@@ -240,6 +246,16 @@ clfftStatus clfftEnqueueTransform(
_T("clfftEnqueueTransform for large1D rowX failed"));
clReleaseEvent(transTXOutEvents);
+
+#if defined(DEBUGGING)
+ // For debugging interleave data only,
+ // read the input buffer back into memory.
+ clFinish(*commQueues);
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, *mybuffers, CL_TRUE, 0, 536870912, &temp[ 0 ], 0,
+ NULL, NULL ),
+ _T("Reading the result buffer failed") );
+#endif
+
//Second Transpose
// output->tmp
cl_event transTYOutEvents = NULL;
@@ -248,6 +264,16 @@ clfftStatus clfftEnqueueTransform(
_T("clfftEnqueueTransform for large1D transTY failed"));
clReleaseEvent(rowXOutEvents);
+
+#if defined(DEBUGGING)
+ // For debugging interleave data only,
+ // read the input buffer back into memory.
+ clFinish(*commQueues);
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+ NULL, NULL ),
+ _T("Reading the result buffer failed") );
+#endif
+
//Second Row
//tmp->tmp, inplace
cl_event rowYOutEvents = NULL;
@@ -256,6 +282,15 @@ clfftStatus clfftEnqueueTransform(
_T("clfftEnqueueTransform for large1D rowY failed"));
clReleaseEvent(transTYOutEvents);
+#if defined(DEBUGGING)
+ // For debugging interleave data only,
+ // read the input buffer back into memory.
+ clFinish(*commQueues);
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+ NULL, NULL ),
+ _T("Reading the result buffer failed") );
+#endif
+
//Third Transpose
// tmp->output
OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1,
@@ -263,117 +298,185 @@ clfftStatus clfftEnqueueTransform(
_T("clfftEnqueueTransform for large1D transTZ failed"));
clReleaseEvent(rowYOutEvents);
- if( fftRepo.pStatTimer )
+ }
+ else
+ {
+ if (fftPlan->large1D == 0)
{
- fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ) );
- }
+ if(fftPlan->planCopy)
+ {
+ // Transpose OUTOFPLACE
+ cl_event transTXOutEvents = NULL;
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planTX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+ waitEvents, &transTXOutEvents, clInputBuffers, &localIntBuffer, NULL ),
+ _T("clfftEnqueueTransform for large1D transTX failed"));
- return CLFFT_SUCCESS;
- }
+#if defined(DEBUGGING)
+ // For debugging interleave data only,
+ // read the input buffer back into memory.
+ clFinish(*commQueues);
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+ NULL, NULL ),
+ _T("Reading the result buffer failed") );
+#endif
- cl_event colOutEvents = NULL;
- if (fftPlan->large1D == 0)
- {
- // First pass
- // column with twiddle first, OUTOFPLACE, + transpose
- OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
- waitEvents, &colOutEvents, clInputBuffers, &localIntBuffer, localIntBuffer),
- _T("clfftEnqueueTransform large1D col pass failed"));
+ // FFT INPLACE
+ cl_event rowXOutEvents = NULL;
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, 1,
+ &transTXOutEvents, &rowXOutEvents, &localIntBuffer, NULL, NULL),
+ _T("clfftEnqueueTransform large1D first row pass failed"));
+ clReleaseEvent(transTXOutEvents);
#if defined(DEBUGGING)
- // debug purpose, interleave input <-> interleave output
- // read the intermediate buffer and print part of it.
- OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
- &colOutEvents, NULL ),
- _T("Reading the result buffer failed") );
- {
- FFTPlan* fftPlanX = NULL;
- lockRAII* planLockX = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planX, fftPlanX, planLockX ), _T( "fftRepo.getPlan failed" ) );
-
- size_t rows = fftPlanX->length[0];
- size_t cols = fftPlanX->batchsize;
- BUG_CHECK (rows * cols <= temp.size())
- size_t print_cols = std::min<size_t> (4, cols);
- size_t print_rows = std::min<size_t> (4, rows);
- //std::cout << std::endl << "Intermediate buffer:" << std::endl;
- //for (size_t jrow = 0; jrow < print_rows; ++jrow) {
- // for (size_t icol = 0; icol < print_cols; ++icol) {
- // size_t index = jrow *cols + icol;
- // std::complex<float> data = temp[index];
- // std::cout << data;
- // }
- // std::cout << std::endl;
- //}
- }
+ // For debugging interleave data only,
+ // read the input buffer back into memory.
+ clFinish(*commQueues);
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+ NULL, NULL ),
+ _T("Reading the result buffer failed") );
#endif
- //another column FFT output, OUTOFPLACE
- if (fftPlan->placeness == CLFFT_INPLACE)
- {
- OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
- outEvents, &localIntBuffer, clInputBuffers, localIntBuffer ),
- _T("clfftEnqueueTransform large1D second column failed"));
+ // FFT INPLACE
+ cl_event colYOutEvents = NULL;
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowXOutEvents,
+ &colYOutEvents, &localIntBuffer, NULL, NULL ),
+ _T("clfftEnqueueTransform large1D second column failed"));
+ clReleaseEvent(rowXOutEvents);
+
+#if defined(DEBUGGING)
+ // For debugging interleave data only,
+ // read the input buffer back into memory.
+ clFinish(*commQueues);
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+ NULL, NULL ),
+ _T("Reading the result buffer failed") );
+#endif
+
+ cl_mem *mybuffers;
+ if (fftPlan->placeness==CLFFT_INPLACE)
+ mybuffers = clInputBuffers;
+ else
+ mybuffers = clOutputBuffers;
+
+ // Copy kernel
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planCopy, dir, numQueuesAndEvents, commQueues, 1, &colYOutEvents,
+ outEvents, &localIntBuffer, mybuffers, NULL ),
+ _T("clfftEnqueueTransform large1D copy failed"));
+ clReleaseEvent(colYOutEvents);
+ }
+ else
+ {
+ cl_event colOutEvents = NULL;
+ // First pass
+ // column with twiddle first, OUTOFPLACE, + transpose
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+ waitEvents, &colOutEvents, clInputBuffers, &localIntBuffer, localIntBuffer),
+ _T("clfftEnqueueTransform large1D col pass failed"));
#if defined(DEBUGGING)
- // For debugging interleave data only,
- // read the input buffer back into memory.
- OPENCL_V( clEnqueueReadBuffer( *commQueues, clInputBuffers[0], CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
- outEvents, NULL ),
- _T("Reading the result buffer failed") );
+ // debug purpose, interleave input <-> interleave output
+ // read the intermediate buffer and print part of it.
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
+ &colOutEvents, NULL ),
+ _T("Reading the result buffer failed") );
#endif
- }
- else
- {
+ if(fftPlan->planTZ)
+ {
+ cl_event rowYOutEvents = NULL;
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
+ &rowYOutEvents, &localIntBuffer, NULL, NULL ),
+ _T("clfftEnqueueTransform large1D second row failed"));
+
+ if (fftPlan->placeness == CLFFT_INPLACE)
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1, &rowYOutEvents,
+ outEvents, &localIntBuffer, clInputBuffers, NULL ),
+ _T("clfftEnqueueTransform large1D trans3 failed"));
+ }
+ else
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1, &rowYOutEvents,
+ outEvents, &localIntBuffer, clOutputBuffers, NULL ),
+ _T("clfftEnqueueTransform large1D trans3 failed"));
+ }
+
+ clReleaseEvent(rowYOutEvents);
+
+ }
+ else
+ {
+ //another column FFT output, OUTOFPLACE
+ if (fftPlan->placeness == CLFFT_INPLACE)
+ {
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
+ outEvents, &localIntBuffer, clInputBuffers, localIntBuffer ),
+ _T("clfftEnqueueTransform large1D second column failed"));
+
#if defined(DEBUGGING)
- // debug purpose, interleave input <-> interleave output
- OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
- &colOutEvents, NULL ),
- _T("Reading the result buffer failed") );
+ // For debugging interleave data only,
+ // read the input buffer back into memory.
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, clInputBuffers[0], CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
+ outEvents, NULL ),
+ _T("Reading the result buffer failed") );
#endif
- OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
- outEvents, &localIntBuffer, clOutputBuffers, localIntBuffer ),
- _T("clfftEnqueueTransform large1D second column failed"));
+ }
+ else
+ {
+#if defined(DEBUGGING)
+ // debug purpose, interleave input <-> interleave output
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
+ &colOutEvents, NULL ),
+ _T("Reading the result buffer failed") );
+#endif
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
+ outEvents, &localIntBuffer, clOutputBuffers, localIntBuffer ),
+ _T("clfftEnqueueTransform large1D second column failed"));
#if defined(DEBUGGING)
- // For debugging interleave data only, read back the output buffer
- //
- OPENCL_V( clEnqueueReadBuffer( *commQueues, clOutputBuffers[0], CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
- outEvents, NULL ),
- _T("Reading the result buffer failed") );
+ // For debugging interleave data only, read back the output buffer
+ //
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, clOutputBuffers[0], CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
+ outEvents, NULL ),
+ _T("Reading the result buffer failed") );
#endif
+ }
+ }
+
+ clReleaseEvent(colOutEvents);
+ }
}
- }
- else
- {
- // second pass for huge 1D
- // column with twiddle first, OUTOFPLACE, + transpose
- OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
- waitEvents, &colOutEvents, &localIntBuffer, clOutputBuffers, localIntBuffer),
- _T("clfftEnqueueTransform Huge1D col pass failed"));
+ else
+ {
+ cl_event colOutEvents = NULL;
+
+ // second pass for huge 1D
+ // column with twiddle first, OUTOFPLACE, + transpose
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+ waitEvents, &colOutEvents, &localIntBuffer, clOutputBuffers, localIntBuffer),
+ _T("clfftEnqueueTransform Huge1D col pass failed"));
#if defined(DEBUGGING)
- // debug purpose, interleave input <-> interleave output
- OPENCL_V( clEnqueueReadBuffer( *commQueues, clOutputBuffers[0], CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
- &colOutEvents, NULL ),
- _T("Reading the result buffer failed") );
+ // debug purpose, interleave input <-> interleave output
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, clOutputBuffers[0], CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
+ &colOutEvents, NULL ),
+ _T("Reading the result buffer failed") );
#endif
- OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
- outEvents, clOutputBuffers, clOutputBuffers, localIntBuffer ),
- _T("clfftEnqueueTransform large1D second column failed"));
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
+ outEvents, clOutputBuffers, clOutputBuffers, localIntBuffer ),
+ _T("clfftEnqueueTransform large1D second column failed"));
+ clReleaseEvent(colOutEvents);
+ }
}
+ }
- clReleaseEvent(colOutEvents);
+ if( fftRepo.pStatTimer )
+ {
+ fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ) );
+ }
- if( fftRepo.pStatTimer )
- {
- fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ) );
- }
+ return CLFFT_SUCCESS;
- return CLFFT_SUCCESS;
- }
- break;
}
case CLFFT_2D:
{
@@ -388,7 +491,8 @@ clfftStatus clfftEnqueueTransform(
//size_t buffSizeBytes=sizeof( std::complex< float > )*buffersize;
//std::vector< std::complex< float > > output2( buffersize );
size_t buffSizeBytes=sizeof( float) * buffersize;
- std::vector<float> output2(buffersize*2);
+ //std::vector<float> output2(buffersize*2);
+ float *output2 = new float[buffersize*2];
#endif
#if defined(DEBUGGING)
OPENCL_V( clEnqueueReadBuffer( *commQueues, clInputBuffers[0], CL_TRUE, 0, buffSizeBytes, &output2[ 0 ], 0,
@@ -507,134 +611,132 @@ clfftStatus clfftEnqueueTransform(
}
}
-
- if( fftRepo.pStatTimer )
- {
- fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ) );
- }
-
- return CLFFT_SUCCESS;
- }
-
- if ( (fftPlan->large2D || fftPlan->length.size()>2) &&
- (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
- {
- if (fftPlan->placeness==CLFFT_INPLACE)
- {
- //deal with row first
- OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
- waitEvents, &rowOutEvents, clInputBuffers, NULL, localIntBuffer ),
- _T("clfftEnqueueTransform for row failed"));
-
- //deal with column
- OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
- outEvents, clInputBuffers, NULL, localIntBuffer ),
- _T("clfftEnqueueTransform for column failed"));
- }
- else
- {
- //deal with row first
- OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
- waitEvents, &rowOutEvents, clInputBuffers, clOutputBuffers, localIntBuffer ),
- _T("clfftEnqueueTransform for row failed"));
-
- //deal with column
- OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
- outEvents, clOutputBuffers, NULL, localIntBuffer ),
- _T("clfftEnqueueTransform for column failed"));
-
- }
}
else
{
- if(fftPlan->inputLayout == CLFFT_REAL)
+
+ if ( (fftPlan->large2D || fftPlan->length.size()>2) &&
+ (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
{
if (fftPlan->placeness==CLFFT_INPLACE)
{
- // deal with row
- OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_FORWARD, numQueuesAndEvents, commQueues, numWaitEvents,
+ //deal with row first
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
waitEvents, &rowOutEvents, clInputBuffers, NULL, localIntBuffer ),
_T("clfftEnqueueTransform for row failed"));
- // deal with column
- OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+ //deal with column
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
outEvents, clInputBuffers, NULL, localIntBuffer ),
_T("clfftEnqueueTransform for column failed"));
}
else
{
- // deal with row
- OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_FORWARD, numQueuesAndEvents, commQueues, numWaitEvents,
+ //deal with row first
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
waitEvents, &rowOutEvents, clInputBuffers, clOutputBuffers, localIntBuffer ),
_T("clfftEnqueueTransform for row failed"));
- // deal with column
- OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+ //deal with column
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
outEvents, clOutputBuffers, NULL, localIntBuffer ),
_T("clfftEnqueueTransform for column failed"));
+
}
}
- else if(fftPlan->outputLayout == CLFFT_REAL)
+ else
{
- cl_mem *out_local, *int_local, *out_y;
-
- if(fftPlan->length.size() > 2)
+ if(fftPlan->inputLayout == CLFFT_REAL)
{
- out_local = clOutputBuffers;
- int_local = NULL;
- out_y = clInputBuffers;
+ if (fftPlan->placeness==CLFFT_INPLACE)
+ {
+ // deal with row
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_FORWARD, numQueuesAndEvents, commQueues, numWaitEvents,
+ waitEvents, &rowOutEvents, clInputBuffers, NULL, localIntBuffer ),
+ _T("clfftEnqueueTransform for row failed"));
+
+ // deal with column
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+ outEvents, clInputBuffers, NULL, localIntBuffer ),
+ _T("clfftEnqueueTransform for column failed"));
+ }
+ else
+ {
+ // deal with row
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_FORWARD, numQueuesAndEvents, commQueues, numWaitEvents,
+ waitEvents, &rowOutEvents, clInputBuffers, clOutputBuffers, localIntBuffer ),
+ _T("clfftEnqueueTransform for row failed"));
+
+ // deal with column
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+ outEvents, clOutputBuffers, NULL, localIntBuffer ),
+ _T("clfftEnqueueTransform for column failed"));
+ }
}
- else
+ else if(fftPlan->outputLayout == CLFFT_REAL)
{
- out_local = (fftPlan->placeness==CLFFT_INPLACE) ? clInputBuffers : clOutputBuffers;
- int_local = fftPlan->tmpBufSizeC2R ? &(fftPlan->intBufferC2R) : &localIntBuffer;
- out_y = int_local;
- }
+ cl_mem *out_local, *int_local, *out_y;
+
+ if(fftPlan->length.size() > 2)
+ {
+ out_local = clOutputBuffers;
+ int_local = NULL;
+ out_y = clInputBuffers;
+ }
+ else
+ {
+ out_local = (fftPlan->placeness==CLFFT_INPLACE) ? clInputBuffers : clOutputBuffers;
+ int_local = fftPlan->tmpBufSizeC2R ? &(fftPlan->intBufferC2R) : &localIntBuffer;
+ out_y = int_local;
+ }
- // deal with column
- OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, numWaitEvents,
- waitEvents, &rowOutEvents, clInputBuffers, int_local, localIntBuffer ),
- _T("clfftEnqueueTransform for row failed"));
-
- // deal with row
- OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
- outEvents, out_y, out_local, localIntBuffer ),
- _T("clfftEnqueueTransform for column failed"));
-
- }
- else
- {
- //deal with row first
- OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
- waitEvents, &rowOutEvents, clInputBuffers, &localIntBuffer, localIntBuffer ),
- _T("clfftEnqueueTransform for row failed"));
-
+ // deal with column
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, numWaitEvents,
+ waitEvents, &rowOutEvents, clInputBuffers, int_local, localIntBuffer ),
+ _T("clfftEnqueueTransform for row failed"));
- if (fftPlan->placeness==CLFFT_INPLACE)
- {
- //deal with column
- OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
- outEvents, &localIntBuffer, clInputBuffers, localIntBuffer ),
+ // deal with row
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+ outEvents, out_y, out_local, localIntBuffer ),
_T("clfftEnqueueTransform for column failed"));
+
}
else
{
- //deal with column
- OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
- outEvents, &localIntBuffer, clOutputBuffers, localIntBuffer ),
- _T("clfftEnqueueTransform for column failed"));
+ //deal with row first
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+ waitEvents, &rowOutEvents, clInputBuffers, &localIntBuffer, localIntBuffer ),
+ _T("clfftEnqueueTransform for row failed"));
- #if defined(DEBUGGING)
- OPENCL_V( clEnqueueReadBuffer( *commQueues, clOutputBuffers[0], CL_TRUE, 0, buffSizeBytes, &output2[ 0 ], 1,
- outEvents, NULL ),
- _T("Reading the result buffer failed") );
- #endif
+
+ if (fftPlan->placeness==CLFFT_INPLACE)
+ {
+ //deal with column
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+ outEvents, &localIntBuffer, clInputBuffers, localIntBuffer ),
+ _T("clfftEnqueueTransform for column failed"));
+ }
+ else
+ {
+ //deal with column
+ OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+ outEvents, &localIntBuffer, clOutputBuffers, localIntBuffer ),
+ _T("clfftEnqueueTransform for column failed"));
+
+ #if defined(DEBUGGING)
+ OPENCL_V( clEnqueueReadBuffer( *commQueues, clOutputBuffers[0], CL_TRUE, 0, buffSizeBytes, &output2[ 0 ], 1,
+ outEvents, NULL ),
+ _T("Reading the result buffer failed") );
+ #endif
+ }
}
}
+
+ clReleaseEvent(rowOutEvents);
+
}
- clReleaseEvent(rowOutEvents);
if( fftRepo.pStatTimer )
{
@@ -1162,7 +1264,7 @@ clfftStatus clfftEnqueueTransform(
cl_program prog;
cl_kernel kern;
- OPENCL_V( fftRepo.getclProgram( fftPlan->gen, fftParams, prog, fftPlan->context ), _T( "fftRepo.getclProgram failed" ) );
+ OPENCL_V( fftRepo.getclProgram( fftPlan->gen, fftParams, prog, fftPlan->bakeDevice, fftPlan->context ), _T( "fftRepo.getclProgram failed" ) );
OPENCL_V( fftRepo.getclKernel( prog, dir, kern ), _T( "fftRepo.getclKernels failed" ) );
@@ -1215,11 +1317,11 @@ clfftStatus clfftEnqueueTransform(
}
BUG_CHECK (gWorkSize.size() == lWorkSize.size());
- size_t *lwSize = NULL;
- if(fftPlan->gen != Copy) lwSize = &lWorkSize[ 0 ];
+ //size_t *lwSize = NULL;
+ //if(fftPlan->gen != Copy) lwSize = &lWorkSize[ 0 ];
status = clEnqueueNDRangeKernel( *commQueues, kern, static_cast< cl_uint >( gWorkSize.size( ) ),
- NULL, &gWorkSize[ 0 ], lwSize, numWaitEvents, waitEvents, outEvents );
+ NULL, &gWorkSize[ 0 ], &lWorkSize[ 0 ], numWaitEvents, waitEvents, outEvents );
OPENCL_V( status, _T( "clEnqueueNDRangeKernel failed" ) );
if( fftRepo.pStatTimer )
diff --git a/src/tests/accuracy_test_pow2.cpp b/src/tests/accuracy_test_pow2.cpp
index 56f6bfd..e395e04 100644
--- a/src/tests/accuracy_test_pow2.cpp
+++ b/src/tests/accuracy_test_pow2.cpp
@@ -1291,6 +1291,91 @@ TEST_F(accuracy_test_pow2_double, large_1D_forward_in_place_complex_planar_to_co
catch( const std::exception& err ) { handle_exception(err); }
}
+// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ //
+// ^^^^^^^^^^^^^^^^^^^^^^^ huge 1D ^^^^^^^^^^^^^^^^^^^^^^^ //
+// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ //
+
+// *****************************************************
+// *****************************************************
+
+//#define CLFFT_TEST_HUGE
+#ifdef CLFFT_TEST_HUGE
+
+#define HUGE_TEST_MAKE(test_name, len, bat) \
+template< class T, class cl_T, class fftw_T > \
+void test_name() \
+{ \
+ std::vector<size_t> lengths; \
+ lengths.push_back( len ); \
+ size_t batch = bat; \
+\
+ std::vector<size_t> input_strides; \
+ std::vector<size_t> output_strides; \
+ size_t input_distance = 0; \
+ size_t output_distance = 0; \
+ layout::buffer_layout_t in_layout = layout::complex_planar; \
+ layout::buffer_layout_t out_layout = layout::complex_planar; \
+ placeness::placeness_t placeness = placeness::in_place; \
+ direction::direction_t direction = direction::forward; \
+\
+ data_pattern pattern = sawtooth; \
+ complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness ); \
+}
+
+#define SP_HUGE_TEST(test_name, len, bat) \
+\
+ HUGE_TEST_MAKE(test_name, len, bat) \
+\
+ TEST_F(accuracy_test_pow2_single, test_name) \
+ { \
+ try { test_name< float, cl_float, fftwf_complex >(); } \
+ catch( const std::exception& err ) { handle_exception(err); } \
+ }
+
+#define DP_HUGE_TEST(test_name, len, bat) \
+\
+ HUGE_TEST_MAKE(test_name, len, bat) \
+\
+ TEST_F(accuracy_test_pow2_double, test_name) \
+ { \
+ try { test_name< double, cl_double, fftw_complex >(); } \
+ catch( const std::exception& err ) { handle_exception(err); } \
+ }
+
+SP_HUGE_TEST( huge_sp_test_1, 1048576, 11 )
+SP_HUGE_TEST( huge_sp_test_2, 1048576*2, 7 )
+SP_HUGE_TEST( huge_sp_test_3, 1048576*4, 3 )
+SP_HUGE_TEST( huge_sp_test_4, 1048576*8, 5 )
+SP_HUGE_TEST( huge_sp_test_5, 1048576*16, 3 )
+SP_HUGE_TEST( huge_sp_test_6, 1048576*32, 2 )
+SP_HUGE_TEST( huge_sp_test_7, 1048576*64, 1 )
+
+DP_HUGE_TEST( huge_dp_test_1, 524288, 11 )
+DP_HUGE_TEST( huge_dp_test_2, 524288*2, 7 )
+DP_HUGE_TEST( huge_dp_test_3, 524288*4, 3 )
+DP_HUGE_TEST( huge_dp_test_4, 524288*8, 5 )
+DP_HUGE_TEST( huge_dp_test_5, 524288*16, 3 )
+DP_HUGE_TEST( huge_dp_test_6, 524288*32, 2 )
+DP_HUGE_TEST( huge_dp_test_7, 524288*64, 1 )
+
+SP_HUGE_TEST( large_sp_test_1, 8192, 11 )
+SP_HUGE_TEST( large_sp_test_2, 8192*2, 7 )
+SP_HUGE_TEST( large_sp_test_3, 8192*4, 3 )
+SP_HUGE_TEST( large_sp_test_4, 8192*8, 5 )
+SP_HUGE_TEST( large_sp_test_5, 8192*16, 3 )
+SP_HUGE_TEST( large_sp_test_6, 8192*32, 21 )
+SP_HUGE_TEST( large_sp_test_7, 8192*64, 17 )
+
+DP_HUGE_TEST( large_dp_test_1, 4096, 11 )
+DP_HUGE_TEST( large_dp_test_2, 4096*2, 7 )
+DP_HUGE_TEST( large_dp_test_3, 4096*4, 3 )
+DP_HUGE_TEST( large_dp_test_4, 4096*8, 5 )
+DP_HUGE_TEST( large_dp_test_5, 4096*16, 3 )
+DP_HUGE_TEST( large_dp_test_6, 4096*32, 21 )
+DP_HUGE_TEST( large_dp_test_7, 4096*64, 17 )
+
+#endif
+
// *****************************************************
// *****************************************************
template< class T, class cl_T, class fftw_T >
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list