[clfft] 09/64: merging internal fork with performance optimizations for large sizes and other fixes

Wed May 20 07:33:32 UTC 2015

This is an automated email from the git hooks/post-receive script.

kieffer-guest pushed a commit to branch develop
in repository clfft.

commit 79dc76bdf2c9611981973c9a8857009f2161af3c
Author: bragadeesh <bragadeesh.natarajan at amd>
Date:   Sun Feb 22 19:45:06 2015 -0600

    merging internal fork with performance optimizations for large sizes and other fixes
---
 src/client/client.cpp                              |   15 +
 src/client/client.h                                |   41 +
 src/client/openCL.misc.cpp                         |  926 +++++-----
 src/library/CMakeLists.txt                         |    6 +-
 src/library/accessors.cpp                          |   63 +-
 src/library/generator.copy.cpp                     |  338 ++--
 src/library/generator.h                            |    9 +-
 src/library/generator.stockham.cpp                 |  840 +++++----
 src/library/generator.stockham.h                   |  278 ++-
 src/library/generator.transpose.gcn.cpp            |  660 +++++++
 ...rator.transpose.h => generator.transpose.gcn.h} |    0
 ....transpose.cpp => generator.transpose.vliw.cpp} |   27 +-
 ...ator.transpose.h => generator.transpose.vliw.h} |    0
 src/library/plan.cpp                               | 1844 ++++++++++----------
 src/library/plan.h                                 |  172 +-
 src/library/private.h                              |   22 +-
 src/library/repo.cpp                               |   36 +-
 src/library/repo.h                                 |   16 +-
 src/library/transform.cpp                          |  478 +++--
 src/tests/accuracy_test_pow2.cpp                   |   85 +
 20 files changed, 3365 insertions(+), 2491 deletions(-)

diff --git a/src/client/client.cpp b/src/client/client.cpp
index be9698e..c495854 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -515,6 +515,9 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 	//
 	cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &output_cl_mem_buffers[ 0 ];
 
+	Timer tr;
+	tr.Start();
+
 	for( cl_uint i = 0; i < profile_count; ++i )
 	{
 		if( timer ) timer->Start( clFFTID );
@@ -526,6 +529,18 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
 		if( timer ) timer->Stop( clFFTID );
 	}
 	OPENCL_V_THROW( clFinish( queue ), "clFinish failed" );
+	if(clMedBuffer) clReleaseMemObject(clMedBuffer);
+
+	double wtime = tr.Sample()/((double)profile_count);
+	size_t totalLen = 1;
+	for(int i=0; i<dim; i++) totalLen *= lengths[i];
+	double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	if(profile_count > 1)
+	{
+		tout << "\nExecution wall time: " << 1000.0*wtime << " ms" << std::endl;
+		tout << "Execution gflops: " << ((double)batch_size * opsconst)/(1000000000.0*wtime) << std::endl;
+	}
 
 	if( timer && (command_queue_flags & CL_QUEUE_PROFILING_ENABLE) )
 	{
diff --git a/src/client/client.h b/src/client/client.h
index ad22f36..2c70aba 100644
--- a/src/client/client.h
+++ b/src/client/client.h
@@ -23,4 +23,45 @@
 //	#define BOOST_PROGRAM_OPTIONS_DYN_LINK
 #include <boost/program_options.hpp>
 
+#ifdef WIN32
+
+struct Timer
+{
+    LARGE_INTEGER start, stop, freq;
+
+public:
+    Timer() { QueryPerformanceFrequency( &freq ); }
+
+    void Start() { QueryPerformanceCounter(&start); }
+    double Sample()
+    {
+        QueryPerformanceCounter  ( &stop );
+        double time = (double)(stop.QuadPart-start.QuadPart) / (double)(freq.QuadPart);
+        return time;
+    }
+};
+
+#else
+
+#include <time.h>
+#include <math.h>
+
+struct Timer
+{
+    struct timespec start, end;
+
+public:
+    Timer() { }
+
+    void Start() { clock_gettime(CLOCK_MONOTONIC, &start); }
+    double Sample()
+    {
+        clock_gettime(CLOCK_MONOTONIC, &end);
+        double time = 1000000000L * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec;
+        return time * 1E-9;
+    }
+};
+
+#endif
+
 #endif
diff --git a/src/client/openCL.misc.cpp b/src/client/openCL.misc.cpp
index 6bbdec3..71e4650 100644
--- a/src/client/openCL.misc.cpp
+++ b/src/client/openCL.misc.cpp
@@ -29,507 +29,509 @@
 
 void prettyPrintPlatformInfo( const cl_platform_id& pId )
 {
-	size_t platformProfileSize	= 0;
-	OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, 0, NULL, &platformProfileSize ),
-		"Getting CL_PLATFORM_PROFILE Platform Info string size ( ::clGetPlatformInfo() )" );
-
-	std::vector< char > szPlatformProfile( platformProfileSize );
-	OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, platformProfileSize, &szPlatformProfile[ 0 ], NULL),
-		"Getting CL_PLATFORM_PROFILE Platform Info string ( ::clGetPlatformInfo() )" );
-
-	size_t platformVersionSize	= 0;
-	OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, 0, NULL, &platformVersionSize ),
-		"Getting CL_PLATFORM_VERSION Platform Info string size ( ::clGetPlatformInfo() )" );
-
-	std::vector< char > szPlatformVersion( platformVersionSize );
-	OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, platformVersionSize, &szPlatformVersion[ 0 ], NULL),
-		"Getting CL_PLATFORM_VERSION Platform Info string ( ::clGetPlatformInfo() )" );
-
-	size_t platformNameSize	= 0;
-	OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, 0, NULL, &platformNameSize ),
-		"Getting CL_PLATFORM_NAME Platform Info string size ( ::clGetPlatformInfo() )" );
-
-	std::vector< char > szPlatformName( platformNameSize );
-	OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, platformNameSize, &szPlatformName[ 0 ], NULL),
-		"Getting CL_PLATFORM_NAME Platform Info string ( ::clGetPlatformInfo() )" );
-
-	size_t vendorStringSize	= 0;
-	OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, 0, NULL, &vendorStringSize ),
-		"Getting CL_PLATFORM_VENDOR Platform Info string size ( ::clGetPlatformInfo() )" );
-
-	std::vector< char > szPlatformVendor( vendorStringSize );
-	OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, vendorStringSize, &szPlatformVendor[ 0 ], NULL),
-		"Getting CL_PLATFORM_VENDOR Platform Info string ( ::clGetPlatformInfo() )" );
-
-	size_t platformExtensionsSize	= 0;
-	OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, 0, NULL, &platformExtensionsSize ),
-		"Getting CL_PLATFORM_EXTENSIONS Platform Info string size ( ::clGetPlatformInfo() )" );
-
-	std::vector< char > szPlatformExtensions( platformExtensionsSize );
-	OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, platformExtensionsSize, &szPlatformExtensions[ 0 ], NULL),
-		"Getting CL_PLATFORM_EXTENSIONS Platform Info string ( ::clGetPlatformInfo() )" );
-
-	const int indent = countOf( "    CL_PLATFORM_EXTENSIONS: " );
-	std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_PROFILE: " << &szPlatformProfile[ 0 ] << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_VERSION: " << &szPlatformVersion[ 0 ] << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_NAME: " << &szPlatformName[ 0 ] << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_VENDOR: " << &szPlatformVendor[ 0 ] << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_EXTENSIONS: " << &szPlatformExtensions[ 0 ] << std::endl;
-	std::cout << std::right << std::endl;
+    size_t platformProfileSize	= 0;
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, 0, NULL, &platformProfileSize ),
+        "Getting CL_PLATFORM_PROFILE Platform Info string size ( ::clGetPlatformInfo() )" );
+
+    std::vector< char > szPlatformProfile( platformProfileSize );
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, platformProfileSize, &szPlatformProfile[ 0 ], NULL),
+        "Getting CL_PLATFORM_PROFILE Platform Info string ( ::clGetPlatformInfo() )" );
+
+    size_t platformVersionSize	= 0;
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, 0, NULL, &platformVersionSize ),
+        "Getting CL_PLATFORM_VERSION Platform Info string size ( ::clGetPlatformInfo() )" );
+
+    std::vector< char > szPlatformVersion( platformVersionSize );
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, platformVersionSize, &szPlatformVersion[ 0 ], NULL),
+        "Getting CL_PLATFORM_VERSION Platform Info string ( ::clGetPlatformInfo() )" );
+
+    size_t platformNameSize	= 0;
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, 0, NULL, &platformNameSize ),
+        "Getting CL_PLATFORM_NAME Platform Info string size ( ::clGetPlatformInfo() )" );
+
+    std::vector< char > szPlatformName( platformNameSize );
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, platformNameSize, &szPlatformName[ 0 ], NULL),
+        "Getting CL_PLATFORM_NAME Platform Info string ( ::clGetPlatformInfo() )" );
+
+    size_t vendorStringSize	= 0;
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, 0, NULL, &vendorStringSize ),
+        "Getting CL_PLATFORM_VENDOR Platform Info string size ( ::clGetPlatformInfo() )" );
+
+    std::vector< char > szPlatformVendor( vendorStringSize );
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, vendorStringSize, &szPlatformVendor[ 0 ], NULL),
+        "Getting CL_PLATFORM_VENDOR Platform Info string ( ::clGetPlatformInfo() )" );
+
+    size_t platformExtensionsSize	= 0;
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, 0, NULL, &platformExtensionsSize ),
+        "Getting CL_PLATFORM_EXTENSIONS Platform Info string size ( ::clGetPlatformInfo() )" );
+
+    std::vector< char > szPlatformExtensions( platformExtensionsSize );
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, platformExtensionsSize, &szPlatformExtensions[ 0 ], NULL),
+        "Getting CL_PLATFORM_EXTENSIONS Platform Info string ( ::clGetPlatformInfo() )" );
+
+    const int indent = countOf( "    CL_PLATFORM_EXTENSIONS: " );
+    std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_PROFILE: " << &szPlatformProfile[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_VERSION: " << &szPlatformVersion[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_NAME: " << &szPlatformName[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_VENDOR: " << &szPlatformVendor[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_EXTENSIONS: " << &szPlatformExtensions[ 0 ] << std::endl;
+    std::cout << std::right << std::endl;
 }
 
 void prettyPrintDeviceInfo( const cl_device_id& dId )
 {
-	size_t deviceNameSize	= 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, 0, NULL, &deviceNameSize ),
-		"Getting CL_DEVICE_NAME Platform Info string size ( ::clGetDeviceInfo() )" );
-
-	std::vector< char > szDeviceName( deviceNameSize );
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, deviceNameSize, &szDeviceName[ 0 ], NULL ),
-		"Getting CL_DEVICE_NAME Platform Info string ( ::clGetDeviceInfo() )" );
-
-	size_t deviceVersionSize	= 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
-		"Getting CL_DEVICE_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
-
-	std::vector< char > szDeviceVersion( deviceVersionSize );
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
-		"Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
-
-	size_t driverVersionSize	= 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, 0, NULL, &driverVersionSize ),
-		"Getting CL_DRIVER_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
-
-	std::vector< char > szDriverVersion( driverVersionSize );
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, driverVersionSize, &szDriverVersion[ 0 ], NULL ),
-		"Getting CL_DRIVER_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
-
-	size_t openCLVersionSize	= 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &openCLVersionSize ),
-		"Getting CL_DEVICE_OPENCL_C_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
-
-	std::vector< char > szOpenCLVersion( openCLVersionSize );
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, openCLVersionSize, &szOpenCLVersion[ 0 ], NULL ),
-		"Getting CL_DEVICE_OPENCL_C_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
-
-	cl_device_type devType = CL_DEVICE_TYPE_DEFAULT;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_TYPE, sizeof( cl_device_type ), &devType, NULL ),
-		"Getting CL_DEVICE_TYPE device info ( ::clGetDeviceInfo() )" );
-
-	cl_uint devAddrBits = 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_ADDRESS_BITS, sizeof( cl_uint ), &devAddrBits, NULL ),
-		"Getting CL_DEVICE_ADDRESS_BITS device info ( ::clGetDeviceInfo() )" );
-
-	cl_uint maxClockFreq = 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof( cl_uint ), &maxClockFreq, NULL ),
-		"Getting CL_DEVICE_MAX_CLOCK_FREQUENCY device info ( ::clGetDeviceInfo() )" );
-
-	cl_bool devAvailable = CL_FALSE;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_AVAILABLE, sizeof( cl_bool ), &devAvailable, NULL ),
-		"Getting CL_DEVICE_AVAILABLE device info ( ::clGetDeviceInfo() )" );
-
-	cl_bool devCompAvailable = CL_FALSE;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_COMPILER_AVAILABLE, sizeof( cl_bool ), &devCompAvailable, NULL ),
-		"Getting CL_DEVICE_COMPILER_AVAILABLE device info ( ::clGetDeviceInfo() )" );
-
-	size_t devMaxWorkGroup	= 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( size_t ), &devMaxWorkGroup, NULL ),
-		"Getting CL_DEVICE_MAX_WORK_GROUP_SIZE device info ( ::clGetDeviceInfo() )" );
-
-	cl_uint devMaxWorkItemDim = CL_FALSE;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( cl_uint ), &devMaxWorkItemDim, NULL ),
-		"Getting CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS device info ( ::clGetDeviceInfo() )" );
-
-	std::vector< size_t >	devMaxWorkItemSizes( devMaxWorkItemDim );
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof( size_t )*devMaxWorkItemSizes.size( ), &devMaxWorkItemSizes[0], NULL),
-		"Getting CL_DEVICE_MAX_WORK_ITEM_SIZES device info ( ::clGetDeviceInfo() )" );
-
-	cl_bool deviceHostUnified = 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof( cl_bool ), &deviceHostUnified, NULL ),
-		"Getting CL_DEVICE_HOST_UNIFIED_MEMORY Platform Info string ( ::clGetDeviceInfo() )" );
-
-	cl_ulong devMaxConstantBuffer	= 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( cl_ulong ), &devMaxConstantBuffer, NULL ),
-		"Getting CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE device info ( ::clGetDeviceInfo() )" );
-
-	cl_ulong devLocalMemSize	= 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( cl_ulong ), &devLocalMemSize, NULL ),
-		"Getting CL_DEVICE_LOCAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
-
-	cl_ulong deviceGlobalMemSize = 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( cl_ulong ), &deviceGlobalMemSize, NULL ),
-		"Getting CL_DEVICE_GLOBAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
-
-	cl_ulong deviceMaxMemAllocSize = 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( cl_ulong ), &deviceMaxMemAllocSize, NULL ),
-		"Getting CL_DEVICE_MAX_MEM_ALLOC_SIZE device info ( ::clGetDeviceInfo() )" );
-
-	size_t deviceExtSize	= 0;
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
-		"Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
-
-	std::vector< char > szDeviceExt( deviceExtSize );
-	OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
-		"Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
-
-	const int indent = countOf( "    CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " );
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_NAME: " << &szDeviceName[ 0 ] << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_VERSION: " << &szDeviceVersion[ 0 ] << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DRIVER_VERSION: " << &szDriverVersion[ 0 ] << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_TYPE: "
-		<< (CL_DEVICE_TYPE_DEFAULT     & devType ? "default"     : "")
-		<< (CL_DEVICE_TYPE_CPU         & devType ? "CPU"         : "")
-		<< (CL_DEVICE_TYPE_GPU         & devType ? "GPU"         : "")
-		<< (CL_DEVICE_TYPE_ACCELERATOR & devType ? "Accelerator" : "")
-		<< std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_CLOCK_FREQUENCY: " << maxClockFreq << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_ADDRESS_BITS: " << devAddrBits << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_AVAILABLE: " << ( devAvailable ? "TRUE": "FALSE") << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_COMPILER_AVAILABLE: " << ( devCompAvailable ? "TRUE": "FALSE") << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_OPENCL_C_VERSION: " << &szOpenCLVersion[ 0 ] << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_WORK_GROUP_SIZE: " << devMaxWorkGroup << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " << devMaxWorkItemDim << std::endl;
-	for( cl_uint wis = 0; wis < devMaxWorkItemSizes.size( ); ++wis )
-	{
-		std::stringstream dimString;
-		dimString << "Dimension[ " << wis << " ]  ";
-		std::cout << std::right << std::setw( indent ) << dimString.str( ) << devMaxWorkItemSizes[wis] << std::endl;
-	}
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_HOST_UNIFIED_MEMORY: " << ( deviceHostUnified ? "TRUE": "FALSE") << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: " << devMaxConstantBuffer;
-	std::cout << " ( " << devMaxConstantBuffer / 1024 << " KB )" << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_LOCAL_MEM_SIZE: " << devLocalMemSize;
-	std::cout << " ( " << devLocalMemSize / 1024 << " KB )" << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_GLOBAL_MEM_SIZE: " << deviceGlobalMemSize;
-	std::cout << " ( " << deviceGlobalMemSize / 1048576 << " MB )" << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_MEM_ALLOC_SIZE: " << deviceMaxMemAllocSize;
-	std::cout << " ( " << deviceMaxMemAllocSize / 1048576 << " MB )" << std::endl;
-	std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_EXTENSIONS: " << &szDeviceExt[ 0 ] << std::endl;
-
-	std::cout << std::right << std::endl;
+    size_t deviceNameSize	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, 0, NULL, &deviceNameSize ),
+        "Getting CL_DEVICE_NAME Platform Info string size ( ::clGetDeviceInfo() )" );
+
+    std::vector< char > szDeviceName( deviceNameSize );
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, deviceNameSize, &szDeviceName[ 0 ], NULL ),
+        "Getting CL_DEVICE_NAME Platform Info string ( ::clGetDeviceInfo() )" );
+
+    size_t deviceVersionSize	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
+        "Getting CL_DEVICE_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+    std::vector< char > szDeviceVersion( deviceVersionSize );
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
+        "Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+    size_t driverVersionSize	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, 0, NULL, &driverVersionSize ),
+        "Getting CL_DRIVER_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+    std::vector< char > szDriverVersion( driverVersionSize );
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, driverVersionSize, &szDriverVersion[ 0 ], NULL ),
+        "Getting CL_DRIVER_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+    size_t openCLVersionSize	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &openCLVersionSize ),
+        "Getting CL_DEVICE_OPENCL_C_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+    std::vector< char > szOpenCLVersion( openCLVersionSize );
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, openCLVersionSize, &szOpenCLVersion[ 0 ], NULL ),
+        "Getting CL_DEVICE_OPENCL_C_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+    cl_device_type devType = CL_DEVICE_TYPE_DEFAULT;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_TYPE, sizeof( cl_device_type ), &devType, NULL ),
+        "Getting CL_DEVICE_TYPE device info ( ::clGetDeviceInfo() )" );
+
+    cl_uint devAddrBits = 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_ADDRESS_BITS, sizeof( cl_uint ), &devAddrBits, NULL ),
+        "Getting CL_DEVICE_ADDRESS_BITS device info ( ::clGetDeviceInfo() )" );
+
+    cl_uint maxClockFreq = 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof( cl_uint ), &maxClockFreq, NULL ),
+        "Getting CL_DEVICE_MAX_CLOCK_FREQUENCY device info ( ::clGetDeviceInfo() )" );
+
+    cl_bool devAvailable = CL_FALSE;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_AVAILABLE, sizeof( cl_bool ), &devAvailable, NULL ),
+        "Getting CL_DEVICE_AVAILABLE device info ( ::clGetDeviceInfo() )" );
+
+    cl_bool devCompAvailable = CL_FALSE;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_COMPILER_AVAILABLE, sizeof( cl_bool ), &devCompAvailable, NULL ),
+        "Getting CL_DEVICE_COMPILER_AVAILABLE device info ( ::clGetDeviceInfo() )" );
+
+    size_t devMaxWorkGroup	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( size_t ), &devMaxWorkGroup, NULL ),
+        "Getting CL_DEVICE_MAX_WORK_GROUP_SIZE device info ( ::clGetDeviceInfo() )" );
+
+    cl_uint devMaxWorkItemDim = CL_FALSE;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( cl_uint ), &devMaxWorkItemDim, NULL ),
+        "Getting CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS device info ( ::clGetDeviceInfo() )" );
+
+    std::vector< size_t >	devMaxWorkItemSizes( devMaxWorkItemDim );
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof( size_t )*devMaxWorkItemSizes.size( ), &devMaxWorkItemSizes[0], NULL),
+        "Getting CL_DEVICE_MAX_WORK_ITEM_SIZES device info ( ::clGetDeviceInfo() )" );
+
+    cl_bool deviceHostUnified = 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof( cl_bool ), &deviceHostUnified, NULL ),
+        "Getting CL_DEVICE_HOST_UNIFIED_MEMORY Platform Info string ( ::clGetDeviceInfo() )" );
+
+    cl_ulong devMaxConstantBuffer	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( cl_ulong ), &devMaxConstantBuffer, NULL ),
+        "Getting CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE device info ( ::clGetDeviceInfo() )" );
+
+    cl_ulong devLocalMemSize	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( cl_ulong ), &devLocalMemSize, NULL ),
+        "Getting CL_DEVICE_LOCAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
+
+    cl_ulong deviceGlobalMemSize = 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( cl_ulong ), &deviceGlobalMemSize, NULL ),
+        "Getting CL_DEVICE_GLOBAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
+
+    cl_ulong deviceMaxMemAllocSize = 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( cl_ulong ), &deviceMaxMemAllocSize, NULL ),
+        "Getting CL_DEVICE_MAX_MEM_ALLOC_SIZE device info ( ::clGetDeviceInfo() )" );
+
+    size_t deviceExtSize	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
+        "Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
+
+    std::vector< char > szDeviceExt( deviceExtSize );
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
+        "Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
+
+    const int indent = countOf( "    CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " );
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_NAME: " << &szDeviceName[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_VERSION: " << &szDeviceVersion[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DRIVER_VERSION: " << &szDriverVersion[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_TYPE: "
+        << (CL_DEVICE_TYPE_DEFAULT     & devType ? "default"     : "")
+        << (CL_DEVICE_TYPE_CPU         & devType ? "CPU"         : "")
+        << (CL_DEVICE_TYPE_GPU         & devType ? "GPU"         : "")
+        << (CL_DEVICE_TYPE_ACCELERATOR & devType ? "Accelerator" : "")
+        << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_CLOCK_FREQUENCY: " << maxClockFreq << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_ADDRESS_BITS: " << devAddrBits << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_AVAILABLE: " << ( devAvailable ? "TRUE": "FALSE") << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_COMPILER_AVAILABLE: " << ( devCompAvailable ? "TRUE": "FALSE") << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_OPENCL_C_VERSION: " << &szOpenCLVersion[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_WORK_GROUP_SIZE: " << devMaxWorkGroup << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " << devMaxWorkItemDim << std::endl;
+    for( cl_uint wis = 0; wis < devMaxWorkItemSizes.size( ); ++wis )
+    {
+        std::stringstream dimString;
+        dimString << "Dimension[ " << wis << " ]  ";
+        std::cout << std::right << std::setw( indent ) << dimString.str( ) << devMaxWorkItemSizes[wis] << std::endl;
+    }
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_HOST_UNIFIED_MEMORY: " << ( deviceHostUnified ? "TRUE": "FALSE") << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: " << devMaxConstantBuffer;
+    std::cout << " ( " << devMaxConstantBuffer / 1024 << " KB )" << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_LOCAL_MEM_SIZE: " << devLocalMemSize;
+    std::cout << " ( " << devLocalMemSize / 1024 << " KB )" << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_GLOBAL_MEM_SIZE: " << deviceGlobalMemSize;
+    std::cout << " ( " << deviceGlobalMemSize / 1048576 << " MB )" << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_MEM_ALLOC_SIZE: " << deviceMaxMemAllocSize;
+    std::cout << " ( " << deviceMaxMemAllocSize / 1048576 << " MB )" << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_EXTENSIONS: " << &szDeviceExt[ 0 ] << std::endl;
+
+    std::cout << std::right << std::endl;
 }
 
 //	Verify a failed condition; return true on fail
 inline cl_bool OPENCL_V_FAIL( cl_int res )
 {
-	if( res == CL_SUCCESS )
-		return CL_FALSE;
-	else
-		return CL_TRUE;
+    if( res == CL_SUCCESS )
+        return CL_FALSE;
+    else
+        return CL_TRUE;
 }
 
 std::string prettyPrintclFFTStatus( const cl_int& status )
 {
-	switch( status )
-	{
-		case CLFFT_INVALID_GLOBAL_WORK_SIZE:
-			return "CLFFT_INVALID_GLOBAL_WORK_SIZE";
-		case CLFFT_INVALID_MIP_LEVEL:
-			return "CLFFT_INVALID_MIP_LEVEL";
-		case CLFFT_INVALID_BUFFER_SIZE:
-			return "CLFFT_INVALID_BUFFER_SIZE";
-		case CLFFT_INVALID_GL_OBJECT:
-			return "CLFFT_INVALID_GL_OBJECT";
-		case CLFFT_INVALID_OPERATION:
-			return "CLFFT_INVALID_OPERATION";
-		case CLFFT_INVALID_EVENT:
-			return "CLFFT_INVALID_EVENT";
-		case CLFFT_INVALID_EVENT_WAIT_LIST:
-			return "CLFFT_INVALID_EVENT_WAIT_LIST";
-		case CLFFT_INVALID_GLOBAL_OFFSET:
-			return "CLFFT_INVALID_GLOBAL_OFFSET";
-		case CLFFT_INVALID_WORK_ITEM_SIZE:
-			return "CLFFT_INVALID_WORK_ITEM_SIZE";
-		case CLFFT_INVALID_WORK_GROUP_SIZE:
-			return "CLFFT_INVALID_WORK_GROUP_SIZE";
-		case CLFFT_INVALID_WORK_DIMENSION:
-			return "CLFFT_INVALID_WORK_DIMENSION";
-		case CLFFT_INVALID_KERNEL_ARGS:
-			return "CLFFT_INVALID_KERNEL_ARGS";
-		case CLFFT_INVALID_ARG_SIZE:
-			return "CLFFT_INVALID_ARG_SIZE";
-		case CLFFT_INVALID_ARG_VALUE:
-			return "CLFFT_INVALID_ARG_VALUE";
-		case CLFFT_INVALID_ARG_INDEX:
-			return "CLFFT_INVALID_ARG_INDEX";
-		case CLFFT_INVALID_KERNEL:
-			return "CLFFT_INVALID_KERNEL";
-		case CLFFT_INVALID_KERNEL_DEFINITION:
-			return "CLFFT_INVALID_KERNEL_DEFINITION";
-		case CLFFT_INVALID_KERNEL_NAME:
-			return "CLFFT_INVALID_KERNEL_NAME";
-		case CLFFT_INVALID_PROGRAM_EXECUTABLE:
-			return "CLFFT_INVALID_PROGRAM_EXECUTABLE";
-		case CLFFT_INVALID_PROGRAM:
-			return "CLFFT_INVALID_PROGRAM";
-		case CLFFT_INVALID_BUILD_OPTIONS:
-			return "CLFFT_INVALID_BUILD_OPTIONS";
-		case CLFFT_INVALID_BINARY:
-			return "CLFFT_INVALID_BINARY";
-		case CLFFT_INVALID_SAMPLER:
-			return "CLFFT_INVALID_SAMPLER";
-		case CLFFT_INVALID_IMAGE_SIZE:
-			return "CLFFT_INVALID_IMAGE_SIZE";
-		case CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR:
-			return "CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR";
-		case CLFFT_INVALID_MEM_OBJECT:
-			return "CLFFT_INVALID_MEM_OBJECT";
-		case CLFFT_INVALID_HOST_PTR:
-			return "CLFFT_INVALID_HOST_PTR";
-		case CLFFT_INVALID_COMMAND_QUEUE:
-			return "CLFFT_INVALID_COMMAND_QUEUE";
-		case CLFFT_INVALID_QUEUE_PROPERTIES:
-			return "CLFFT_INVALID_QUEUE_PROPERTIES";
-		case CLFFT_INVALID_CONTEXT:
-			return "CLFFT_INVALID_CONTEXT";
-		case CLFFT_INVALID_DEVICE:
-			return "CLFFT_INVALID_DEVICE";
-		case CLFFT_INVALID_PLATFORM:
-			return "CLFFT_INVALID_PLATFORM";
-		case CLFFT_INVALID_DEVICE_TYPE:
-			return "CLFFT_INVALID_DEVICE_TYPE";
-		case CLFFT_INVALID_VALUE:
-			return "CLFFT_INVALID_VALUE";
-		case CLFFT_MAP_FAILURE:
-			return "CLFFT_MAP_FAILURE";
-		case CLFFT_BUILD_PROGRAM_FAILURE:
-			return "CLFFT_BUILD_PROGRAM_FAILURE";
-		case CLFFT_IMAGE_FORMAT_NOT_SUPPORTED:
-			return "CLFFT_IMAGE_FORMAT_NOT_SUPPORTED";
-		case CLFFT_IMAGE_FORMAT_MISMATCH:
-			return "CLFFT_IMAGE_FORMAT_MISMATCH";
-		case CLFFT_MEM_COPY_OVERLAP:
-			return "CLFFT_MEM_COPY_OVERLAP";
-		case CLFFT_PROFILING_INFO_NOT_AVAILABLE:
-			return "CLFFT_PROFILING_INFO_NOT_AVAILABLE";
-		case CLFFT_OUT_OF_HOST_MEMORY:
-			return "CLFFT_OUT_OF_HOST_MEMORY";
-		case CLFFT_OUT_OF_RESOURCES:
-			return "CLFFT_OUT_OF_RESOURCES";
-		case CLFFT_MEM_OBJECT_ALLOCATION_FAILURE:
-			return "CLFFT_MEM_OBJECT_ALLOCATION_FAILURE";
-		case CLFFT_COMPILER_NOT_AVAILABLE:
-			return "CLFFT_COMPILER_NOT_AVAILABLE";
-		case CLFFT_DEVICE_NOT_AVAILABLE:
-			return "CLFFT_DEVICE_NOT_AVAILABLE";
-		case CLFFT_DEVICE_NOT_FOUND:
-			return "CLFFT_DEVICE_NOT_FOUND";
-		case CLFFT_SUCCESS:
-			return "CLFFT_SUCCESS";
-		case CLFFT_NOTIMPLEMENTED:
-			return "CLFFT_NOTIMPLEMENTED";
-		case CLFFT_FILE_NOT_FOUND:
-			return "CLFFT_FILE_NOT_FOUND";
-		case CLFFT_FILE_CREATE_FAILURE:
-			return "CLFFT_FILE_CREATE_FAILURE";
-		case CLFFT_VERSION_MISMATCH:
-			return "CLFFT_VERSION_MISMATCH";
-		case CLFFT_INVALID_PLAN:
-			return "CLFFT_INVALID_PLAN";
-		default:
-			return "Error code not defined";
-		break;
-	}
+    switch( status )
+    {
+        case CLFFT_INVALID_GLOBAL_WORK_SIZE:
+            return "CLFFT_INVALID_GLOBAL_WORK_SIZE";
+        case CLFFT_INVALID_MIP_LEVEL:
+            return "CLFFT_INVALID_MIP_LEVEL";
+        case CLFFT_INVALID_BUFFER_SIZE:
+            return "CLFFT_INVALID_BUFFER_SIZE";
+        case CLFFT_INVALID_GL_OBJECT:
+            return "CLFFT_INVALID_GL_OBJECT";
+        case CLFFT_INVALID_OPERATION:
+            return "CLFFT_INVALID_OPERATION";
+        case CLFFT_INVALID_EVENT:
+            return "CLFFT_INVALID_EVENT";
+        case CLFFT_INVALID_EVENT_WAIT_LIST:
+            return "CLFFT_INVALID_EVENT_WAIT_LIST";
+        case CLFFT_INVALID_GLOBAL_OFFSET:
+            return "CLFFT_INVALID_GLOBAL_OFFSET";
+        case CLFFT_INVALID_WORK_ITEM_SIZE:
+            return "CLFFT_INVALID_WORK_ITEM_SIZE";
+        case CLFFT_INVALID_WORK_GROUP_SIZE:
+            return "CLFFT_INVALID_WORK_GROUP_SIZE";
+        case CLFFT_INVALID_WORK_DIMENSION:
+            return "CLFFT_INVALID_WORK_DIMENSION";
+        case CLFFT_INVALID_KERNEL_ARGS:
+            return "CLFFT_INVALID_KERNEL_ARGS";
+        case CLFFT_INVALID_ARG_SIZE:
+            return "CLFFT_INVALID_ARG_SIZE";
+        case CLFFT_INVALID_ARG_VALUE:
+            return "CLFFT_INVALID_ARG_VALUE";
+        case CLFFT_INVALID_ARG_INDEX:
+            return "CLFFT_INVALID_ARG_INDEX";
+        case CLFFT_INVALID_KERNEL:
+            return "CLFFT_INVALID_KERNEL";
+        case CLFFT_INVALID_KERNEL_DEFINITION:
+            return "CLFFT_INVALID_KERNEL_DEFINITION";
+        case CLFFT_INVALID_KERNEL_NAME:
+            return "CLFFT_INVALID_KERNEL_NAME";
+        case CLFFT_INVALID_PROGRAM_EXECUTABLE:
+            return "CLFFT_INVALID_PROGRAM_EXECUTABLE";
+        case CLFFT_INVALID_PROGRAM:
+            return "CLFFT_INVALID_PROGRAM";
+        case CLFFT_INVALID_BUILD_OPTIONS:
+            return "CLFFT_INVALID_BUILD_OPTIONS";
+        case CLFFT_INVALID_BINARY:
+            return "CLFFT_INVALID_BINARY";
+        case CLFFT_INVALID_SAMPLER:
+            return "CLFFT_INVALID_SAMPLER";
+        case CLFFT_INVALID_IMAGE_SIZE:
+            return "CLFFT_INVALID_IMAGE_SIZE";
+        case CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+            return "CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+        case CLFFT_INVALID_MEM_OBJECT:
+            return "CLFFT_INVALID_MEM_OBJECT";
+        case CLFFT_INVALID_HOST_PTR:
+            return "CLFFT_INVALID_HOST_PTR";
+        case CLFFT_INVALID_COMMAND_QUEUE:
+            return "CLFFT_INVALID_COMMAND_QUEUE";
+        case CLFFT_INVALID_QUEUE_PROPERTIES:
+            return "CLFFT_INVALID_QUEUE_PROPERTIES";
+        case CLFFT_INVALID_CONTEXT:
+            return "CLFFT_INVALID_CONTEXT";
+        case CLFFT_INVALID_DEVICE:
+            return "CLFFT_INVALID_DEVICE";
+        case CLFFT_INVALID_PLATFORM:
+            return "CLFFT_INVALID_PLATFORM";
+        case CLFFT_INVALID_DEVICE_TYPE:
+            return "CLFFT_INVALID_DEVICE_TYPE";
+        case CLFFT_INVALID_VALUE:
+            return "CLFFT_INVALID_VALUE";
+        case CLFFT_MAP_FAILURE:
+            return "CLFFT_MAP_FAILURE";
+        case CLFFT_BUILD_PROGRAM_FAILURE:
+            return "CLFFT_BUILD_PROGRAM_FAILURE";
+        case CLFFT_IMAGE_FORMAT_NOT_SUPPORTED:
+            return "CLFFT_IMAGE_FORMAT_NOT_SUPPORTED";
+        case CLFFT_IMAGE_FORMAT_MISMATCH:
+            return "CLFFT_IMAGE_FORMAT_MISMATCH";
+        case CLFFT_MEM_COPY_OVERLAP:
+            return "CLFFT_MEM_COPY_OVERLAP";
+        case CLFFT_PROFILING_INFO_NOT_AVAILABLE:
+            return "CLFFT_PROFILING_INFO_NOT_AVAILABLE";
+        case CLFFT_OUT_OF_HOST_MEMORY:
+            return "CLFFT_OUT_OF_HOST_MEMORY";
+        case CLFFT_OUT_OF_RESOURCES:
+            return "CLFFT_OUT_OF_RESOURCES";
+        case CLFFT_MEM_OBJECT_ALLOCATION_FAILURE:
+            return "CLFFT_MEM_OBJECT_ALLOCATION_FAILURE";
+        case CLFFT_COMPILER_NOT_AVAILABLE:
+            return "CLFFT_COMPILER_NOT_AVAILABLE";
+        case CLFFT_DEVICE_NOT_AVAILABLE:
+            return "CLFFT_DEVICE_NOT_AVAILABLE";
+        case CLFFT_DEVICE_NOT_FOUND:
+            return "CLFFT_DEVICE_NOT_FOUND";
+        case CLFFT_SUCCESS:
+            return "CLFFT_SUCCESS";
+        case CLFFT_NOTIMPLEMENTED:
+            return "CLFFT_NOTIMPLEMENTED";
+        case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
+            return "CLFFT_TRANSPOSED_NOTIMPLEMENTED";
+        case CLFFT_FILE_NOT_FOUND:
+            return "CLFFT_FILE_NOT_FOUND";
+        case CLFFT_FILE_CREATE_FAILURE:
+            return "CLFFT_FILE_CREATE_FAILURE";
+        case CLFFT_VERSION_MISMATCH:
+            return "CLFFT_VERSION_MISMATCH";
+        case CLFFT_INVALID_PLAN:
+            return "CLFFT_INVALID_PLAN";
+        default:
+            return "Error code not defined";
+        break;
+    }
 }
 
 std::vector< cl_device_id > initializeCL( cl_device_type deviceType,
-										  cl_uint deviceGpuList,
-										  cl_context& context,
-										  bool printclInfo )
+                                          cl_uint deviceGpuList,
+                                          cl_context& context,
+                                          bool printclInfo )
 {
-	cl_int status = 0;
-
-	/*
-		* Have a look at the available platforms and pick either
-		* the AMD one if available or a reasonable default.
-		*/
-
-	cl_uint numPlatforms	= 0;
-	cl_platform_id platform = NULL;
-	OPENCL_V_THROW( ::clGetPlatformIDs( 0, NULL, &numPlatforms ),
-			"Getting number of platforms( ::clGetPlatformsIDs() )" );
-
-	if( numPlatforms > 0 )
-	{
-		std::vector< cl_platform_id > platforms( numPlatforms );
-		OPENCL_V_THROW( ::clGetPlatformIDs( numPlatforms, &platforms[ 0 ], NULL ),
-			"Getting Platform Id's ( ::clGetPlatformsIDs() )" );
-
-		//	TODO: How should we determine what platform to choose?  We are just defaulting to the last one reported, as we
-		//	print out the info
-		for( unsigned int i=0; i < numPlatforms; ++i )
-		{
-			if( printclInfo )
-			{
-				std::cout << "OpenCL platform [ " << i << " ]:" << std::endl;
-				prettyPrintPlatformInfo( platforms[i] );
-			}
-
-			platform = platforms[i];
-		}
-	}
-
-	if( NULL == platform )
-	{
-		throw std::runtime_error( "No appropriate OpenCL platform could be found" );
-	}
-
-	/*
-	 * If we could find our platform, use it. Otherwise use just available platform.
-	 */
-
-	//	Get the device list for this type.
-	//
-	cl_uint num_devices = 0;
-	OPENCL_V_THROW( ::clGetDeviceIDs( platform, deviceType, 0, NULL, &num_devices ),
-		"Getting OpenCL devices ( ::clGetDeviceIDs() )" );
-	if( 0 == num_devices )
-	{
-		OPENCL_V_THROW( CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
-	}
-
-	std::vector< cl_device_id > deviceIDs( num_devices );
-	OPENCL_V_THROW( ::clGetDeviceIDs( platform, deviceType, num_devices, &deviceIDs[0], NULL),
-		"Getting OpenCL deviceIDs ( ::clGetDeviceIDs() )" );
-
-	if( (CL_DEVICE_TYPE_GPU == deviceType) && (~cl_uint(0) != deviceGpuList) )
-	{
-		//	The command line options specify to user certain gpu(s)
-		//
-		for( unsigned u = (unsigned) deviceIDs.size(); u-- > 0; )
-		{
-			if( 0 != (deviceGpuList & (1<<u) ) )
-				continue;
-
-			//  Remove this GPU from the list
-			deviceIDs[u] = deviceIDs.back();
-			deviceIDs.pop_back();
-		}
-	}
-
-	if( 0 == deviceIDs.size( ) )
-	{
-		OPENCL_V_THROW( CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
-	}
-
-	cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
-
-	/////////////////////////////////////////////////////////////////
-	// Create an OpenCL context
-	/////////////////////////////////////////////////////////////////
-	context = clCreateContext( cps,
-							   (cl_uint) deviceIDs.size(),
-							   & deviceIDs[0],
-							   NULL,
-							   NULL,
-							   &status);
-	OPENCL_V_THROW( status, "Creating Context ( ::clCreateContextFromType() )" );
-
-	/* First, get the size of device list data */
-	size_t deviceListSize;
-	OPENCL_V_THROW( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize ),
-		"Getting device array size ( ::clGetContextInfo() )" );
-
-	/////////////////////////////////////////////////////////////////
-	// Detect OpenCL devices
-	/////////////////////////////////////////////////////////////////
-	std::vector< cl_device_id > devices( deviceListSize/sizeof( cl_device_id ) );
-
-	/* Now, get the device list data */
-	OPENCL_V_THROW( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, &devices[ 0 ], NULL ),
-		"Getting device array ( ::clGetContextInfo() )" );
-
-	if( printclInfo )
-	{
-		cl_uint cContextDevices	= 0;
-
-		size_t deviceVersionSize	= 0;
-		OPENCL_V_THROW( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
-			"Getting CL_DEVICE_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
-
-		std::vector< char > szDeviceVersion( deviceVersionSize );
-		OPENCL_V_THROW( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
-			"Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
-
-		char openclstr[11]="OpenCL 1.0";
-
-		if (!strncmp((const char*)&szDeviceVersion[ 0 ], openclstr, 10))
-		{
-			cContextDevices	= 1;
-		}
-		else
-		{
-			OPENCL_V_THROW( ::clGetContextInfo( context, CL_CONTEXT_NUM_DEVICES, sizeof( cContextDevices ), &cContextDevices, NULL ),
-				"Getting number of context devices ( ::clGetContextInfo() )" );
-		}
-
-		for( cl_uint i = 0; i < cContextDevices; ++i )
-		{
-			std::cout << "OpenCL devices [ " << i << " ]:" << std::endl;
-			prettyPrintDeviceInfo( devices[i] );
-		}
-	}
-
-	return devices;
+    cl_int status = 0;
+
+    /*
+        * Have a look at the available platforms and pick either
+        * the AMD one if available or a reasonable default.
+        */
+
+    cl_uint numPlatforms	= 0;
+    cl_platform_id platform = NULL;
+    OPENCL_V_THROW( ::clGetPlatformIDs( 0, NULL, &numPlatforms ),
+            "Getting number of platforms( ::clGetPlatformsIDs() )" );
+
+    if( numPlatforms > 0 )
+    {
+        std::vector< cl_platform_id > platforms( numPlatforms );
+        OPENCL_V_THROW( ::clGetPlatformIDs( numPlatforms, &platforms[ 0 ], NULL ),
+            "Getting Platform Id's ( ::clGetPlatformsIDs() )" );
+
+        //	TODO: How should we determine what platform to choose?  We are just defaulting to the last one reported, as we
+        //	print out the info
+        for( unsigned int i=0; i < numPlatforms; ++i )
+        {
+            if( printclInfo )
+            {
+                std::cout << "OpenCL platform [ " << i << " ]:" << std::endl;
+                prettyPrintPlatformInfo( platforms[i] );
+            }
+
+            platform = platforms[i];
+        }
+    }
+
+    if( NULL == platform )
+    {
+        throw std::runtime_error( "No appropriate OpenCL platform could be found" );
+    }
+
+    /*
+     * If we could find our platform, use it. Otherwise just use an available platform.
+     */
+
+    //	Get the device list for this type.
+    //
+    cl_uint num_devices = 0;
+    OPENCL_V_THROW( ::clGetDeviceIDs( platform, deviceType, 0, NULL, &num_devices ),
+        "Getting OpenCL devices ( ::clGetDeviceIDs() )" );
+    if( 0 == num_devices )
+    {
+        OPENCL_V_THROW( CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
+    }
+
+    std::vector< cl_device_id > deviceIDs( num_devices );
+    OPENCL_V_THROW( ::clGetDeviceIDs( platform, deviceType, num_devices, &deviceIDs[0], NULL),
+        "Getting OpenCL deviceIDs ( ::clGetDeviceIDs() )" );
+
+    if( (CL_DEVICE_TYPE_GPU == deviceType) && (~cl_uint(0) != deviceGpuList) )
+    {
+        //	The command line options specify to use certain gpu(s)
+        //
+        for( unsigned u = (unsigned) deviceIDs.size(); u-- > 0; )
+        {
+            if( 0 != (deviceGpuList & (1<<u) ) )
+                continue;
+
+            //  Remove this GPU from the list
+            deviceIDs[u] = deviceIDs.back();
+            deviceIDs.pop_back();
+        }
+    }
+
+    if( 0 == deviceIDs.size( ) )
+    {
+        OPENCL_V_THROW( CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
+    }
+
+    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
+
+    /////////////////////////////////////////////////////////////////
+    // Create an OpenCL context
+    /////////////////////////////////////////////////////////////////
+    context = clCreateContext( cps,
+                               (cl_uint) deviceIDs.size(),
+                               & deviceIDs[0],
+                               NULL,
+                               NULL,
+                               &status);
+    OPENCL_V_THROW( status, "Creating Context ( ::clCreateContextFromType() )" );
+
+    /* First, get the size of device list data */
+    size_t deviceListSize;
+    OPENCL_V_THROW( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize ),
+        "Getting device array size ( ::clGetContextInfo() )" );
+
+    /////////////////////////////////////////////////////////////////
+    // Detect OpenCL devices
+    /////////////////////////////////////////////////////////////////
+    std::vector< cl_device_id > devices( deviceListSize/sizeof( cl_device_id ) );
+
+    /* Now, get the device list data */
+    OPENCL_V_THROW( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, &devices[ 0 ], NULL ),
+        "Getting device array ( ::clGetContextInfo() )" );
+
+    if( printclInfo )
+    {
+        cl_uint cContextDevices	= 0;
+
+        size_t deviceVersionSize	= 0;
+        OPENCL_V_THROW( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
+            "Getting CL_DEVICE_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+        std::vector< char > szDeviceVersion( deviceVersionSize );
+        OPENCL_V_THROW( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
+            "Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+        char openclstr[11]="OpenCL 1.0";
+
+        if (!strncmp((const char*)&szDeviceVersion[ 0 ], openclstr, 10))
+        {
+            cContextDevices	= 1;
+        }
+        else
+        {
+            OPENCL_V_THROW( ::clGetContextInfo( context, CL_CONTEXT_NUM_DEVICES, sizeof( cContextDevices ), &cContextDevices, NULL ),
+                "Getting number of context devices ( ::clGetContextInfo() )" );
+        }
+
+        for( cl_uint i = 0; i < cContextDevices; ++i )
+        {
+            std::cout << "OpenCL devices [ " << i << " ]:" << std::endl;
+            prettyPrintDeviceInfo( devices[i] );
+        }
+    }
+
+    return devices;
 }
 
 int cleanupCL( cl_context* context, cl_command_queue* commandQueue,
-	const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[], cl_event* outEvent )
+    const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[], cl_event* outEvent )
 {
-	if( *outEvent != NULL )
-		OPENCL_V_THROW( clReleaseEvent( *outEvent ), "Error: In clReleaseEvent\n" );
+    if( *outEvent != NULL )
+        OPENCL_V_THROW( clReleaseEvent( *outEvent ), "Error: In clReleaseEvent\n" );
 
-	releaseOpenCLMemBuffer( numBuffersIn, inputBuffer);
-	releaseOpenCLMemBuffer( numBuffersOut, outputBuffer);
+    releaseOpenCLMemBuffer( numBuffersIn, inputBuffer);
+    releaseOpenCLMemBuffer( numBuffersOut, outputBuffer);
 
-	if( *commandQueue != NULL )
-		OPENCL_V_THROW( clReleaseCommandQueue( *commandQueue ), "Error: In clReleaseCommandQueue\n" );
+    if( *commandQueue != NULL )
+        OPENCL_V_THROW( clReleaseCommandQueue( *commandQueue ), "Error: In clReleaseCommandQueue\n" );
 
-	if( *context != NULL )
-		OPENCL_V_THROW( clReleaseContext( *context ), "Error: In clReleaseContext\n" );
+    if( *context != NULL )
+        OPENCL_V_THROW( clReleaseContext( *context ), "Error: In clReleaseContext\n" );
 
-	return 0;
+    return 0;
 }
 
 int createOpenCLMemoryBuffer( cl_context& context, const size_t bufferSizeBytes, const cl_uint numBuffers, cl_mem buffer[], cl_mem_flags accessibility) {
-	cl_int status = 0;
+    cl_int status = 0;
 
-	for( cl_uint i = 0; i < numBuffers; ++i )
-	{
-		buffer[ i ] = ::clCreateBuffer( context, accessibility, bufferSizeBytes, NULL, &status);
-		OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
-	}
+    for( cl_uint i = 0; i < numBuffers; ++i )
+    {
+        buffer[ i ] = ::clCreateBuffer( context, accessibility, bufferSizeBytes, NULL, &status);
+        OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
+    }
 
-	return 0;
+    return 0;
 }
 
 int releaseOpenCLMemBuffer( const cl_uint numBuffers, cl_mem buffer[])
 {
-	for( cl_uint i = 0; i < numBuffers; ++i )
-	{
-		if( buffer[ i ] != NULL )
-			OPENCL_V_THROW( clReleaseMemObject( buffer[ i ] ), "Error: In clReleaseMemObject\n" );
-	}
+    for( cl_uint i = 0; i < numBuffers; ++i )
+    {
+        if( buffer[ i ] != NULL )
+            OPENCL_V_THROW( clReleaseMemObject( buffer[ i ] ), "Error: In clReleaseMemObject\n" );
+    }
 
-	return 0;
+    return 0;
 }
 
 void createOpenCLCommandQueue( cl_context& context,
-							   cl_uint commandQueueFlags,
-							   cl_command_queue& commandQueue,
-							   std::vector< cl_device_id > devices,
-							   const size_t bufferSizeBytesIn,
-							   const cl_uint numBuffersIn,
-							   cl_mem clMemBufferIn[],
-							   const size_t bufferSizeBytesOut,
-							   const cl_uint numBuffersOut,
-							   cl_mem clMemBufferOut[] )
+                               cl_uint commandQueueFlags,
+                               cl_command_queue& commandQueue,
+                               std::vector< cl_device_id > devices,
+                               const size_t bufferSizeBytesIn,
+                               const cl_uint numBuffersIn,
+                               cl_mem clMemBufferIn[],
+                               const size_t bufferSizeBytesOut,
+                               const cl_uint numBuffersOut,
+                               cl_mem clMemBufferOut[] )
 {
-	cl_int status = 0;
-	commandQueue = ::clCreateCommandQueue( context, devices[0], commandQueueFlags, &status );
-	OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" );
+    cl_int status = 0;
+    commandQueue = ::clCreateCommandQueue( context, devices[0], commandQueueFlags, &status );
+    OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" );
 
-	createOpenCLMemoryBuffer( context, bufferSizeBytesIn,  numBuffersIn,  clMemBufferIn,  CL_MEM_READ_WRITE);
-	createOpenCLMemoryBuffer( context, bufferSizeBytesOut, numBuffersOut, clMemBufferOut, CL_MEM_READ_WRITE);
+    createOpenCLMemoryBuffer( context, bufferSizeBytesIn,  numBuffersIn,  clMemBufferIn,  CL_MEM_READ_WRITE);
+    createOpenCLMemoryBuffer( context, bufferSizeBytesOut, numBuffersOut, clMemBufferOut, CL_MEM_READ_WRITE);
 }
 
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 63ac0f9..0c81ae3 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -20,7 +20,8 @@ set( clFFT.Source	transform.cpp
 								plan.cpp 
 								repo.cpp 
 								generator.stockham.cpp 
-								generator.transpose.cpp 
+								generator.transpose.vliw.cpp 
+								generator.transpose.gcn.cpp 
 								generator.copy.cpp
 								lifetime.cpp 
 								stdafx.cpp )
@@ -37,7 +38,8 @@ set( clFFT.Headers	private.h
 					mainpage.h  
 					generator.h 
 					generator.stockham.h 
-					generator.transpose.h 
+					generator.transpose.vliw.h 
+					generator.transpose.gcn.h 
 					../include/stdafx.h 
 					../include/unicode.compatibility.h 
 					../include/targetver.h 
diff --git a/src/library/accessors.cpp b/src/library/accessors.cpp
index 8d6ce65..81f3fec 100644
--- a/src/library/accessors.cpp
+++ b/src/library/accessors.cpp
@@ -79,29 +79,7 @@ clfftStatus clfftGetPlanPrecision( const clfftPlanHandle plHandle, clfftPrecisio
 	return	CLFFT_SUCCESS;
 }
 
-// This is a helper function to query a device for it's caps and check whether a certain user supplied cap is present
-// Returns CLFFT_SUCCESS if the cap is present, CLFFT_INVALID_OPERATION if it is not found.  All devices specified
-// in the devices vector must contain the cap.
-clfftStatus checkDevExt( std::string cap, std::vector< cl_device_id >& devices )
-{
-	for( size_t d = 0; d < devices.size( ); ++d)
-	{
-		size_t deviceExtSize	= 0;
-		OPENCL_V( ::clGetDeviceInfo( devices[ d ], CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
-			"Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
-
-		std::vector< char > szDeviceExt( deviceExtSize );
-		OPENCL_V( ::clGetDeviceInfo( devices[ d ], CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
-			"Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
-
-		std::string strDeviceExt = &szDeviceExt[ 0 ];
 
-		if( strDeviceExt.find( cap.c_str( ), 0 ) == std::string::npos )
-			return CLFFT_DEVICE_NO_DOUBLE;
-	}
-
-	return CLFFT_SUCCESS;
-}
 clfftStatus clfftSetPlanPrecision( clfftPlanHandle plHandle, clfftPrecision precision )
 {
 	FFTRepo& fftRepo	= FFTRepo::getInstance( );
@@ -118,18 +96,7 @@ clfftStatus clfftSetPlanPrecision( clfftPlanHandle plHandle, clfftPrecision prec
 	if( precision == CLFFT_SINGLE_FAST || precision == CLFFT_DOUBLE_FAST )
 		return CLFFT_NOTIMPLEMENTED;
 
-	//	If the user specifies double precision, check that the device supports double precision first
-	if( precision == CLFFT_DOUBLE || precision == CLFFT_DOUBLE_FAST )
-	{
-		clfftStatus retAmdFp64 = checkDevExt( "cl_amd_fp64", fftPlan->devices );
-		if( retAmdFp64 != CLFFT_SUCCESS )
-		{
-			//	If AMD's extention is not supported, check for Khronos extention
-			clfftStatus retKhrFp64 = checkDevExt( "cl_khr_fp64", fftPlan->devices );
-			if( retKhrFp64 != CLFFT_SUCCESS )
-				return retKhrFp64;
-		}
-	}
+
 
 	//	If we modify the state of the plan, we assume that we can't trust any pre-calculated contents anymore
 	fftPlan->baked		= false;
@@ -784,34 +751,6 @@ clfftStatus clfftGetTmpBufSize( const clfftPlanHandle plHandle, size_t* buffersi
 	return CLFFT_INVALID_OPERATION;
 }
 
-clfftStatus clfftSetInternal( clfftPlanHandle plHandle, void* data )
-{
-	FFTRepo& fftRepo	= FFTRepo::getInstance( );
-	FFTPlan* fftPlan	= NULL;
-	lockRAII* planLock	= NULL;
-
-	OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
-	scopedLock sLock( *planLock, _T( "clfftSetResultLocation" ) );
-
-	struct InternalData {
-		size_t					large1D_Xfactor;
-		size_t					cacheSize;
-		bool                    bLdsComplex;
-		bool                    ldsPadding;
-		unsigned                uLdsFraction;
-	} *mydata;
-
-	mydata = (InternalData *) data;
-
-	fftPlan->large1D_Xfactor = mydata->large1D_Xfactor;
-	fftPlan->cacheSize       = mydata->cacheSize;
-	fftPlan->bLdsComplex     = mydata->bLdsComplex;
-	fftPlan->ldsPadding      = mydata->ldsPadding;
-	fftPlan->uLdsFraction    = mydata->uLdsFraction;
-
-	return	CLFFT_SUCCESS;
-}
-
 clfftStatus clfftLocalMemSize( const clfftPlanHandle plHandle, cl_ulong* local_mem_size )
 {
 	FFTRepo& repo = FFTRepo::getInstance( );
diff --git a/src/library/generator.copy.cpp b/src/library/generator.copy.cpp
index b88adae..1221882 100644
--- a/src/library/generator.copy.cpp
+++ b/src/library/generator.copy.cpp
@@ -32,6 +32,7 @@ namespace CopyGenerator
 		size_t Nt;
 		const FFTKernelGenKeyParams params;
 		bool h2c, c2h;
+		bool general;
 
 		inline std::string OffsetCalc(const std::string &off, bool input = true)
 		{
@@ -39,72 +40,21 @@ namespace CopyGenerator
 
 			const size_t *pStride = input ? params.fft_inStride : params.fft_outStride;
 
-			std::string batch = "batch";
-
-			switch(params.fft_DataDim)
+			str += "\t"; str += off; str += " = ";
+			std::string nextBatch = "batch";
+			for(size_t i=(params.fft_DataDim - 1); i>1; i--)
 			{
-			case 5:
-				{
-					str += "\t{\n\tuint ocalc1 = ";
-					str += batch; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2] * params.fft_N[3]);
-					str += ";\n";
-
-					str += "\tuint ocalc0 = ";
-					str += "ocalc1"; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2]);
-					str += ";\n";
-
-					str += "\t"; str += off; str += " = ";
-					str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2] * params.fft_N[3]);
-					str += ")*"; str += SztToStr(pStride[4]); str += " + ";
+				size_t currentLength = 1;
+				for(int j=1; j<i; j++) currentLength *= params.fft_N[j];
 
-					str += "(ocalc1"; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2]); str += ")*";
-					str += SztToStr(pStride[3]); str += " + ";
-
-					str += "(ocalc0"; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
-					str += SztToStr(pStride[2]); str += " + ";
-					str += "(ocalc0"; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
-					str += SztToStr(pStride[1]); str += ";\n";
-
-					str += "\t}\n";
-				}
-				break;
-			case 4:
-				{
-					str += "\t{\n\tuint ocalc0 = ";
-					str += batch; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2]);
-					str += ";\n";
+				str += "("; str += nextBatch; str += "/"; str += SztToStr(currentLength);
+				str += ")*"; str += SztToStr(pStride[i]); str += " + ";
 
-					str += "\t"; str += off; str += " = ";
-					str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2]); str += ")*";
-					str += SztToStr(pStride[3]); str += " + ";
-
-					str += "(ocalc0"; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
-					str += SztToStr(pStride[2]); str += " + ";
-					str += "(ocalc0"; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
-					str += SztToStr(pStride[1]); str += ";\n";
-
-					str += "\t}\n";
-				}
-				break;
-			case 3:
-				{
-					str += "\t"; str += off; str += " = ";
-					str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
-					str += SztToStr(pStride[2]); str += " + ";
-					str += "("; str += batch; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
-					str += SztToStr(pStride[1]); str += ";\n";
-				}
-				break;
-			case 2:
-				{
-					str += "\t"; str += off; str += " = ";
-					str += batch; str += "*"; str += SztToStr(pStride[1]); str += ";\n";
-				}
-				break;
-			default:
-				assert(false);
+				nextBatch = "(" + nextBatch + "%" + SztToStr(currentLength) + ")";
 			}
 
+			str += nextBatch; str += "*"; str += SztToStr(pStride[1]); str += ";\n";
+
 			return str;
 		}
 
@@ -121,6 +71,8 @@ namespace CopyGenerator
 			c2h = (	(params.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) ||
 					(params.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
 
+			general = !(h2c || c2h);
+
 			// We only do out-of-place copies at this point
 			assert(params.fft_placeness == CLFFT_OUTOFPLACE);
 		}
@@ -148,8 +100,13 @@ namespace CopyGenerator
 			str += "__kernel void ";
 
 			// Function name
-			if(h2c)	str += "copy_h2c";
-			else	str += "copy_c2h";
+			if(general)
+					str += "copy_general";
+			else
+			{
+				if(h2c)	str += "copy_h2c";
+				else	str += "copy_c2h";
+			}
 
 			str += "(";
 
@@ -177,7 +134,15 @@ namespace CopyGenerator
 			str += "{\n";
 
 			// Initialize
-			str += "\tuint me = get_global_id(0);\n\t";
+			if(general)
+			{
+				str += "\tuint me = get_local_id(0);\n\t";
+				str += "uint batch = get_group_id(0);\n\t";
+			}
+			else
+			{
+				str += "\tuint me = get_global_id(0);\n\t";
+			}
 
 			// Declare memory pointers
 			str += "\n\t";
@@ -222,11 +187,14 @@ namespace CopyGenerator
 			// Setup registers
 			str += "\t"; str += RegBaseType<PR>(2); str += " R;\n\n";
 
-			// Setup variables
-			str += "\tuint batch, mel, mel2;\n\t";
-			str += "batch = me/"; str += SztToStr(Nt); str += ";\n\t";
-			str += "mel = me%"; str += SztToStr(Nt); str += ";\n\t";
-			str += "mel2 = ("; str += SztToStr(N); str += " - mel)%"; str += SztToStr(N); str += ";\n\n";
+			if(!general)
+			{
+				// Setup variables
+				str += "\tuint batch, mel, mel2;\n\t";
+				str += "batch = me/"; str += SztToStr(Nt); str += ";\n\t";
+				str += "mel = me%"; str += SztToStr(Nt); str += ";\n\t";
+				str += "mel2 = ("; str += SztToStr(N); str += " - mel)%"; str += SztToStr(N); str += ";\n\n";
+			}
 
 
 			// Setup memory pointers
@@ -235,96 +203,132 @@ namespace CopyGenerator
 
 			// offset strings
 			std::string inF, inF2, outF, outF2;
-			inF   = "(mel*";  inF   += SztToStr(params.fft_inStride[0]);  inF   += ")";
-			inF2  = "(mel2*"; inF2  += SztToStr(params.fft_inStride[0]);  inF2  += ")";
-			outF  = "(mel*";  outF  += SztToStr(params.fft_outStride[0]); outF  += ")";
-			outF2 = "(mel2*"; outF2 += SztToStr(params.fft_outStride[0]); outF2 += ")";
+			if(general)
+			{
+				inF = inF2 = outF = outF2 = "";
+			}
+			else
+			{
+				inF   = " + (mel*";  inF   += SztToStr(params.fft_inStride[0]);  inF   += ")";
+				inF2  = " + (mel2*"; inF2  += SztToStr(params.fft_inStride[0]);  inF2  += ")";
+				outF  = " + (mel*";  outF  += SztToStr(params.fft_outStride[0]); outF  += ")";
+				outF2 = " + (mel2*"; outF2 += SztToStr(params.fft_outStride[0]); outF2 += ")";
+			}
 
 			str += "\n\t";
 
 			// inputs
 			if(inIlvd)
 			{
-				str += "lwbIn = gbIn + iOffset + "; str += inF; str += ";\n\t";
+				str += "lwbIn = gbIn + iOffset"; str += inF; str += ";\n\t";
 			}
 			else
 			{
-				str += "lwbInRe = gbInRe + iOffset + "; str += inF; str += ";\n\t";
-				str += "lwbInIm = gbInIm + iOffset + "; str += inF; str += ";\n\t";
+				str += "lwbInRe = gbInRe + iOffset"; str += inF; str += ";\n\t";
+				str += "lwbInIm = gbInIm + iOffset"; str += inF; str += ";\n\t";
 			}
 
 			// outputs
 			if(outIlvd)
 			{
-					str += "lwbOut = gbOut + oOffset + "; str += outF; str += ";\n";
+					str += "lwbOut = gbOut + oOffset"; str += outF; str += ";\n";
 				if(h2c)
 				{
 					str += "\t";
-					str += "lwbOut2 = gbOut + oOffset + "; str += outF2; str += ";\n";
+					str += "lwbOut2 = gbOut + oOffset"; str += outF2; str += ";\n";
 				}
 			}
 			else
 			{
-					str += "lwbOutRe = gbOutRe + oOffset + "; str += outF; str += ";\n\t";
-					str += "lwbOutIm = gbOutIm + oOffset + "; str += outF; str += ";\n";
+					str += "lwbOutRe = gbOutRe + oOffset"; str += outF; str += ";\n\t";
+					str += "lwbOutIm = gbOutIm + oOffset"; str += outF; str += ";\n";
 				if(h2c)
 				{
 					str += "\t";
-					str += "lwbOutRe2 = gbOutRe + oOffset + "; str += outF2; str += ";\n\t";
-					str += "lwbOutIm2 = gbOutIm + oOffset + "; str += outF2; str += ";\n";
+					str += "lwbOutRe2 = gbOutRe + oOffset"; str += outF2; str += ";\n\t";
+					str += "lwbOutIm2 = gbOutIm + oOffset"; str += outF2; str += ";\n";
 				}
 			}
 
 			str += "\n\t";
 
 			// Do the copy
-			if(c2h)
+			if(general)
 			{
+				str += "for(uint t=0; t<"; str += SztToStr(N/64); str += "; t++)\n\t{\n\t\t";
+				
 				if(inIlvd)
 				{
-					str += "R = lwbIn[0];\n\t";
+					str += "R = lwbIn[me + t*64];\n\t\t";
 				}
 				else
 				{
-					str += "R.x = lwbInRe[0];\n\t";
-					str += "R.y = lwbInIm[0];\n\t";
+					str += "R.x = lwbInRe[me + t*64];\n\t\t";
+					str += "R.y = lwbInIm[me + t*64];\n\t\t";
 				}
 
 				if(outIlvd)
 				{
-					str += "lwbOut[0] = R;\n\n";
+					str += "lwbOut[me + t*64] = R;\n";
 				}
 				else
 				{
-					str += "lwbOutRe[0] = R.x;\n\t";
-					str += "lwbOutIm[0] = R.y;\n\t";
+					str += "lwbOutRe[me + t*64] = R.x;\n\t\t";
+					str += "lwbOutIm[me + t*64] = R.y;\n";
 				}
+
+				str += "\t}\n\n";
 			}
 			else
 			{
-				if(inIlvd)
+				if(c2h)
 				{
-					str += "R = lwbIn[0];\n\t";
+					if(inIlvd)
+					{
+						str += "R = lwbIn[0];\n\t";
+					}
+					else
+					{
+						str += "R.x = lwbInRe[0];\n\t";
+						str += "R.y = lwbInIm[0];\n\t";
+					}
+
+					if(outIlvd)
+					{
+						str += "lwbOut[0] = R;\n\n";
+					}
+					else
+					{
+						str += "lwbOutRe[0] = R.x;\n\t";
+						str += "lwbOutIm[0] = R.y;\n\t";
+					}
 				}
 				else
 				{
-					str += "R.x = lwbInRe[0];\n\t";
-					str += "R.y = lwbInIm[0];\n\t";
-				}
-
-				if(outIlvd)
-				{
-					str += "lwbOut[0] = R;\n\t";
-					str += "R.y = -R.y;\n\t";
-					str += "lwbOut2[0] = R;\n\n";
-				}
-				else
-				{
-					str += "lwbOutRe[0] = R.x;\n\t";
-					str += "lwbOutIm[0] = R.y;\n\t";
-					str += "R.y = -R.y;\n\t";
-					str += "lwbOutRe2[0] = R.x;\n\t";
-					str += "lwbOutIm2[0] = R.y;\n\n";
+					if(inIlvd)
+					{
+						str += "R = lwbIn[0];\n\t";
+					}
+					else
+					{
+						str += "R.x = lwbInRe[0];\n\t";
+						str += "R.y = lwbInIm[0];\n\t";
+					}
+
+					if(outIlvd)
+					{
+						str += "lwbOut[0] = R;\n\t";
+						str += "R.y = -R.y;\n\t";
+						str += "lwbOut2[0] = R;\n\n";
+					}
+					else
+					{
+						str += "lwbOutRe[0] = R.x;\n\t";
+						str += "lwbOutIm[0] = R.y;\n\t";
+						str += "R.y = -R.y;\n\t";
+						str += "lwbOutRe2[0] = R.x;\n\t";
+						str += "lwbOutIm2[0] = R.y;\n\n";
+					}
 				}
 			}
 
@@ -355,58 +359,17 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Copy> (FFTKernelGenKeyParams & params) c
 
     params.fft_outputLayout = this->outputLayout;
 
-    switch (this->inStride.size()) {
-        //    1-D array is a 2-D data structure.
-        //    1-D unit is a special case of 1-D array.
-    case 1:
-        ARG_CHECK(this->length   .size() > 0);
-        ARG_CHECK(this->outStride.size() > 0);
-        params.fft_DataDim      = 2;
-        params.fft_N[0]         = this->length[0];
-        params.fft_inStride[0]  = this->inStride[0];
-        params.fft_inStride[1]  = this->iDist;
-        params.fft_outStride[0] = this->outStride[0];
-        params.fft_outStride[1] = this->oDist;
-        break;
-
-        //    2-D array is a 3-D data structure
-        //    2-D unit is a speical case of 2-D array.
-    case 2:
-        ARG_CHECK(this->length   .size() > 1);
-        ARG_CHECK(this->outStride.size() > 1);
-        params.fft_DataDim      = 3;
-        params.fft_N[0]         = this->length[0];
-        params.fft_N[1]         = this->length[1];
-        params.fft_inStride[0]  = this->inStride[0];
-        params.fft_inStride[1]  = this->inStride[1];
-        params.fft_inStride[2]  = this->iDist;
-        params.fft_outStride[0] = this->outStride[0];
-        params.fft_outStride[1] = this->outStride[1];
-        params.fft_outStride[2] = this->oDist;
-        break;
-
-        //    3-D array is a 4-D data structure
-        //    3-D unit is a special case of 3-D array.
-    case 3:
-        ARG_CHECK(this->length   .size() > 2);
-        ARG_CHECK(this->outStride.size() > 2);
-        params.fft_DataDim      = 4;
-        params.fft_N[0]         = this->length[0];
-        params.fft_N[1]         = this->length[1];
-        params.fft_N[2]         = this->length[2];
-        params.fft_inStride[0]  = this->inStride[0];
-        params.fft_inStride[1]  = this->inStride[1];
-        params.fft_inStride[2]  = this->inStride[2];
-        params.fft_inStride[3]  = this->iDist;
-        params.fft_outStride[0] = this->outStride[0];
-        params.fft_outStride[1] = this->outStride[1];
-        params.fft_outStride[2] = this->outStride[2];
-        params.fft_outStride[3] = this->oDist;
-        break;
-
-    default:
-        ARG_CHECK (false);
-    }
+	params.fft_DataDim = this->length.size() + 1;
+	int i = 0;
+	for(i = 0; i < (params.fft_DataDim - 1); i++)
+	{
+        params.fft_N[i]         = this->length[i];
+        params.fft_inStride[i]  = this->inStride[i];
+        params.fft_outStride[i] = this->outStride[i];
+
+	}
+    params.fft_inStride[i]  = this->iDist;
+    params.fft_outStride[i] = this->oDist;
 
     params.fft_fwdScale  = this->forwardScale;
     params.fft_backScale = this->backwardScale;
@@ -420,13 +383,33 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Copy> (std::vector<size_t> & globalWS, std:
     FFTKernelGenKeyParams fftParams;
 	OPENCL_V( this->GetKernelGenKeyPvt<Copy>( fftParams ), _T("GetKernelGenKey() failed!") );
 
+	bool h2c, c2h;
+	h2c = (	(fftParams.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) ||
+			(fftParams.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+	c2h = (	(fftParams.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) ||
+			(fftParams.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+
+	bool general = !(h2c || c2h);
+
 	size_t count = this->batchsize;
+
 	switch(fftParams.fft_DataDim)
 	{
 	case 5: assert(false);
 	case 4: count *= fftParams.fft_N[2];
 	case 3: count *= fftParams.fft_N[1];
-	case 2: count *= (1 + fftParams.fft_N[0]/2); break;
+	case 2:
+			{
+				if(general)
+				{
+					count *= 64;
+				}
+				else
+				{
+					count *= (1 + fftParams.fft_N[0]/2); 
+				}
+			}
+			break;
 	case 1: assert(false);
 	}
 
@@ -445,11 +428,19 @@ clfftStatus FFTPlan::GetMax1DLengthPvt<Copy> (size_t * longest) const
 using namespace CopyGenerator;
 
 template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo, const cl_command_queue commQueueFFT ) const
+clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
 {
   FFTKernelGenKeyParams params;
   OPENCL_V( this->GetKernelGenKeyPvt<Copy> (params), _T("GetKernelGenKey() failed!") );
 
+  bool h2c, c2h;
+  h2c = (	(params.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) ||
+  			(params.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+  c2h = (	(params.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) ||
+  			(params.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+  
+  bool general = !(h2c || c2h);
+
   std::string programCode;
   Precision pr = (params.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
   switch(pr)
@@ -466,14 +457,25 @@ clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo, const cl_command_
     } break;
   }
 
-  cl_int status = CL_SUCCESS;
-  cl_context QueueContext = NULL;
-  status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
+	cl_int status = CL_SUCCESS;
+	cl_device_id Device = NULL;
+	status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
+	OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
 
-  OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
+    cl_context QueueContext = NULL;
+    status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
+    OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
 
-  OPENCL_V( fftRepo.setProgramCode( Copy, params, programCode, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
-  OPENCL_V( fftRepo.setProgramEntryPoints( Copy, params, "copy_c2h", "copy_h2c", QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+  OPENCL_V( fftRepo.setProgramCode( Copy, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+
+  if(general)
+  {
+  OPENCL_V( fftRepo.setProgramEntryPoints( Copy, params, "copy_general", "copy_general", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+  }
+  else
+  {
+  OPENCL_V( fftRepo.setProgramEntryPoints( Copy, params, "copy_c2h", "copy_h2c", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+  }
 
   return CLFFT_SUCCESS;
 }
diff --git a/src/library/generator.h b/src/library/generator.h
index 2aac983..590f4a6 100644
--- a/src/library/generator.h
+++ b/src/library/generator.h
@@ -22,10 +22,11 @@
 //	Enum to help provide descriptive names to array indices, when indexing into our various vectors
 enum clfftGenerators
 {
-	Stockham, // Using the Stockham autosort frameworks
-	Transpose,
-	Copy,
-	ENDGENERATORS			///< This value will always be last, and marks the length of clfftGenerators
+    Stockham, // Using the Stockham autosort frameworks
+    Transpose_VLIW,
+    Transpose_GCN,
+    Copy,
+    ENDGENERATORS			///< This value will always be last, and marks the length of clfftGenerators
 };
 
 #endif
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index f6f7241..08f173c 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -272,7 +272,6 @@ namespace StockhamGenerator
 
 					//  Length, WorkGroupSize, NumTransforms, NumPasses,  Radices
 					{     1024,           128,             1,         4,     8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 },
-					//{      128,            64,             1,         7,     2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0 },
 					{      128,            64,             4,         3,     8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
 					{        8,            64,            16,         3,     2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
 
@@ -334,21 +333,17 @@ namespace StockhamGenerator
 		size_t baseRadixSize = sizeof(baseRadix)/sizeof(baseRadix[0]);
 
 		size_t l = length;
-		std::map<size_t, size_t> primeFactors;
 		std::map<size_t, size_t> primeFactorsExpanded;
 		for(size_t r=0; r<baseRadixSize; r++)
 		{
 			size_t rad = baseRadix[r];
-			size_t p = 0;
 			size_t e = 1;
 			while(!(l%rad))
 			{
 				l /= rad;
 				e *= rad;
-				p++;
 			}
 
-			primeFactors[rad] = p;
 			primeFactorsExpanded[rad] = e;
 		}
 
@@ -515,117 +510,6 @@ namespace StockhamGenerator
     };
 
 
-	// Twiddle factors table for large N
-	// used in 3-step algorithm
-    class TwiddleTableLarge
-    {
-        size_t N; // length
-		size_t X, Y;
-		size_t tableSize;
-		double *wc, *ws; // cosine, sine arrays
-
-	public:
-		TwiddleTableLarge(size_t length) : N(length)
-		{
-			X = size_t(1) << ARBITRARY::TWIDDLE_DEE;
-			Y = DivRoundingUp<size_t> (CeilPo2(N), ARBITRARY::TWIDDLE_DEE);
-			tableSize = X * Y;
-
-			// Allocate memory for the tables
-			wc = new double[tableSize];
-			ws = new double[tableSize];
-		}
-
-		~TwiddleTableLarge()
-		{
-			// Free
-			delete[] wc;
-			delete[] ws;
-		}
-
-		template <Precision PR>
-		void GenerateTwiddleTable(std::string &twStr)
-		{
-			const double TWO_PI = -6.283185307179586476925286766559;
-
-			// Generate the table
-			size_t nt = 0;
-			double phi = TWO_PI / double (N);
-			for (size_t iY = 0; iY < Y; ++iY)
-			{
-				size_t i = size_t(1) << (iY * ARBITRARY::TWIDDLE_DEE);
-				for (size_t iX = 0; iX < X; ++iX)
-				{
-					size_t j = i * iX;
-
-					double c = cos(phi * (double)j);
-					double s = sin(phi * (double)j);
-
-					//if (fabs(c) < 1.0E-12)	c = 0.0;
-					//if (fabs(s) < 1.0E-12)	s = 0.0;
-
-					wc[nt]   = c;
-					ws[nt++] = s;
-				}
-			}
-
-			std::string sfx = FloatSuffix<PR>();
-
-			// Stringize the table
-			std::stringstream ss;
-			nt = 0;
-
-			ss << "\n __constant ";
-			ss << RegBaseType<PR>(2);
-			ss << " " << TwTableLargeName();
-			ss << "[" << Y << "][" << X << "] = {\n";
-			for (size_t iY = 0; iY < Y; ++iY)
-			{
-				ss << "{ ";
-				for (size_t iX = 0; iX < X; ++iX)
-				{
-					char cv[64], sv[64];
-					sprintf(cv, "%036.34lf", wc[nt]);
-					sprintf(sv, "%036.34lf", ws[nt++]);
-					ss << "("; ss << RegBaseType<PR>(2); ss << ")(";
-					ss << cv; ss << sfx; ss << ", ";
-					ss << sv; ss << sfx; ss << ")";
-					ss << ", ";
-				}
-				ss << " },\n";
-			}
-			ss << "};\n\n";
-
-
-			// Twiddle calc function
-			ss << "__attribute__((always_inline)) ";
-			ss << RegBaseType<PR>(2);
-			ss << "\n" << TwTableLargeFunc() << "(uint u)\n{\n";
-
-			ss << "\t" "uint j = u & " << unsigned(X-1) << ";\n";
-			ss << "\t" ; ss << RegBaseType<PR>(2); ss << " result = ";
-			ss << TwTableLargeName();
-			ss << "[0][j];\n";
-
-			for (size_t iY = 1; iY < Y; ++iY)
-			{
-				std::string phasor = TwTableLargeName();
-				phasor += "[";
-				phasor += SztToStr(iY);
-				phasor += "][j]";
-
-				stringpair product = ComplexMul((RegBaseType<PR>(2)).c_str(), "result", phasor.c_str());
-
-				ss << "\t" "u >>= " << unsigned (ARBITRARY::TWIDDLE_DEE) << ";\n";
-				ss << "\t" "j = u & " << unsigned(X-1) << ";\n";
-				ss << "\t" "result = " << product.first << "\n";
-				ss << "\t" "\t" << product.second <<";\n";
-			}
-			ss << "\t" "return result;\n}\n\n";
-
-			twStr += ss.str();
-		}
-    };
 
     // A pass inside an FFT kernel
     template <Precision PR>
@@ -654,8 +538,9 @@ namespace StockhamGenerator
 		bool rcFull;
 		bool rcSimple;
 
-		bool enableGrouping;
-		bool linearRegs;
+		bool enableGrouping;				
+		bool linearRegs;					// scalar registers (non-vectorized registers) to be used
+		bool halfLds;						// only half the LDS of a complex length need to be used
 		Pass<PR> *nextPass;
 
 		inline void RegBase(size_t regC, std::string &str) const
@@ -767,7 +652,7 @@ namespace StockhamGenerator
 		// SweepRegs is to iterate through the registers to do the three basic operations:
 		// reading, twiddle multiplication, writing
 		void SweepRegs(	size_t flag, bool fwd, bool interleaved, size_t stride, size_t component,
-						double scale,
+						double scale, bool frontTwiddle,
 						const std::string &bufferRe, const std::string &bufferIm, const std::string &offset,
 						size_t regC, size_t numB, size_t numPrev, std::string &passStr) const
 		{
@@ -982,10 +867,23 @@ namespace StockhamGenerator
 							{
 								passStr += "\n\t{\n\t\t"; passStr += twType; passStr += " W = ";
 								passStr += tw3StepFunc; passStr += "( ";
-								passStr += "(("; passStr += SztToStr(numButterfly); passStr += "*me + ";
-								passStr += SztToStr(butterflyIndex);
-								passStr += ")%"; passStr += SztToStr(algLS); passStr += " + ";
-								passStr += SztToStr(r*algLS); passStr += ") * b "; passStr += ");\n\t\t";
+
+								if(frontTwiddle)
+								{
+									assert(linearRegs);
+									passStr += "("; passStr += "me*"; passStr += SztToStr(numButterfly);
+									passStr += " + "; passStr += SztToStr(i); passStr += " + ";
+									passStr += SztToStr(r*length/radix); passStr += ") * b";
+								}
+								else
+								{
+									passStr += "(("; passStr += SztToStr(numButterfly); passStr += "*me + ";
+									passStr += SztToStr(butterflyIndex);
+									passStr += ")%"; passStr += SztToStr(algLS); passStr += " + ";
+									passStr += SztToStr(r*algLS); passStr += ") * b";
+								}
+
+								passStr += " );\n\t\t";
 							}
 
 							passStr += rType; passStr += " TR, TI;\n\t\t";
@@ -1454,9 +1352,9 @@ namespace StockhamGenerator
 
     public:
 		Pass(	size_t positionVal, size_t lengthVal, size_t radixVal, size_t cnPerWIVal,
-				size_t L, size_t LS, size_t R, bool linearRegsVal, bool r2cVal, bool c2rVal, bool rcFullVal, bool rcSimpleVal) :
+				size_t L, size_t LS, size_t R, bool linearRegsVal, bool halfLdsVal, bool r2cVal, bool c2rVal, bool rcFullVal, bool rcSimpleVal) :
 			position(positionVal), length(lengthVal), radix(radixVal), cnPerWI(cnPerWIVal),
-			algL(L), algLS(LS), algR(R), linearRegs(linearRegsVal),
+			algL(L), algLS(LS), algR(R), linearRegs(linearRegsVal), halfLds(halfLdsVal),
 			r2c(r2cVal), c2r(c2rVal), rcFull(rcFullVal), rcSimple(rcSimpleVal),
 			enableGrouping(true),
 			numB1(0), numB2(0), numB4(0),
@@ -1488,6 +1386,10 @@ namespace StockhamGenerator
 
 				assert(numButterfly == (numB4*4 + numB2*2 + numB1));
 			}
+
+			// if only half LDS can be used, we need the passes to share registers
+			// and hence they need to be linear registers
+			if(halfLds) assert(linearRegs);
 		}
 
 		size_t GetNumB1() const { return numB1; }
@@ -1543,13 +1445,10 @@ namespace StockhamGenerator
 			passStr += "(";
 			passStr += "uint rw, uint b, uint me, uint inOffset, uint outOffset, ";
 
-			// For now, interleaved support is there for only global buffers
-			// TODO : add support for LDS interleaved
-			if(inInterleaved)  assert(gIn);
-			if(outInterleaved) assert(gOut);
-
 			if(r2c || c2r)
 			{
+				assert(halfLds);
+
 				if(gIn)
 				{
 					if(inInterleaved)
@@ -1618,8 +1517,15 @@ namespace StockhamGenerator
 				}
 				else
 				{
-					passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferInRe; passStr += ", ";
-					passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferInIm; passStr += ", ";
+					if(inInterleaved)
+					{
+						passStr += "__local "; passStr += regB2Type; passStr += " *"; passStr += bufferInRe;  passStr += ", ";
+					}
+					else
+					{
+						passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferInRe; passStr += ", ";
+						passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferInIm; passStr += ", ";
+					}
 				}
 
 
@@ -1637,8 +1543,15 @@ namespace StockhamGenerator
 				}
 				else
 				{
-					passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferOutRe; passStr += ", ";
-					passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferOutIm;
+					if(outInterleaved)
+					{
+						passStr += "__local "; passStr += regB2Type; passStr += " *"; passStr += bufferOutRe;
+					}
+					else
+					{
+						passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferOutRe; passStr += ", ";
+						passStr += "__local "; passStr += regB1Type; passStr += " *"; passStr += bufferOutIm;
+					}
 				}
 			}
 
@@ -1690,7 +1603,7 @@ namespace StockhamGenerator
 				if(position == 0)
 				{
 					passStr += "\n\tif(rw)\n\t{";
-					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
+					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
 					passStr += "\n\t}\n";
 
 					if(rcSimple)
@@ -1702,7 +1615,7 @@ namespace StockhamGenerator
 					else
 					{
 						passStr += "\n\tif(rw > 1)\n\t{";
-						SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, bufferInRe2, bufferInIm2, "inOffset", 1, numB1, 0, passStr);
+						SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, bufferInRe2, bufferInIm2, "inOffset", 1, numB1, 0, passStr);
 						passStr += "\n\t}\n";
 
 						passStr += "\telse\n\t{";
@@ -1773,7 +1686,7 @@ namespace StockhamGenerator
 					}
 
 					passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
-					SweepRegs(SR_READ, fwd, outInterleaved, processBufStride, SR_COMP_REAL, 1.0f, processBufRe, processBufIm, processBufOffset, 1, numB1, 0, passStr);
+					SweepRegs(SR_READ, fwd, outInterleaved, processBufStride, SR_COMP_REAL, 1.0f, false, processBufRe, processBufIm, processBufOffset, 1, numB1, 0, passStr);
 					passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
 
 
@@ -1823,31 +1736,49 @@ namespace StockhamGenerator
 					}
 
 					passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
-					SweepRegs(SR_READ, fwd, outInterleaved, processBufStride, SR_COMP_IMAG, 1.0f, processBufRe, processBufIm, processBufOffset, 1, numB1, 0, passStr);
+					SweepRegs(SR_READ, fwd, outInterleaved, processBufStride, SR_COMP_IMAG, 1.0f, false, processBufRe, processBufIm, processBufOffset, 1, numB1, 0, passStr);
 					passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
 				}
 			}
 			else
 			{
-				if( (!linearRegs) || (linearRegs && (position == 0)) )
+				if( (!halfLds) || (halfLds && (position == 0)) )
 				{
 					passStr += "\n\tif(rw)\n\t{";
-					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
-					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "inOffset", 2, numB2, numB1, passStr);
-					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "inOffset", 4, numB4, 2*numB2 + numB1, passStr);
+					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
+					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 2, numB2, numB1, passStr);
+					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 4, numB4, 2*numB2 + numB1, passStr);
 					passStr += "\n\t}\n";
 				}
 			}
 
+			passStr += "\n";
+
+			// 3-step twiddle multiplies done in the front
+			bool tw3Done = false;
+			if(fft_3StepTwiddle && (position == 0))
+			{
+				tw3Done = true;
+				if(linearRegs)
+				{
+					SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, true, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+				}
+				else
+				{
+					SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, true, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+					SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, true, bufferInRe, bufferInIm, "", 2, numB2, numB1, passStr);
+					SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, true, bufferInRe, bufferInIm, "", 4, numB4, 2*numB2 + numB1, passStr);
+				}
+			}
 
 			passStr += "\n";
 
 			// Twiddle multiply
 			if( (position > 0) && (radix > 1) )
 			{
-				SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
-				SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 2, numB2, numB1, passStr);
-				SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 4, numB4, 2*numB2 + numB1, passStr);
+				SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+				SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 2, numB2, numB1, passStr);
+				SweepRegs(SR_TWMUL, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 4, numB4, 2*numB2 + numB1, passStr);
 			}
 
 			// Butterfly calls
@@ -1858,31 +1789,30 @@ namespace StockhamGenerator
 				if(numB4) CallButterfly(ButterflyName(radix, 4, fwd), 4, numB4, passStr);
 			}
 
-			passStr += "\n";
 
 			if( (position != 0) && (!linearRegs) && (nextPass != NULL) )
 				passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
 
-			passStr += "\n";
+			passStr += "\n\n";
 
 			// 3-step twiddle multiplies
-			if(fft_3StepTwiddle)
+			if(fft_3StepTwiddle && !tw3Done)
 			{
 				assert(nextPass == NULL);
 				if(linearRegs)
 				{
-					SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+					SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
 				}
 				else
 				{
-					SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
-					SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 2, numB2, numB1, passStr);
-					SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, bufferInRe, bufferInIm, "", 4, numB4, 2*numB2 + numB1, passStr);
+					SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 1, numB1, 0, passStr);
+					SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 2, numB2, numB1, passStr);
+					SweepRegs(SR_TWMUL_3STEP, fwd, false, 1, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "", 4, numB4, 2*numB2 + numB1, passStr);
 				}
 			}
 
 			// Write back from registers
-			if(linearRegs)
+			if(halfLds)
 			{
 				// In this case, we have to write & again read back for the next pass since we are
 				// using only half the lds. Number of barriers will increase at the cost of halving the lds.
@@ -1893,7 +1823,7 @@ namespace StockhamGenerator
 					{
 						if(!singlePass)
 						{
-							SweepRegs(SR_WRITE, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
+							SweepRegs(SR_WRITE, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
 							passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
 							SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_REAL, 1.0f, false, false, false, bufferInRe, bufferInIm, "inOffset", passStr);
 							if(oddp)
@@ -1919,7 +1849,7 @@ namespace StockhamGenerator
 							passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
 
 
-							SweepRegs(SR_WRITE, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
+							SweepRegs(SR_WRITE, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
 							passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
 							SweepRegsRC(SR_READ, fwd, inInterleaved, inStride, SR_COMP_IMAG, 1.0f, false, false, false, bufferInRe, bufferInIm, "inOffset", passStr);
 							if(oddp)
@@ -1972,39 +1902,39 @@ namespace StockhamGenerator
 					else if(c2r)
 					{
 						passStr += "\n\tif(rw)\n\t{";
-						SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+						SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
 						passStr += "\n\t}\n";
 
 						if(!rcSimple)
 						{
 							passStr += "\n\tif(rw > 1)\n\t{";
-							SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, bufferOutRe2, bufferOutIm2, "outOffset", 1, numB1, 0, passStr);
+							SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, false, bufferOutRe2, bufferOutIm2, "outOffset", 1, numB1, 0, passStr);
 							passStr += "\n\t}\n";
 						}
 					}
 					else
 					{
 						passStr += "\n\tif(rw)\n\t{";
-						SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+						SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
 						passStr += "\n\t}\n";
 					}
 				}
 				else
 				{
 					passStr += "\n\tif(rw)\n\t{";
-					SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+					SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
 					passStr += "\n\t}\n";
 					passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
 					passStr += "\n\tif(rw)\n\t{";
-					nextPass->SweepRegs(SR_READ, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, bufferOutRe, bufferOutIm, "outOffset", 1, nextPass->GetNumB1(), 0, passStr);
+					nextPass->SweepRegs(SR_READ, fwd, outInterleaved, outStride, SR_COMP_REAL, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, nextPass->GetNumB1(), 0, passStr);
 					passStr += "\n\t}\n";
 					passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
 					passStr += "\n\tif(rw)\n\t{";
-					SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+					SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
 					passStr += "\n\t}\n";
 					passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
 					passStr += "\n\tif(rw)\n\t{";
-					nextPass->SweepRegs(SR_READ, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, bufferOutRe, bufferOutIm, "outOffset", 1, nextPass->GetNumB1(), 0, passStr);
+					nextPass->SweepRegs(SR_READ, fwd, outInterleaved, outStride, SR_COMP_IMAG, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, nextPass->GetNumB1(), 0, passStr);
 					passStr += "\n\t}\n";
 					passStr += "\n\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
 				}
@@ -2012,9 +1942,9 @@ namespace StockhamGenerator
 			else
 			{
 				passStr += "\n\tif(rw)\n\t{";
-				SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
-				SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, bufferOutRe, bufferOutIm, "outOffset", 2, numB2, numB1, passStr);
-				SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, bufferOutRe, bufferOutIm, "outOffset", 4, numB4, 2*numB2 + numB1, passStr);
+				SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, bufferOutRe, bufferOutIm, "outOffset", 1, numB1, 0, passStr);
+				SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, bufferOutRe, bufferOutIm, "outOffset", 2, numB2, numB1, passStr);
+				SweepRegs(SR_WRITE, fwd, outInterleaved, outStride, SR_COMP_BOTH, scale, false, bufferOutRe, bufferOutIm, "outOffset", 4, numB4, 2*numB2 + numB1, passStr);
 				passStr += "\n\t}\n";
 			}
 
@@ -2041,6 +1971,8 @@ namespace StockhamGenerator
 												// for passing intermediate data between the passes, if this is set
 												// then each pass-function should accept same set of registers
 
+		bool linearRegs;						// scalar registers
+
 		// Future optimization ideas
 		// bool limitRegs;							// TODO: Incrementally write to LDS, thereby using same set of registers for more than 1 butterflies
 		// bool combineReadTwMul;					// TODO: Combine reading into registers and Twiddle multiply
@@ -2050,6 +1982,11 @@ namespace StockhamGenerator
 		bool rcFull;
 		bool rcSimple;
 
+		bool blockCompute;						// When we have to compute FFT in blocks (either read or write is along columns)
+		BlockComputeType blockComputeType;
+		size_t blockWidth, blockWGS, blockLDS;
+
+
 		const FFTKernelGenKeyParams params;		// key params
 
 
@@ -2057,7 +1994,7 @@ namespace StockhamGenerator
 		{
 			std::string str = "";
 
-			if(halfLds)
+			if(linearRegs)
 			{
 				if(initComma) str += ", ";
 
@@ -2099,6 +2036,38 @@ namespace StockhamGenerator
 			return possible;
 		}
 
+		inline std::string OffsetCalcBlock(const std::string &off, bool input = true)
+		{
+			std::string str;
+
+			const size_t *pStride = input ? params.fft_inStride : params.fft_outStride;
+
+			str += "\t"; str += off; str += " = ";
+			std::string nextBatch = "batch";
+			for(size_t i=(params.fft_DataDim - 1); i>2; i--)
+			{
+				size_t currentLength = 1;
+				for(int j=2; j<i; j++) currentLength *= params.fft_N[j];
+				currentLength *= (params.fft_N[1]/blockWidth);
+
+				str += "("; str += nextBatch; str += "/"; str += SztToStr(currentLength);
+				str += ")*"; str += SztToStr(pStride[i]); str += " + ";
+
+				nextBatch = "(" + nextBatch + "%" + SztToStr(currentLength) + ")";
+			}
+
+			str += "("; str += nextBatch; str += "/"; str += SztToStr(params.fft_N[1]/blockWidth);
+			str += ")*"; str += SztToStr(pStride[2]); str += " + ("; str += nextBatch;
+			str += "%"; str += SztToStr(params.fft_N[1]/blockWidth); str += ")*";
+			if( (input && (blockComputeType == BCT_R2C)) || (!input && (blockComputeType == BCT_C2R)) )
+				str += SztToStr(blockWidth*length);
+			else
+				str += SztToStr(blockWidth);
+			str += ";\n";
+
+			return str;
+		}
+
 		inline std::string OffsetCalc(const std::string &off, bool input = true, bool rc_second_index = false)
 		{
 			std::string str;
@@ -2122,76 +2091,27 @@ namespace StockhamGenerator
 										batch += " + (me/"; batch += SztToStr(workGroupSizePerTrans); batch += "))"; }
 			}
 
-			switch(params.fft_DataDim)
+			str += "\t"; str += off; str += " = ";
+			std::string nextBatch = batch;
+			for(size_t i=(params.fft_DataDim - 1); i>1; i--)
 			{
-			case 5:
-				{
-					str += "\t{\n\tuint ocalc1 = ";
-					str += batch; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2] * params.fft_N[3]);
-					str += ";\n";
-
-					str += "\tuint ocalc0 = ";
-					str += "ocalc1"; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2]);
-					str += ";\n";
+				size_t currentLength = 1;
+				for(int j=1; j<i; j++) currentLength *= params.fft_N[j];
 
-					str += "\t"; str += off; str += " = ";
-					str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2] * params.fft_N[3]);
-					str += ")*"; str += SztToStr(pStride[4]); str += " + ";
+				str += "("; str += nextBatch; str += "/"; str += SztToStr(currentLength);
+				str += ")*"; str += SztToStr(pStride[i]); str += " + ";
 
-					str += "(ocalc1"; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2]); str += ")*";
-					str += SztToStr(pStride[3]); str += " + ";
-
-					str += "(ocalc0"; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
-					str += SztToStr(pStride[2]); str += " + ";
-					str += "(ocalc0"; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
-					str += SztToStr(pStride[1]); str += ";\n";
-
-					str += "\t}\n";
-				}
-				break;
-			case 4:
-				{
-					str += "\t{\n\tuint ocalc0 = ";
-					str += batch; str += "%"; str += SztToStr(params.fft_N[1] * params.fft_N[2]);
-					str += ";\n";
-
-					str += "\t"; str += off; str += " = ";
-					str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1] * params.fft_N[2]); str += ")*";
-					str += SztToStr(pStride[3]); str += " + ";
-
-					str += "(ocalc0"; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
-					str += SztToStr(pStride[2]); str += " + ";
-					str += "(ocalc0"; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
-					str += SztToStr(pStride[1]); str += ";\n";
-
-					str += "\t}\n";
-				}
-				break;
-			case 3:
-				{
-					str += "\t"; str += off; str += " = ";
-					str += "("; str += batch; str += "/"; str += SztToStr(params.fft_N[1]); str += ")*";
-					str += SztToStr(pStride[2]); str += " + ";
-					str += "("; str += batch; str += "%"; str += SztToStr(params.fft_N[1]); str += ")*";
-					str += SztToStr(pStride[1]); str += ";\n";
-				}
-				break;
-			case 2:
-				{
-					str += "\t"; str += off; str += " = ";
-					str += batch; str += "*"; str += SztToStr(pStride[1]); str += ";\n";
-				}
-				break;
-			default:
-				assert(false);
+				nextBatch = "(" + nextBatch + "%" + SztToStr(currentLength) + ")";
 			}
 
+			str += nextBatch; str += "*"; str += SztToStr(pStride[1]); str += ";\n";
+
 			return str;
 		}
 
     public:
         Kernel( const FFTKernelGenKeyParams &paramsVal) :
-					params(paramsVal), r2c2r(false)
+			params(paramsVal), r2c2r(false)
 
         {
 			length = params.fft_N[0];
@@ -2222,12 +2142,24 @@ namespace StockhamGenerator
 			halfLds = ( (params.fft_inputLayout == CLFFT_COMPLEX_INTERLEAVED) &&
 						(params.fft_outputLayout == CLFFT_COMPLEX_INTERLEAVED) ) ? true : false;
 			halfLds = halfLds ? ((length & (length-1)) ? false : true) : false;
-			//halfLds = false;
 
 			// Set half lds for real transforms
 			halfLds = r2c2r ? true : halfLds;
 
-			bool linearRegs = halfLds ? true : false;
+			linearRegs = halfLds;
+
+			blockCompute = params.blockCompute;
+			blockComputeType = params.blockComputeType;
+			// Make sure we can utilize all Lds if we are going to
+			// use blocked columns to compute FFTs
+			if(blockCompute)
+			{
+				assert(length <= 256);  // 256 parameter comes from prototype experiments
+										// largest length at which block column compute is possible given a 32KB LDS limit
+										// if the LDS limit is different, this number needs to be changed appropriately
+				halfLds = false;
+				linearRegs = true;
+			}
 
 			assert( ((length*numTrans)%workGroupSize) == 0 );
 			cnPerWI = (numTrans * length) / workGroupSize;
@@ -2258,7 +2190,7 @@ namespace StockhamGenerator
 					R /= rad;
 
 					radices.push_back(rad);
-					passes.push_back(Pass<PR>(i, length, rad, cnPerWI, L, LS, R, linearRegs, r2c, c2r, rcFull, rcSimple));
+					passes.push_back(Pass<PR>(i, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple));
 
 					LS *= rad;
 				}
@@ -2271,17 +2203,18 @@ namespace StockhamGenerator
 				size_t cRad[] = {10,8,6,5,4,3,2,1}; // Must be in descending order
 				size_t cRadSize = (sizeof(cRad)/sizeof(cRad[0]));
 
+				// Generate the radix and pass objects
 				while(true)
 				{
 					size_t rad;
 
 					assert(cRadSize >= 1);
+
+					// Picks the radices in descending order (biggest radix first)
 					for(size_t r=0; r<cRadSize; r++)
 					{
 						rad = cRad[r];
 
-						if( (rad == 16) && !linearRegs ) continue; // temporary - fix this !!!
-
 						if((rad > cnPerWI) || (cnPerWI%rad))
 							continue;
 
@@ -2295,7 +2228,7 @@ namespace StockhamGenerator
 					R /= rad;
 
 					radices.push_back(rad);
-					passes.push_back(Pass<PR>(pid, length, rad, cnPerWI, L, LS, R, linearRegs, r2c, c2r, rcFull, rcSimple));
+					passes.push_back(Pass<PR>(pid, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple));
 
 					pid++;
 					LS *= rad;
@@ -2346,8 +2279,72 @@ namespace StockhamGenerator
 				for(size_t i=0; i < (numPasses - 1); i++)
 					passes[i].SetNextPass(&passes[i+1]);
 
+
+			if(blockCompute)
+			{
+				blockWidth = BlockSizes::BlockWidth(length);
+				blockWGS = BlockSizes::BlockWorkGroupSize(length);
+				blockLDS = BlockSizes::BlockLdsSize(length);
+			}
+			else
+			{
+				blockWidth = blockWGS = blockLDS = 0;
+			}
 		}
 
+		class BlockSizes
+		{
+		public:
+			enum ValType
+			{
+				BS_VT_WGS,
+				BS_VT_BWD,
+				BS_VT_LDS,
+			};
+
+			static size_t BlockLdsSize(size_t N) { return GetValue(N, BS_VT_LDS); }
+			static size_t BlockWidth(size_t N) { return GetValue(N, BS_VT_BWD); }
+			static size_t BlockWorkGroupSize(size_t N) { return GetValue(N, BS_VT_WGS); }
+
+		private:
+
+			static size_t GetValue(size_t N, ValType vt)
+			{
+				size_t wgs; // preferred work group size
+				size_t bwd; // block width to be used
+				size_t lds; // LDS size to be used for the block
+
+
+				KernelCoreSpecs<PR> kcs;
+				size_t t_wgs, t_nt;
+				kcs.GetWGSAndNT(N, t_wgs, t_nt);
+
+				switch(N)
+				{
+				case 256:	bwd = 8/PrecisionWidth<PR>();   wgs = (bwd > t_nt) ? 256 : t_wgs; break;
+				case 128:	bwd = 8/PrecisionWidth<PR>();   wgs = (bwd > t_nt) ? 128 : t_wgs; break;
+				case 64:	bwd = 16/PrecisionWidth<PR>();  wgs = (bwd > t_nt) ? 128 : t_wgs; break;
+				case 32:	bwd = 32/PrecisionWidth<PR>();  wgs = (bwd > t_nt) ? 64  : t_wgs; break;
+				case 16:	bwd = 64/PrecisionWidth<PR>();  wgs = (bwd > t_nt) ? 64  : t_wgs; break;
+				case 8:		bwd = 128/PrecisionWidth<PR>(); wgs = (bwd > t_nt) ? 64  : t_wgs; break;
+				default:	assert(false);
+				}
+
+				// block width cannot be less than numTrans; math in other parts of the code depends on this assumption
+				assert(bwd >= t_nt);
+
+				lds = N*bwd;
+
+				switch(vt)
+				{
+				case BS_VT_WGS: return wgs;
+				case BS_VT_BWD: return bwd;
+				case BS_VT_LDS: return lds;
+				default: assert(false); return 0;
+				}
+			}
+		};
+
         void GenerateKernel(std::string &str, cl_device_id Dev_ID)
 		{
 			std::string twType = RegBaseType<PR>(2);
@@ -2361,6 +2358,11 @@ namespace StockhamGenerator
 			outInterleaved = (	(params.fft_outputLayout == CLFFT_COMPLEX_INTERLEAVED) ||
 								(params.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
 
+			// use interleaved LDS when the halfLds constraint is absent
+			bool ldsInterleaved = inInterleaved || outInterleaved;
+			ldsInterleaved = halfLds ? false : ldsInterleaved;
+			ldsInterleaved = blockCompute ? true : ldsInterleaved;
+
 			bool inReal;  // Input is real format
 			bool outReal; // Output is real format
 			inReal  = (params.fft_inputLayout == CLFFT_REAL) ? true : false;
@@ -2410,7 +2412,7 @@ namespace StockhamGenerator
 			str += "#define C3QB 0.86602540378443864676372317075294"; str += sfx; str += "\n";
 			str += "\n";
 
-			bool cReg = halfLds ? true : false;
+			bool cReg = linearRegs ? true : false;
 
 			// Generate butterflies for all unique radices
 			std::list<size_t> uradices;
@@ -2455,7 +2457,6 @@ namespace StockhamGenerator
 				}
 
 				double scale = fwd ? params.fft_fwdScale : params.fft_backScale;
-				bool tw3Step = false;
 
 				for(p = passes.begin(); p != passes.end(); p++)
 				{
@@ -2464,8 +2465,25 @@ namespace StockhamGenerator
 					bool gIn = false, gOut = false;
 					bool inIlvd = false, outIlvd = false;
 					bool inRl = false, outRl = false;
-					if(p == passes.begin())		{ inIlvd  = inInterleaved;  inRl  = inReal;  gIn  = true; ins  = params.fft_inStride[0];  }
-					if((p+1) == passes.end())	{ outIlvd = outInterleaved; outRl = outReal; gOut = true; outs = params.fft_outStride[0]; s = scale; tw3Step = params.fft_3StepTwiddle; }
+					bool tw3Step = false;
+
+
+					if(p == passes.begin() && params.fft_twiddleFront ) { tw3Step = params.fft_3StepTwiddle; }
+					if((p+1) == passes.end())	{ s = scale; if(!params.fft_twiddleFront) tw3Step = params.fft_3StepTwiddle; }
+
+					if(blockCompute && !r2c2r)
+					{
+						inIlvd = ldsInterleaved;
+						outIlvd = ldsInterleaved;
+					}
+					else
+					{
+						if(p == passes.begin())		{ inIlvd  = inInterleaved;  inRl  = inReal;  gIn  = true; ins  = params.fft_inStride[0];  }
+						if((p+1) == passes.end())	{ outIlvd = outInterleaved; outRl = outReal; gOut = true; outs = params.fft_outStride[0]; }
+
+						if(p != passes.begin())		{ inIlvd = ldsInterleaved; }
+						if((p+1) != passes.end())	{ outIlvd = ldsInterleaved; }
+					}
 
 					p->GeneratePass(fwd, str, tw3Step, inIlvd, outIlvd, inRl, outRl, ins, outs, s, gIn, gOut);
 				}
@@ -2475,6 +2493,8 @@ namespace StockhamGenerator
 					break;
 			}
 
+
+
 			// TODO : address this kludge
 			str += " typedef union  { uint u; int i; } cb_t;\n\n";
 
@@ -2494,7 +2514,9 @@ namespace StockhamGenerator
 				// FFT kernel begin
 				// Function attribute
 				str += "__kernel __attribute__((reqd_work_group_size (";
-				str += SztToStr(workGroupSize); str += ",1,1)))\nvoid ";
+				if(blockCompute)	str += SztToStr(blockWGS);
+				else				str += SztToStr(workGroupSize);
+				str += ",1,1)))\nvoid ";
 
 				// Function name
 				if(fwd) str += "fft_fwd";
@@ -2610,13 +2632,25 @@ namespace StockhamGenerator
 				str += "uint batch = get_group_id(0);";
 				str += "\n";
 
+
+
 				// Allocate LDS
-				size_t ldsSize = halfLds ? length*numTrans : 2*length*numTrans;
-				if(numPasses > 1)
+				if(blockCompute)
 				{
-					str += "\n\t";
-					str += "__local "; str += rType; str += " lds[";
-					str += SztToStr(ldsSize); str += "];\n";
+					str += "\n\t"; str += "__local "; str += r2Type; str += " lds[";
+					str += SztToStr(blockLDS); str += "];\n";
+				}
+				else
+				{
+					size_t ldsSize = halfLds ? length*numTrans : 2*length*numTrans;
+					ldsSize = ldsInterleaved ? ldsSize/2 : ldsSize;
+
+					if(numPasses > 1)
+					{
+						str += "\n\t";
+						str += "__local "; str += ldsInterleaved ? r2Type: rType; str += " lds[";
+						str += SztToStr(ldsSize); str += "];\n";
+					}
 				}
 
 				// Declare memory pointers
@@ -2715,7 +2749,7 @@ namespace StockhamGenerator
 				}
 
 				// Setup registers if needed
-				if(halfLds)
+				if(linearRegs)
 				{
 					str += "\t"; str += RegBaseType<PR>(2);
 					str += " "; str += IterRegs("", false);
@@ -2730,7 +2764,7 @@ namespace StockhamGenerator
 					totalBatch += SztToStr(params.fft_N[i+1]); totalBatch += " * ";
 					i++;
 				}
-				totalBatch += "cb["; totalBatch += SztToStr(i); totalBatch += "].u)";
+				totalBatch += "cb[0].u)";
 
 				// Conditional read-write ('rw') for arbitrary batch number
 				if(r2c2r && !rcSimple)
@@ -2742,16 +2776,20 @@ namespace StockhamGenerator
 				}
 				else
 				{
-					if(numTrans > 1)
+					if( (numTrans > 1) && !blockCompute )
 					{
 						str += "\tuint rw = (me < ("; str += totalBatch;
 						str += " - batch*"; str += SztToStr(numTrans); str += ")*";
 						str += SztToStr(workGroupSizePerTrans); str += ") ? 1 : 0;\n\n";
 					}
+					else
+					{
+						str += "\tuint rw = 1;\n\n";
+					}
 				}
 
 				// Transform index for 3-step twiddles
-				if(params.fft_3StepTwiddle)
+				if(params.fft_3StepTwiddle && !blockCompute)
 				{
 					if(numTrans == 1)
 					{
@@ -2831,7 +2869,10 @@ namespace StockhamGenerator
 				{
 					if(params.fft_placeness == CLFFT_INPLACE)
 					{
-						str += OffsetCalc("ioOffset", true);
+						if(blockCompute)
+							str += OffsetCalcBlock("ioOffset", true);
+						else
+							str += OffsetCalc("ioOffset", true);
 
 						str += "\t";
 						if(inInterleaved)
@@ -2846,8 +2887,16 @@ namespace StockhamGenerator
 					}
 					else
 					{
-						str += OffsetCalc("iOffset", true);
-						str += OffsetCalc("oOffset", false);
+						if(blockCompute)
+						{
+							str += OffsetCalcBlock("iOffset", true);
+							str += OffsetCalcBlock("oOffset", false);
+						}
+						else
+						{
+							str += OffsetCalc("iOffset", true);
+							str += OffsetCalc("oOffset", false);
+						}
 
 						str += "\t";
 						if(inInterleaved)
@@ -2872,6 +2921,55 @@ namespace StockhamGenerator
 					}
 				}
 
+
+				// Read data into LDS for blocked access
+				if(blockCompute)
+				{
+
+					size_t loopCount = (length * blockWidth)/blockWGS;
+					
+					str += "\n\tfor(uint t=0; t<"; str += SztToStr(loopCount);
+					str += "; t++)\n\t{\n";
+
+					for(size_t c=0; c<2; c++)
+					{
+						std::string comp = "";
+						std::string readBuf = (params.fft_placeness == CLFFT_INPLACE) ? "lwb" : "lwbIn";
+						if(!inInterleaved) comp = c ? ".y" : ".x";
+						if(!inInterleaved)
+							readBuf = (params.fft_placeness == CLFFT_INPLACE) ? (c ? "lwbIm" : "lwbRe") : (c ? "lwbInIm" : "lwbInRe");
+
+						if( (blockComputeType == BCT_C2C) || (blockComputeType == BCT_C2R) )
+						{
+							str += "\t\tR0"; str+= comp; str+= " = "; str += readBuf; str += "[(me%"; str+= SztToStr(blockWidth); str += ") + ";
+							str += "(me/"; str+= SztToStr(blockWidth); str+= ")*"; str += SztToStr(params.fft_inStride[0]);
+							str += " + t*"; str += SztToStr(params.fft_inStride[0]*blockWGS/blockWidth); str += "];\n";
+						}
+						else
+						{
+							str += "\t\tR0"; str+= comp; str+= " = "; str += readBuf; str += "[me + t*"; str += SztToStr(blockWGS); str += "];\n";
+						}
+
+
+						if(inInterleaved) break;
+					}
+
+					if( (blockComputeType == BCT_C2C) || (blockComputeType == BCT_C2R) )
+					{
+						str += "\t\tlds[t*"; str += SztToStr(blockWGS/blockWidth); str += " + ";
+						str += "(me%"; str+= SztToStr(blockWidth); str+= ")*"; str += SztToStr(length); str += " + ";
+						str += "(me/"; str+= SztToStr(blockWidth); str+= ")] = R0;"; str +="\n";
+					}
+					else
+					{
+						str += "\t\tlds[t*"; str += SztToStr(blockWGS); str += " + me] = R0;"; str +="\n";
+					}
+
+					str += "\t}\n\n";		
+					str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n\n";
+				}
+
+
 				// Set rw and 'me' per transform
 				// rw string also contains 'b'
 				std::string rw, me;
@@ -2882,6 +2980,8 @@ namespace StockhamGenerator
 				if(numTrans > 1)	{ me += "me%"; me += SztToStr(workGroupSizePerTrans); me += ", "; }
 				else				{ me += "me, "; }
 
+				if(blockCompute) { me = "me%"; me += SztToStr(workGroupSizePerTrans); me += ", "; }
+
 				// Buffer strings
 				std::string inBuf, outBuf;
 				if(r2c2r)
@@ -2917,6 +3017,24 @@ namespace StockhamGenerator
 					}
 				}
 
+
+				if(blockCompute)
+				{
+					str += "\n\tfor(uint t=0; t<"; str += SztToStr(blockWidth/(blockWGS/workGroupSizePerTrans));
+					str += "; t++)\n\t{\n\n";
+
+					inBuf = "lds, ";
+					outBuf = "lds";
+
+					if(params.fft_3StepTwiddle)
+					{
+						str += "\t\tb = (batch%"; str += SztToStr(params.fft_N[1]/blockWidth); str += ")*";
+						str += SztToStr(blockWidth); str += " + t*"; str += SztToStr(blockWGS/workGroupSizePerTrans);
+						str += " + (me/"; str += SztToStr(workGroupSizePerTrans); str += ");\n\n";
+					}
+				}
+
+
 				// Call passes
 				if(numPasses == 1)
 				{
@@ -2932,40 +3050,55 @@ namespace StockhamGenerator
 				{
 					for(typename std::vector<Pass<PR> >::const_iterator p = passes.begin(); p != passes.end(); p++)
 					{
+						std::string exTab = "";
+						if(blockCompute) exTab = "\t";
+
+						str += exTab;
 						str += "\t";
 						str += PassName(p->GetPosition(), fwd);
 						str += "(";
 
 						std::string ldsOff;
-						if(numTrans > 1)
+						if(blockCompute)
 						{
-							ldsOff += "(me/"; ldsOff += SztToStr(workGroupSizePerTrans);
-							ldsOff += ")*"; ldsOff += SztToStr(length);
+							ldsOff += "t*"; ldsOff += SztToStr(length*(blockWGS/workGroupSizePerTrans)); ldsOff += " + (me/";
+							ldsOff += SztToStr(workGroupSizePerTrans); ldsOff += ")*"; ldsOff += SztToStr(length);
 						}
 						else
 						{
-							ldsOff += "0";
+							if(numTrans > 1)
+							{
+								ldsOff += "(me/"; ldsOff += SztToStr(workGroupSizePerTrans);
+								ldsOff += ")*"; ldsOff += SztToStr(length);
+							}
+							else
+							{
+								ldsOff += "0";
+							}
 						}
 
 						std::string ldsArgs;
 						if(halfLds) { ldsArgs += "lds, lds"; }
-						else		{ ldsArgs += "lds, lds + "; ldsArgs += SztToStr(length*numTrans); }
+						else		{	if(ldsInterleaved) { ldsArgs += "lds"; }
+										else { ldsArgs += "lds, lds + "; ldsArgs += SztToStr(length*numTrans); } }
 
 						str += rw; str += me;
 						if(p == passes.begin()) // beginning pass
 						{
-							str += "0, ";
+							str += blockCompute ? ldsOff : "0";
+							str += ", ";
 							str += ldsOff;
 							str += ", ";
 							str += inBuf;
 							str += ldsArgs; str += IterRegs("&"); str += ");\n";
-							if(!halfLds) str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+							if(!halfLds) { str += exTab; str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n"; }
 						}
 						else if((p+1) == passes.end()) // ending pass
 						{
 							str += ldsOff;
 							str += ", ";
-							str += "0, ";
+							str += blockCompute ? ldsOff : "0";
+							str += ", ";
 							str += ldsArgs; str += ", ";
 							str += outBuf;
 							str += IterRegs("&"); str += ");\n";
@@ -2978,11 +3111,66 @@ namespace StockhamGenerator
 							str += ", ";
 							str += ldsArgs; str += ", ";
 							str += ldsArgs; str += IterRegs("&"); str += ");\n";
-							if(!halfLds) str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n";
+							if(!halfLds) { str += exTab; str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n"; }
 						}
 					}
 				}
 
+
+				if(blockCompute)
+				{
+					str += "\n\t}\n\n";
+				}
+
+
+				// Write data from LDS for blocked access
+				if(blockCompute)
+				{
+
+					size_t loopCount = (length * blockWidth)/blockWGS;
+					
+					str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n\n";
+					str += "\n\tfor(uint t=0; t<"; str += SztToStr(loopCount);
+					str += "; t++)\n\t{\n";
+
+					if( (blockComputeType == BCT_C2C) || (blockComputeType == BCT_R2C) )
+					{
+						str += "\t\tR0 = lds[t*"; str += SztToStr(blockWGS/blockWidth); str += " + ";
+						str += "(me%"; str+= SztToStr(blockWidth); str+= ")*"; str += SztToStr(length); str += " + ";
+						str += "(me/"; str+= SztToStr(blockWidth); str+= ")];"; str +="\n";
+					}
+					else
+					{
+						str += "\t\tR0 = lds[t*"; str += SztToStr(blockWGS); str += " + me];"; str +="\n";
+					}
+
+					for(size_t c=0; c<2; c++)
+					{
+						std::string comp = "";
+						std::string writeBuf = (params.fft_placeness == CLFFT_INPLACE) ? "lwb" : "lwbOut";
+						if(!outInterleaved) comp = c ? ".y" : ".x";
+						if(!outInterleaved)
+							writeBuf = (params.fft_placeness == CLFFT_INPLACE) ? (c ? "lwbIm" : "lwbRe") : (c ? "lwbOutIm" : "lwbOutRe");
+
+						if( (blockComputeType == BCT_C2C) || (blockComputeType == BCT_R2C) )
+						{
+							str += "\t\t"; str += writeBuf; str += "[(me%"; str+= SztToStr(blockWidth); str += ") + ";
+							str += "(me/"; str+= SztToStr(blockWidth); str+= ")*"; str += SztToStr(params.fft_outStride[0]);
+							str += " + t*"; str += SztToStr(params.fft_outStride[0]*blockWGS/blockWidth); str += "] = R0"; str+= comp; str += ";\n";
+						}
+						else
+						{
+							str += "\t\t"; str += writeBuf; str += "[me + t*"; str += SztToStr(blockWGS); str += "] = R0"; str+= comp; str += ";\n";
+						}
+
+						if(outInterleaved) break;
+					}
+					
+					str += "\t}\n\n";		
+				}
+				
+
+
 				str += "}\n\n";
 
 				if(r2c2r)
@@ -3011,6 +3199,10 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Stockham> (FFTKernelGenKeyParams & param
     params.fft_inputLayout  = this->inputLayout;
 	params.fft_MaxWorkGroupSize = this->envelope.limit_WorkGroupSize;
 
+    ARG_CHECK(this->length.size()    > 0);
+	ARG_CHECK(this->inStride.size()  > 0);
+    ARG_CHECK(this->outStride.size() > 0);
+
     ARG_CHECK (this->inStride.size() == this->outStride.size())
 
 	bool real_transform = ((this->inputLayout == CLFFT_REAL) || (this->outputLayout == CLFFT_REAL));
@@ -3029,93 +3221,26 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Stockham> (FFTKernelGenKeyParams & param
         params.fft_outputLayout = this->outputLayout;
     }
 
-    switch (this->inStride.size()) {
-        //    1-D array is a 2-D data structure.
-        //    1-D unit is a special case of 1-D array.
-    case 1:
-        ARG_CHECK(this->length   .size() > 0);
-        ARG_CHECK(this->outStride.size() > 0);
-        params.fft_DataDim      = 2;
-        params.fft_N[0]         = this->length[0];
-        params.fft_inStride[0]  = this->inStride[0];
-        params.fft_inStride[1]  = this->iDist;
-        params.fft_outStride[0] = this->outStride[0];
-        params.fft_outStride[1] = this->oDist;
-        break;
-
-        //    2-D array is a 3-D data structure
-        //    2-D unit is a speical case of 2-D array.
-    case 2:
-        ARG_CHECK(this->length   .size() > 1);
-        ARG_CHECK(this->outStride.size() > 1);
-        params.fft_DataDim      = 3;
-        params.fft_N[0]         = this->length[0];
-        params.fft_N[1]         = this->length[1];
-        params.fft_inStride[0]  = this->inStride[0];
-        params.fft_inStride[1]  = this->inStride[1];
-        params.fft_inStride[2]  = this->iDist;
-        params.fft_outStride[0] = this->outStride[0];
-        params.fft_outStride[1] = this->outStride[1];
-        params.fft_outStride[2] = this->oDist;
-        break;
-
-        //    3-D array is a 4-D data structure
-        //    3-D unit is a special case of 3-D array.
-    case 3:
-        ARG_CHECK(this->length   .size() > 2);
-        ARG_CHECK(this->outStride.size() > 2);
-        params.fft_DataDim      = 4;
-        params.fft_N[0]         = this->length[0];
-        params.fft_N[1]         = this->length[1];
-        params.fft_N[2]         = this->length[2];
-        params.fft_inStride[0]  = this->inStride[0];
-        params.fft_inStride[1]  = this->inStride[1];
-        params.fft_inStride[2]  = this->inStride[2];
-        params.fft_inStride[3]  = this->iDist;
-        params.fft_outStride[0] = this->outStride[0];
-        params.fft_outStride[1] = this->outStride[1];
-        params.fft_outStride[2] = this->outStride[2];
-        params.fft_outStride[3] = this->oDist;
-        break;
-
-        //    5-D data structure
-        //    This can occur when a large dimension is split into two for
-        //    the "3-step" algorithm.
-        //
-    case 4:
-        ARG_CHECK(this->length   .size() > 3);
-        ARG_CHECK(this->outStride.size() > 3);
-        params.fft_DataDim      = 5;
-        params.fft_N[0]         = this->length[0];
-        params.fft_N[1]         = this->length[1];
-        params.fft_N[2]         = this->length[2];
-        params.fft_N[3]         = this->length[3];
-        params.fft_inStride[0]  = this->inStride[0];
-        params.fft_inStride[1]  = this->inStride[1];
-        params.fft_inStride[2]  = this->inStride[2];
-        params.fft_inStride[3]  = this->inStride[3];
-        params.fft_inStride[4]  = this->iDist;
-        params.fft_outStride[0] = this->outStride[0];
-        params.fft_outStride[1] = this->outStride[1];
-        params.fft_outStride[2] = this->outStride[2];
-        params.fft_outStride[3] = this->outStride[3];
-        params.fft_outStride[4] = this->oDist;
-        break;
-    default:
-        ARG_CHECK (false);
-    }
+	params.fft_DataDim = this->length.size() + 1;
+	int i = 0;
+	for(i = 0; i < (params.fft_DataDim - 1); i++)
+	{
+        params.fft_N[i]         = this->length[i];
+        params.fft_inStride[i]  = this->inStride[i];
+        params.fft_outStride[i] = this->outStride[i];
 
-    //    TODO:  we could simplify the address calculations in the kernel
-    //    when the input data is contiguous.
-    //    For example, a 3-D data structure with
-    //        lengths: [*, 64, *]
-    //        strides: [*, 1024, 65536]
-    //    could be reduced to a 2-D data structure.
+	}
+    params.fft_inStride[i]  = this->iDist;
+    params.fft_outStride[i] = this->oDist;
 
-    params.fft_LdsComplex = this->bLdsComplex;
 
 	params.fft_RCsimple = this->RCsimple;
 
+	params.blockCompute = this->blockCompute;
+	params.blockComputeType = this->blockComputeType;
+
+	params.fft_twiddleFront = this->twiddleFront;
+
 	size_t wgs, nt;
 #ifdef PARMETERS_TO_BE_READ
 	ParamRead pr;
@@ -3131,11 +3256,21 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Stockham> (FFTKernelGenKeyParams & param
 		{
 			KernelCoreSpecs<P_SINGLE> kcs;
 			kcs.GetWGSAndNT(params.fft_N[0], t_wgs, t_nt);
+			if(params.blockCompute)
+			{
+				params.blockSIMD = Kernel<P_SINGLE>::BlockSizes::BlockWorkGroupSize(params.fft_N[0]);
+				params.blockLDS  = Kernel<P_SINGLE>::BlockSizes::BlockLdsSize(params.fft_N[0]);
+			}
 		} break;
 	case P_DOUBLE:
 		{
 			KernelCoreSpecs<P_DOUBLE> kcs;
 			kcs.GetWGSAndNT(params.fft_N[0], t_wgs, t_nt);
+			if(params.blockCompute)
+			{
+				params.blockSIMD = Kernel<P_DOUBLE>::BlockSizes::BlockWorkGroupSize(params.fft_N[0]);
+				params.blockLDS  = Kernel<P_DOUBLE>::BlockSizes::BlockLdsSize(params.fft_N[0]);
+			}
 		} break;
 	}
 
@@ -3155,14 +3290,11 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Stockham> (FFTKernelGenKeyParams & param
 	params.fft_SIMD = wgs;
 
 
-    params.fft_MaxRadix     = params.fft_R;
-    params.fft_UseFMA       = true;
-
     if (this->large1D != 0) {
         ARG_CHECK (params.fft_N[0] != 0)
         ARG_CHECK ((this->large1D % params.fft_N[0]) == 0)
         params.fft_3StepTwiddle = true;
-        params.fft_N[1] = this->large1D / params.fft_N[0];
+		ARG_CHECK ( this->large1D  == (params.fft_N[1] * params.fft_N[0]) );
     }
 
     params.fft_fwdScale  = this->forwardScale;
@@ -3182,11 +3314,21 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Stockham> (std::vector<size_t> & globalWS,
     }
     count *= this->batchsize;
 
-
     FFTKernelGenKeyParams fftParams;
     //    Translate the user plan into the structure that we use to map plans to clPrograms
     OPENCL_V( this->GetKernelGenKeyPvt<Stockham>( fftParams ), _T("GetKernelGenKey() failed!") );
 
+	if(fftParams.blockCompute)
+	{
+		count = DivRoundingUp<unsigned long long> (count, fftParams.blockLDS); 
+		count = count * fftParams.blockSIMD; 
+
+		globalWS.push_back( static_cast< size_t >( count ) );
+		localWS.push_back( fftParams.blockSIMD );
+
+		return    CLFFT_SUCCESS;
+	}
+
     count = DivRoundingUp<unsigned long long> (count, fftParams.fft_R);      // count of WorkItems
     count = DivRoundingUp<unsigned long long> (count, fftParams.fft_SIMD);   // count of WorkGroups
 
@@ -3229,7 +3371,7 @@ clfftStatus FFTPlan::GetMax1DLengthPvt<Stockham> (size_t * longest) const
 }
 
 template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_command_queue commQueueFFT ) const
+clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
 {
     FFTKernelGenKeyParams params;
     OPENCL_V( this->GetKernelGenKeyPvt<Stockham> (params), _T("GetKernelGenKey() failed!") );
@@ -3237,12 +3379,10 @@ clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_comm
     cl_int status = CL_SUCCESS;
     cl_device_id Device = NULL;
     status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
-
     OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
 
     cl_context QueueContext = NULL;
     status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
-
     OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
 
 	std::string programCode;
@@ -3265,8 +3405,8 @@ clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_comm
 	ReadKernelFromFile(programCode);
 #endif
 
-    OPENCL_V( fftRepo.setProgramCode( Stockham, params, programCode, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
-    OPENCL_V( fftRepo.setProgramEntryPoints( Stockham, params, "fft_fwd", "fft_back", QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+    OPENCL_V( fftRepo.setProgramCode( Stockham, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+    OPENCL_V( fftRepo.setProgramEntryPoints( Stockham, params, "fft_fwd", "fft_back", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
 
     return CLFFT_SUCCESS;
 }
diff --git a/src/library/generator.stockham.h b/src/library/generator.stockham.h
index bba7d64..201e4a0 100644
--- a/src/library/generator.stockham.h
+++ b/src/library/generator.stockham.h
@@ -79,6 +79,7 @@ namespace StockhamGenerator
 		return ss.str();
 	}
 
+
 	//	Find the smallest power of 2 that is >= n; return its power of 2 factor
 	//	e.g., CeilPo2 (7) returns 3 : (2^3 >= 7)
 	inline size_t CeilPo2 (size_t n)
@@ -131,6 +132,7 @@ namespace StockhamGenerator
 		return result;
 	}
 
+
 	// Register data base types
 	template <Precision PR>
 	inline std::string RegBaseType(size_t count)
@@ -209,6 +211,123 @@ namespace StockhamGenerator
 		return "TW3step";
 	}
 
+
+
+	// Twiddle factors table for large N
+	// used in 3-step algorithm
+    class TwiddleTableLarge
+    {
+        size_t N; // length
+		size_t X, Y;
+		size_t tableSize;
+		double *wc, *ws; // cosine, sine arrays
+
+	public:
+		TwiddleTableLarge(size_t length) : N(length)
+		{
+			X = size_t(1) << ARBITRARY::TWIDDLE_DEE;
+			Y = DivRoundingUp<size_t> (CeilPo2(N), ARBITRARY::TWIDDLE_DEE);
+			tableSize = X * Y;
+
+			// Allocate memory for the tables
+			wc = new double[tableSize];
+			ws = new double[tableSize];
+		}
+
+		~TwiddleTableLarge()
+		{
+			// Free
+			delete[] wc;
+			delete[] ws;
+		}
+
+		template <Precision PR>
+		void GenerateTwiddleTable(std::string &twStr)
+		{
+			const double TWO_PI = -6.283185307179586476925286766559;
+
+			// Generate the table
+			size_t nt = 0;
+			double phi = TWO_PI / double (N);
+			for (size_t iY = 0; iY < Y; ++iY)
+			{
+				size_t i = size_t(1) << (iY * ARBITRARY::TWIDDLE_DEE);
+				for (size_t iX = 0; iX < X; ++iX)
+				{
+					size_t j = i * iX;
+
+					double c = cos(phi * (double)j);
+					double s = sin(phi * (double)j);
+
+					//if (fabs(c) < 1.0E-12)	c = 0.0;
+					//if (fabs(s) < 1.0E-12)	s = 0.0;
+
+					wc[nt]   = c;
+					ws[nt++] = s;
+				}
+			}
+
+			std::string sfx = FloatSuffix<PR>();
+
+			// Stringize the table
+			std::stringstream ss;
+			nt = 0;
+
+			ss << "\n __constant ";
+			ss << RegBaseType<PR>(2);
+			ss << " " << TwTableLargeName();
+			ss << "[" << Y << "][" << X << "] = {\n";
+			for (size_t iY = 0; iY < Y; ++iY)
+			{
+				ss << "{ ";
+				for (size_t iX = 0; iX < X; ++iX)
+				{
+					char cv[64], sv[64];
+					sprintf(cv, "%036.34lf", wc[nt]);
+					sprintf(sv, "%036.34lf", ws[nt++]);
+					ss << "("; ss << RegBaseType<PR>(2); ss << ")(";
+					ss << cv; ss << sfx; ss << ", ";
+					ss << sv; ss << sfx; ss << ")";
+					ss << ", ";
+				}
+				ss << " },\n";
+			}
+			ss << "};\n\n";
+
+
+			// Twiddle calc function
+			ss << "__attribute__((always_inline)) ";
+			ss << RegBaseType<PR>(2);
+			ss << "\n" << TwTableLargeFunc() << "(uint u)\n{\n";
+
+			ss << "\t" "uint j = u & " << unsigned(X-1) << ";\n";
+			ss << "\t" ; ss << RegBaseType<PR>(2); ss << " result = ";
+			ss << TwTableLargeName();
+			ss << "[0][j];\n";
+
+			for (size_t iY = 1; iY < Y; ++iY)
+			{
+				std::string phasor = TwTableLargeName();
+				phasor += "[";
+				phasor += SztToStr(iY);
+				phasor += "][j]";
+
+				stringpair product = ComplexMul((RegBaseType<PR>(2)).c_str(), "result", phasor.c_str());
+
+				ss << "\t" "u >>= " << unsigned (ARBITRARY::TWIDDLE_DEE) << ";\n";
+				ss << "\t" "j = u & " << unsigned(X-1) << ";\n";
+				ss << "\t" "result = " << product.first << "\n";
+				ss << "\t" "\t" << product.second <<";\n";
+			}
+			ss << "\t" "return result;\n}\n\n";
+
+			twStr += ss.str();
+		}
+    };
+
+
+
+
 	// FFT butterfly
     template <Precision PR>
     class Butterfly
@@ -1181,165 +1300,6 @@ namespace StockhamGenerator
 						}
 					}
 				} break;
-			case 16:
-				{
-					if(fwd)
-					{
-						if(cReg)
-						{
-							bflyStr +=
-
-							"(*R1) = (*R0) - (*R1);\n\t"
-							"(*R0) = 2.0f * (*R0) - (*R1);\n\t"
-							"(*R3) = (*R2) - (*R3);\n\t"
-							"(*R2) = 2.0f * (*R2) - (*R3);\n\t"
-							"(*R5) = (*R4) - (*R5);\n\t"
-							"(*R4) = 2.0f * (*R4) - (*R5);\n\t"
-							"(*R7) = (*R6) - (*R7);\n\t"
-							"(*R6) = 2.0f * (*R6) - (*R7);\n\t"
-							"(*R9) = (*R8) - (*R9);\n\t"
-							"(*R8) = 2.0f * (*R8) - (*R9);\n\t"
-							"(*R11) = (*R10) - (*R11);\n\t"
-							"(*R10) = 2.0f * (*R10) - (*R11);\n\t"
-							"(*R13) = (*R12) - (*R13);\n\t"
-							"(*R12) = 2.0f * (*R12) - (*R13);\n\t"
-							"(*R15) = (*R14) - (*R15);\n\t"
-							"(*R14) = 2.0f * (*R14) - (*R15);\n\t"
-							"\n\t"
-							"(*R2) = (*R0) - (*R2);\n\t"
-							"(*R0) = 2.0f * (*R0) - (*R2);\n\t"
-							"(*R3) = (*R1) + (fvect2)(-(*R3).y, (*R3).x);\n\t"
-							"(*R1) = 2.0f * (*R1) - (*R3);\n\t"
-							"(*R6) = (*R4) - (*R6);\n\t"
-							"(*R4) = 2.0f * (*R4) - (*R6);\n\t"
-							"(*R7) = (*R5) + (fvect2)(-(*R7).y, (*R7).x);\n\t"
-							"(*R5) = 2.0f * (*R5) - (*R7);\n\t"
-							"(*R10) = (*R8) - (*R10);\n\t"
-							"(*R8) = 2.0f * (*R8) - (*R10);\n\t"
-							"(*R11) = (*R9) + (fvect2)(-(*R11).y, (*R11).x);\n\t"
-							"(*R9) = 2.0f * (*R9) - (*R11);\n\t"
-							"(*R14) = (*R12) - (*R14);\n\t"
-							"(*R12) = 2.0f * (*R12) - (*R14);\n\t"
-							"(*R15) = (*R13) + (fvect2)(-(*R15).y, (*R15).x);\n\t"
-							"(*R13) = 2.0f * (*R13) - (*R15);\n\t"
-							"\n\t"
-							"(*R4) = (*R0) - (*R4);\n\t"
-							"(*R0) = 2.0f * (*R0) - (*R4);\n\t"
-							"(*R5) = ((*R1) - C8Q * (*R5)) - C8Q * (fvect2)((*R5).y, -(*R5).x);\n\t"
-							"(*R1) = 2.0f * (*R1) - (*R5);\n\t"
-							"(*R6) = (*R2) + (fvect2)(-(*R6).y, (*R6).x);\n\t"
-							"(*R2) = 2.0f * (*R2) - (*R6);\n\t"
-							"(*R7) = ((*R3) + C8Q * (*R7)) - C8Q * (fvect2)((*R7).y, -(*R7).x);\n\t"
-							"(*R3) = 2.0f * (*R3) - (*R7);\n\t"
-							"(*R12) = (*R8) - (*R12);\n\t"
-							"(*R8) = 2.0f * (*R8) - (*R12);\n\t"
-							"(*R13) = ((*R9) - C8Q * (*R13)) - C8Q * (fvect2)((*R13).y, -(*R13).x);\n\t"
-							"(*R9) = 2.0f * (*R9) - (*R13);\n\t"
-							"(*R14) = (*R10) + (fvect2)(-(*R14).y, (*R14).x);\n\t"
-							"(*R10) = 2.0f * (*R10) - (*R14);\n\t"
-							"(*R15) = ((*R11) + C8Q * (*R15)) - C8Q * (fvect2)((*R15).y, -(*R15).x);\n\t"
-							"(*R11) = 2.0f * (*R11) - (*R15);\n\t"
-							"\n\t"
-							"(*R8) = (*R0) - (*R8);\n\t"
-							"(*R0) = 2.0f * (*R0) - (*R8);\n\t"
-							"(*R9) = ((*R1) - 0.92387953251128675612818318939679 * (*R9)) - 0.3826834323650897717284599840304 * (fvect2)((*R9).y, -(*R9).x);\n\t"
-							"(*R1) = 2.0f * (*R1) - (*R9);\n\t"
-							"(*R10) = ((*R2) - C8Q * (*R10)) - C8Q * (fvect2)((*R10).y, -(*R10).x);\n\t"
-							"(*R2) = 2.0f * (*R2) - (*R10);\n\t"
-							"(*R11) = ((*R3) - 0.3826834323650897717284599840304 * (*R11)) - 0.92387953251128675612818318939679 * (fvect2)((*R11).y, -(*R11).x);\n\t"
-							"(*R3) = 2.0f * (*R3) - (*R11);\n\t"
-							"(*R12) = (*R4) + (fvect2)(-(*R12).y, (*R12).x);\n\t"
-							"(*R4) = 2.0f * (*R4) - (*R12);\n\t"
-							"(*R13) = ((*R5) + 0.3826834323650897717284599840304 * (*R13)) - 0.92387953251128675612818318939679 * (fvect2)((*R13).y, -(*R13).x);\n\t"
-							"(*R5) = 2.0f * (*R5) - (*R13);\n\t"
-							"(*R14) = ((*R6) + C8Q * (*R14)) - C8Q * (fvect2)((*R14).y, -(*R14).x);\n\t"
-							"(*R6) = 2.0f * (*R6) - (*R14);\n\t"
-							"(*R15) = ((*R7) + 0.92387953251128675612818318939679 * (*R15)) - 0.3826834323650897717284599840304 * (fvect2)((*R15).y, -(*R15).x);\n\t"
-							"(*R7) = 2.0f * (*R7) - (*R15);\n\t";
-
-						}
-						else
-							assert(false);
-					}
-					else
-					{
-						if(cReg)
-						{
-							bflyStr +=
-
-							"(*R1) = (*R0) - (*R1);\n\t"
-							"(*R0) = 2.0f * (*R0) - (*R1);\n\t"
-							"(*R3) = (*R2) - (*R3);\n\t"
-							"(*R2) = 2.0f * (*R2) - (*R3);\n\t"
-							"(*R5) = (*R4) - (*R5);\n\t"
-							"(*R4) = 2.0f * (*R4) - (*R5);\n\t"
-							"(*R7) = (*R6) - (*R7);\n\t"
-							"(*R6) = 2.0f * (*R6) - (*R7);\n\t"
-							"(*R9) = (*R8) - (*R9);\n\t"
-							"(*R8) = 2.0f * (*R8) - (*R9);\n\t"
-							"(*R11) = (*R10) - (*R11);\n\t"
-							"(*R10) = 2.0f * (*R10) - (*R11);\n\t"
-							"(*R13) = (*R12) - (*R13);\n\t"
-							"(*R12) = 2.0f * (*R12) - (*R13);\n\t"
-							"(*R15) = (*R14) - (*R15);\n\t"
-							"(*R14) = 2.0f * (*R14) - (*R15);\n\t"
-							"\n\t"
-							"(*R2) = (*R0) - (*R2);\n\t"
-							"(*R0) = 2.0f * (*R0) - (*R2);\n\t"
-							"(*R3) = (*R1) + (fvect2)((*R3).y, -(*R3).x);\n\t"
-							"(*R1) = 2.0f * (*R1) - (*R3);\n\t"
-							"(*R6) = (*R4) - (*R6);\n\t"
-							"(*R4) = 2.0f * (*R4) - (*R6);\n\t"
-							"(*R7) = (*R5) + (fvect2)((*R7).y, -(*R7).x);\n\t"
-							"(*R5) = 2.0f * (*R5) - (*R7);\n\t"
-							"(*R10) = (*R8) - (*R10);\n\t"
-							"(*R8) = 2.0f * (*R8) - (*R10);\n\t"
-							"(*R11) = (*R9) + (fvect2)((*R11).y, -(*R11).x);\n\t"
-							"(*R9) = 2.0f * (*R9) - (*R11);\n\t"
-							"(*R14) = (*R12) - (*R14);\n\t"
-							"(*R12) = 2.0f * (*R12) - (*R14);\n\t"
-							"(*R15) = (*R13) + (fvect2)((*R15).y, -(*R15).x);\n\t"
-							"(*R13) = 2.0f * (*R13) - (*R15);\n\t"
-							"\n\t"
-							"(*R4) = (*R0) - (*R4);\n\t"
-							"(*R0) = 2.0f * (*R0) - (*R4);\n\t"
-							"(*R5) = ((*R1) - C8Q * (*R5)) + C8Q * (fvect2)((*R5).y, -(*R5).x);\n\t"
-							"(*R1) = 2.0f * (*R1) - (*R5);\n\t"
-							"(*R6) = (*R2) + (fvect2)((*R6).y, -(*R6).x);\n\t"
-							"(*R2) = 2.0f * (*R2) - (*R6);\n\t"
-							"(*R7) = ((*R3) + C8Q * (*R7)) + C8Q * (fvect2)((*R7).y, -(*R7).x);\n\t"
-							"(*R3) = 2.0f * (*R3) - (*R7);\n\t"
-							"(*R12) = (*R8) - (*R12);\n\t"
-							"(*R8) = 2.0f * (*R8) - (*R12);\n\t"
-							"(*R13) = ((*R9) - C8Q * (*R13)) + C8Q * (fvect2)((*R13).y, -(*R13).x);\n\t"
-							"(*R9) = 2.0f * (*R9) - (*R13);\n\t"
-							"(*R14) = (*R10) + (fvect2)((*R14).y, -(*R14).x);\n\t"
-							"(*R10) = 2.0f * (*R10) - (*R14);\n\t"
-							"(*R15) = ((*R11) + C8Q * (*R15)) + C8Q * (fvect2)((*R15).y, -(*R15).x);\n\t"
-							"(*R11) = 2.0f * (*R11) - (*R15);\n\t"
- 							"\n\t"
-							"(*R8) = (*R0) - (*R8);\n\t"
-							"(*R0) = 2.0f * (*R0) - (*R8);\n\t"
-							"(*R9) = ((*R1) - 0.92387953251128675612818318939679 * (*R9)) + 0.3826834323650897717284599840304 * (fvect2)((*R9).y, -(*R9).x);\n\t"
-							"(*R1) = 2.0f * (*R1) - (*R9);\n\t"
-							"(*R10) = ((*R2) - C8Q * (*R10)) + C8Q * (fvect2)((*R10).y, -(*R10).x);\n\t"
-							"(*R2) = 2.0f * (*R2) - (*R10);\n\t"
-							"(*R11) = ((*R3) - 0.3826834323650897717284599840304 * (*R11)) + 0.92387953251128675612818318939679 * (fvect2)((*R11).y, -(*R11).x);\n\t"
-							"(*R3) = 2.0f * (*R3) - (*R11);\n\t"
-							"(*R12) = (*R4) + (fvect2)((*R12).y, -(*R12).x);\n\t"
-							"(*R4) = 2.0f * (*R4) - (*R12);\n\t"
-							"(*R13) = ((*R5) + 0.3826834323650897717284599840304 * (*R13)) + 0.92387953251128675612818318939679 * (fvect2)((*R13).y, -(*R13).x);\n\t"
-							"(*R5) = 2.0f * (*R5) - (*R13);\n\t"
-							"(*R14) = ((*R6) + C8Q * (*R14)) + C8Q * (fvect2)((*R14).y, -(*R14).x);\n\t"
-							"(*R6) = 2.0f * (*R6) - (*R14);\n\t"
-							"(*R15) = ((*R7) + 0.92387953251128675612818318939679 * (*R15)) + 0.3826834323650897717284599840304 * (fvect2)((*R15).y, -(*R15).x);\n\t"
-							"(*R7) = 2.0f * (*R7) - (*R15);\n\t";
-
-						}
-						else
-							assert(false);
-					}
-				} break;
 			default:
 				assert(false);
 			}
diff --git a/src/library/generator.transpose.gcn.cpp b/src/library/generator.transpose.gcn.cpp
new file mode 100644
index 0000000..1d4a46d
--- /dev/null
+++ b/src/library/generator.transpose.gcn.cpp
@@ -0,0 +1,660 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// generator.transpose.gcn.cpp : Dynamic run-time generator of OpenCL transpose kernels
+//
+
+// TODO: generalize the kernel to work with any size
+
+#include "stdafx.h"
+
+#include <math.h>
+#include <iomanip>
+
+#include "generator.transpose.gcn.h"
+#include "generator.stockham.h"
+
+// A structure that represents a bounding box or tile, with convenient names for the row and column addresses.
+// In this file it describes the local work sizes (lwSize) and the LDS extents (ldsSize); both are
+// brace-initialized as { x, y }, so the order of the two unions below must not change.
+struct tile
+{
+    // Horizontal extent: x and col are two names for the same storage.
+    union
+    {
+        size_t x;
+        size_t col;
+    };
+
+    // Vertical extent: y and row are two names for the same storage.
+    union
+    {
+        size_t y;
+        size_t row;
+    };
+};
+
+// Writes tabIndex fill characters (spaces, unless the stream's fill character was changed) to the
+// stream and returns that same stream, so the caller can chain the text of an indented kernel line.
+inline std::stringstream& clKernWrite( std::stringstream& rhs, const size_t tabIndex )
+{
+    // Inserting an empty string with the field width set to tabIndex produces the indentation;
+    // the width applies to this one insertion only.
+    rhs.width( static_cast< std::streamsize >( tabIndex ) );
+    rhs << "";
+    return rhs;
+}
+
+static size_t NumBlocksX(size_t N);
+
+// Emits the kernel source that computes the element offset of the tile handled by a work group.
+//   transKernel - stream receiving the generated OpenCL text
+//   params      - supplies the strides, fft_DataDim, fft_N[ 0 ] and the transOutHorizontal flag
+//   input       - true: emit "iOffset" using fft_inStride; false: emit "oOffset" using fft_outStride
+// The emitted code assigns currDimSize and rowSizeinUnits and reads groupIndex, wgTileExtent, wgUnroll
+// and the numGroupsY_<i> constants; it does not declare any of them itself.
+static void OffsetCalc(std::stringstream& transKernel, const FFTKernelGenKeyParams& params, bool input )
+{
+	const std::string offsetVar = input ? "iOffset" : "oOffset";
+	const size_t* const strides = input ? params.fft_inStride : params.fft_outStride;
+
+	clKernWrite( transKernel, 3 ) << "size_t " << offsetVar << " = 0;" << std::endl;
+	clKernWrite( transKernel, 3 ) << "currDimSize = groupIndex.y;" << std::endl;
+
+	// Peel the outer dimensions off groupIndex.y, from the highest one down to dimension 1
+	size_t dim = params.fft_DataDim - 2;
+	while( dim > 0 )
+	{
+		clKernWrite( transKernel, 3 ) << offsetVar << " += (currDimSize/numGroupsY_" << dim << ")*" << strides[dim+1] << ";" << std::endl;
+		clKernWrite( transKernel, 3 ) << "currDimSize = currDimSize % numGroupsY_" << dim << ";" << std::endl;
+		--dim;
+	}
+
+	clKernWrite( transKernel, 3 ) << "rowSizeinUnits = " << strides[1] << ";" << std::endl;
+
+	if( !params.transOutHorizontal )
+	{
+		// groupIndex.x selects the tile column of the input, currDimSize the tile row;
+		// the two roles are swapped on the output side.
+		if( input )
+		{
+			clKernWrite( transKernel, 3 ) << offsetVar << " += rowSizeinUnits * wgTileExtent.y * wgUnroll * currDimSize;" << std::endl;
+			clKernWrite( transKernel, 3 ) << offsetVar << " += groupIndex.x * wgTileExtent.x;" << std::endl;
+		}
+		else
+		{
+			clKernWrite( transKernel, 3 ) << offsetVar << " += rowSizeinUnits * wgTileExtent.x * groupIndex.x;" << std::endl;
+			clKernWrite( transKernel, 3 ) << offsetVar << " += currDimSize * wgTileExtent.y * wgUnroll;" << std::endl;
+		}
+	}
+	else
+	{
+		// Horizontal variant: the tile position is derived from groupIndex.x and currDimSize
+		// together, using the number of tile columns along fft_N[ 0 ].
+		const size_t tileColumns = NumBlocksX(params.fft_N[ 0 ]);
+
+		if( input )
+		{
+			clKernWrite( transKernel, 3 ) << offsetVar << " += rowSizeinUnits * wgTileExtent.y * wgUnroll * (groupIndex.x + "
+				<< tileColumns << "*(currDimSize%(numGroupsY_1/" << tileColumns << ")));" << std::endl;
+			clKernWrite( transKernel, 3 ) << offsetVar << " += (currDimSize/(numGroupsY_1/" << tileColumns
+				<< ")) * wgTileExtent.x;" << std::endl;
+		}
+		else
+		{
+			clKernWrite( transKernel, 3 ) << offsetVar << " += (currDimSize/(numGroupsY_1/" << tileColumns
+				<< ")) * wgTileExtent.x * rowSizeinUnits;" << std::endl;
+			clKernWrite( transKernel, 3 ) << offsetVar << " += wgTileExtent.y * wgUnroll * (groupIndex.x + "
+				<< tileColumns << "*(currDimSize%(numGroupsY_1/" << tileColumns << ")));" << std::endl;
+		}
+	}
+
+	clKernWrite( transKernel, 3 ) << std::endl;
+}
+
+
+
+
+// Emits the kernel source that multiplies the value held in "tmp" by a twiddle factor W obtained from a
+// TW3step( ... ) call.  The caller only invokes this when the plan asks for the twiddle multiply to be
+// performed inside of the transpose.
+//   fwd == true  : tmp = tmp * W
+//   fwd == false : tmp = tmp * conj( W )
+// params is not referenced.  Always returns CLFFT_SUCCESS.
+static clfftStatus genTwiddleMath( const FFTKernelGenKeyParams& params, std::stringstream& transKernel, const std::string& dtComplex, bool fwd )
+{
+    // The two directions differ only in the statements that form the product T
+    const char* const realPart = fwd ? "T.x = ( W.x * tmp.x ) - ( W.y * tmp.y );"
+                                     : "T.x =  ( W.x * tmp.x ) + ( W.y * tmp.y );";
+    const char* const imagPart = fwd ? "T.y = ( W.y * tmp.x ) + ( W.x * tmp.y );"
+                                     : "T.y = -( W.y * tmp.x ) + ( W.x * tmp.y );";
+
+    clKernWrite( transKernel, 6 ) << dtComplex << " W = TW3step( (groupIndex.x * wgTileExtent.x + xInd) * (currDimSize * wgTileExtent.y * wgUnroll + yInd) );" << std::endl;
+    clKernWrite( transKernel, 6 ) << dtComplex << " T;" << std::endl;
+
+    clKernWrite( transKernel, 6 ) << realPart << std::endl;
+    clKernWrite( transKernel, 6 ) << imagPart << std::endl;
+
+    // Copy the product back into tmp
+    clKernWrite( transKernel, 6 ) << "tmp.x = T.x;" << std::endl;
+    clKernWrite( transKernel, 6 ) << "tmp.y = T.y;" << std::endl;
+
+    return CLFFT_SUCCESS;
+}
+
+// Names of the buffer parameters in the generated kernel signature; the kernel body refers to the
+// buffers by these same names.
+const std::string pmRealIn = "pmRealIn";
+const std::string pmImagIn = "pmImagIn";
+const std::string pmRealOut = "pmRealOut";
+const std::string pmImagOut = "pmImagOut";
+const std::string pmComplexIn = "pmComplexIn";
+const std::string pmComplexOut = "pmComplexOut";
+
+// Helper for genTransposePrototype: settles the output element type and appends the output buffer
+// parameter(s), if any, to the signature being written.
+//   CLFFT_INPLACE    - dtOutput takes the input element type; nothing is appended
+//   CLFFT_OUTOFPLACE - appends one interleaved pointer, or a real/imaginary pointer pair
+// Any other placeness or output layout yields CLFFT_TRANSPOSED_NOTIMPLEMENTED.
+static clfftStatus genTransposeOutputParams( const FFTKernelGenKeyParams& params, const std::string& dtPlanar, const std::string& dtComplex,
+                                            const std::string& dtInput, std::stringstream& transKernel, std::string& dtOutput )
+{
+    if( params.fft_placeness == CLFFT_INPLACE )
+    {
+        dtOutput = dtInput;
+        return CLFFT_SUCCESS;
+    }
+
+    if( params.fft_placeness != CLFFT_OUTOFPLACE )
+        return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+
+    if( params.fft_outputLayout == CLFFT_COMPLEX_INTERLEAVED )
+    {
+        dtOutput = dtComplex;
+        clKernWrite( transKernel, 0 ) << ", global " << dtOutput << "* restrict " << pmComplexOut;
+        return CLFFT_SUCCESS;
+    }
+
+    if( params.fft_outputLayout == CLFFT_COMPLEX_PLANAR )
+    {
+        dtOutput = dtPlanar;
+        clKernWrite( transKernel, 0 ) << ", global " << dtOutput << "* restrict " << pmRealOut
+            << ", global " << dtOutput << "* restrict " << pmImagOut;
+        return CLFFT_SUCCESS;
+    }
+
+    // Hermitian and real output layouts are not handled by this generator
+    return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+}
+
+// Writes the kernel attribute, the "kernel void <funcName>( ... )" signature and the opening brace of
+// the kernel body to transKernel.
+//   params    - input/output layout and placeness select the parameter list
+//   lwSize    - emitted as reqd_work_group_size( x, y, 1 )
+//   dtPlanar  - element type used for planar buffers
+//   dtComplex - element type used for interleaved buffers
+//   dtInput   - [out] element type of the input buffer(s)
+//   dtOutput  - [out] element type of the output buffer(s)
+// Returns CLFFT_TRANSPOSED_NOTIMPLEMENTED for unsupported layouts or placeness; in that case the
+// signature written so far is left unterminated in transKernel.
+static clfftStatus genTransposePrototype( const FFTKernelGenKeyParams& params, const tile& lwSize, const std::string& dtPlanar, const std::string& dtComplex, 
+                                         const std::string &funcName, std::stringstream& transKernel, std::string& dtInput, std::string& dtOutput )
+{
+    // Attribute, return type and function name
+    clKernWrite( transKernel, 0 ) << "__attribute__(( reqd_work_group_size( " << lwSize.x << ", " << lwSize.y << ", 1 ) ))" << std::endl;
+    clKernWrite( transKernel, 0 ) << "kernel void" << std::endl;
+    clKernWrite( transKernel, 0 ) << funcName << "( ";
+
+    // Input buffer parameter(s)
+    if( params.fft_inputLayout == CLFFT_COMPLEX_INTERLEAVED )
+    {
+        dtInput = dtComplex;
+        clKernWrite( transKernel, 0 ) << "global " << dtInput << "* restrict " << pmComplexIn;
+    }
+    else if( params.fft_inputLayout == CLFFT_COMPLEX_PLANAR )
+    {
+        dtInput = dtPlanar;
+        clKernWrite( transKernel, 0 ) << "global " << dtInput << "* restrict " << pmRealIn << ", global " << dtInput << "* restrict " << pmImagIn;
+    }
+    else
+    {
+        // Hermitian and real input layouts are not handled by this generator
+        return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+    }
+
+    // Output buffer parameter(s)
+    const clfftStatus outputStatus = genTransposeOutputParams( params, dtPlanar, dtComplex, dtInput, transKernel, dtOutput );
+    if( outputStatus != CLFFT_SUCCESS )
+        return outputStatus;
+
+    // Close the method signature
+    clKernWrite( transKernel, 0 ) << " )\n{" << std::endl;
+
+    return CLFFT_SUCCESS;
+}
+
+// Builds the OpenCL source of the GCN transpose kernel(s) and stores it in strKernel.
+//   params        - kernel generation key: precision, layouts, placeness, lengths, strides, twiddle flag
+//   strKernel     - receives the generated program text
+//   lwSize        - local work-group size, emitted as reqd_work_group_size
+//   reShapeFactor - emitted as reShapeFactor; the kernel's tile is ( local x * factor, local y / factor )
+//   loopCount     - emitted as wgUnroll, the trip count of the read and write loops
+//   outRowPadding - not referenced by this function
+// When params.fft_3StepTwiddle is set, a twiddle table/lookup function and two kernels
+// (transpose_gcn_tw_fwd, transpose_gcn_tw_back) are emitted; otherwise a single transpose_gcn kernel.
+// Returns CLFFT_SUCCESS, CLFFT_TRANSPOSED_NOTIMPLEMENTED for an unsupported precision, layout or placeness
+// (including an in-place transpose of a non-square matrix), or the failure status of a helper generator.
+static clfftStatus genTransposeKernel( const FFTKernelGenKeyParams& params, std::string& strKernel, const tile& lwSize, const size_t reShapeFactor,
+                                            const size_t loopCount, const size_t outRowPadding )
+{
+    strKernel.reserve( 4096 );
+    std::stringstream transKernel( std::stringstream::out );
+
+    // These strings represent the various data types we read or write in the kernel, depending on how the plan
+    // is configured
+    std::string dtInput;        // The type read as input into kernel
+    std::string dtOutput;       // The type written as output from kernel
+    std::string dtPlanar;       // Fundamental type for planar arrays
+    std::string dtComplex;      // Fundamental type for complex arrays
+
+    // NOTE:  Enable only for debug
+    // clKernWrite( transKernel, 0 ) << "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" << std::endl;
+
+    switch( params.fft_precision )
+    {
+    case CLFFT_SINGLE:
+    case CLFFT_SINGLE_FAST:
+        dtPlanar = "float";
+        dtComplex = "float2";
+        break;
+    case CLFFT_DOUBLE:
+    case CLFFT_DOUBLE_FAST:
+        dtPlanar = "double";
+        dtComplex = "double2";
+
+        // Emit code that enables double precision in the kernel
+        clKernWrite( transKernel, 0 ) << "#ifdef cl_khr_fp64" << std::endl;
+        clKernWrite( transKernel, 3 ) << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" << std::endl;
+        clKernWrite( transKernel, 0 ) << "#else" << std::endl;
+        clKernWrite( transKernel, 3 ) <<  "#pragma OPENCL EXTENSION cl_amd_fp64 : enable" << std::endl;
+        clKernWrite( transKernel, 0 ) << "#endif\n" << std::endl;
+        break;
+    default:
+        return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+    }
+
+
+	//	If twiddle computation has been requested, generate the lookup function
+	if(params.fft_3StepTwiddle)
+	{
+		std::string str;
+		StockhamGenerator::TwiddleTableLarge twLarge(params.fft_N[0] * params.fft_N[1]);
+		if( (params.fft_precision == CLFFT_SINGLE) || (params.fft_precision == CLFFT_SINGLE_FAST) )
+			twLarge.GenerateTwiddleTable<StockhamGenerator::P_SINGLE>(str);
+		else
+			twLarge.GenerateTwiddleTable<StockhamGenerator::P_DOUBLE>(str);
+		clKernWrite( transKernel, 0 ) << str << std::endl;
+		clKernWrite( transKernel, 0 ) << std::endl;
+	}
+
+
+    clKernWrite( transKernel, 0 ) << "// Local structure to embody/capture tile dimensions" << std::endl;
+    clKernWrite( transKernel, 0 ) << "typedef struct tag_Tile" << std::endl;
+    clKernWrite( transKernel, 0 ) << "{" << std::endl;
+    clKernWrite( transKernel, 3 ) << "size_t x;" << std::endl;
+    clKernWrite( transKernel, 3 ) << "size_t y;" << std::endl;
+    clKernWrite( transKernel, 0 ) << "} Tile;" << std::endl << std::endl;
+
+    // This detects whether the input matrix is square
+    bool notSquare = ( params.fft_N[ 0 ] == params.fft_N[ 1 ] ) ? false : true;
+
+    if( notSquare && (params.fft_placeness == CLFFT_INPLACE) )
+        return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+
+    // NOTE(review): for CLFFT_INPLACE the prototype declares no output pointer, yet the code below still
+    // emits references to pmComplexOut / pmRealOut / pmImagOut -- confirm that in-place plans are never
+    // routed to this generator.
+
+	// One pass emits the forward kernel; a second pass (the backward kernel) only happens for twiddle plans
+	for(size_t bothDir=0; bothDir<2; bothDir++)
+	{
+		//	Generate the kernel entry point and parameter list
+		//
+		bool fwd = bothDir ? false : true;
+
+		std::string funcName;
+		if(params.fft_3StepTwiddle)
+			funcName = fwd ? "transpose_gcn_tw_fwd" : "transpose_gcn_tw_back";
+		else
+			funcName = "transpose_gcn";
+
+		// Do not keep generating a body behind a signature that could not be completed
+		const clfftStatus protoStatus = genTransposePrototype( params, lwSize, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput );
+		if( protoStatus != CLFFT_SUCCESS )
+			return protoStatus;
+
+		clKernWrite( transKernel, 3 ) << "const Tile localIndex = { get_local_id( 0 ), get_local_id( 1 ) }; " << std::endl;
+		clKernWrite( transKernel, 3 ) << "const Tile localExtent = { get_local_size( 0 ), get_local_size( 1 ) }; " << std::endl;
+		clKernWrite( transKernel, 3 ) << "const Tile groupIndex = { get_group_id( 0 ), get_group_id( 1 ) };" << std::endl;
+		// clKernWrite( transKernel, 3 ) << "const Tile groupExtent = { get_num_groups( 0 ), get_num_groups( 1 ) }; " << std::endl;
+		clKernWrite( transKernel, 3 ) << std::endl;
+
+		// Debug index code to see what indices we receive
+		//clKernWrite( transKernel, 3 ) << "printf( \"localExtent: (%lu, %lu) \", localExtent.x, localExtent.x );" << std::endl;
+		//clKernWrite( transKernel, 3 ) << "printf( \"localIndex.x: %lu   \", localIndex.x );" << std::endl;
+		//clKernWrite( transKernel, 3 ) << "printf( \"localIndex.x: %lu   \", localIndex.x );" << std::endl;
+		//clKernWrite( transKernel, 3 ) << "if( localIndex.x == 0 && localIndex.y == 0) {\n" << std::endl;
+		//clKernWrite( transKernel, 6 ) << "printf( \"localIndex.x: %lu   \", localIndex.x );" << std::endl;
+		//clKernWrite( transKernel, 6 ) << "printf( \"localIndex.y: %lu   \", localIndex.y );" << std::endl;
+		//clKernWrite( transKernel, 6 ) << "printf( \"groupIndex.x: %lu   \", groupIndex.x );" << std::endl;
+		//clKernWrite( transKernel, 6 ) << "printf( \"groupIndex.y: %lu\\n\", groupIndex.y );" << std::endl;
+		//clKernWrite( transKernel, 3 ) << "}\n" << std::endl;
+
+		// This is an interesting idea in that we might be able to reshape the input 1D array as a 2D array
+		//clKernWrite( transKernel, 3 ) << "global " << dtInput << " (*myTileIn)[ 4096 ] =(global " << dtInput << " (*)[ 4096 ]) " << pmComplexIn << ";" << std::endl;
+
+
+
+		clKernWrite( transKernel, 3 ) << "// Calculate the unit address (in terms of datatype) of the beginning of the Tile for the WG block" << std::endl;
+		clKernWrite( transKernel, 3 ) << "// Transpose of input & output blocks happens with the Offset calculation" << std::endl;
+		clKernWrite( transKernel, 3 ) << "const size_t reShapeFactor = " << reShapeFactor << ";" << std::endl;
+		clKernWrite( transKernel, 3 ) << "const size_t wgUnroll = " << loopCount << ";" << std::endl;
+		clKernWrite( transKernel, 3 ) << "const Tile wgTileExtent = { localExtent.x * reShapeFactor, localExtent.y / reShapeFactor };" << std::endl;
+		clKernWrite( transKernel, 3 ) << "const size_t tileSizeinUnits = wgTileExtent.x * wgTileExtent.y * wgUnroll;" << std::endl << std::endl;
+
+
+		// This is the size of a matrix in the y dimension in units of group size; used to calculate stride[2] indexing
+		//size_t numGroupsY = DivRoundingUp( params.fft_N[ 1 ], lwSize.y / reShapeFactor * loopCount );
+
+		//numGroupY_1 is the number of cumulative work groups up to 1st dimension
+		//numGroupY_2 is the number of cumulative work groups up to 2nd dimension and so forth
+
+		size_t numGroupsTemp = DivRoundingUp( params.fft_N[1], lwSize.y / reShapeFactor * loopCount );
+		clKernWrite( transKernel, 3 ) << "const size_t numGroupsY_1" << " = " << numGroupsTemp << ";" << std::endl;
+		// Unsigned index, with the bound written so that it cannot wrap when fft_DataDim is small
+		for(size_t i = 2; i + 1 < params.fft_DataDim; i++)
+		{
+			numGroupsTemp *= params.fft_N[i];
+			clKernWrite( transKernel, 3 ) << "const size_t numGroupsY_" << i << " = " << numGroupsTemp << ";" << std::endl;
+		}
+
+
+		// Generate the amount of local data share we need
+		// Assumption: Even for planar data, we will still store values in LDS as interleaved
+		// The array is declared as lds[ ldsSize.x ][ ldsSize.y ]
+		tile ldsSize = { lwSize.x * reShapeFactor, lwSize.y / reShapeFactor * loopCount };
+		switch( params.fft_outputLayout )
+		{
+		case CLFFT_COMPLEX_INTERLEAVED:
+		case CLFFT_COMPLEX_PLANAR:
+			clKernWrite( transKernel, 3 ) << "// LDS is always complex and allocated transposed: lds[ wgTileExtent.y * wgUnroll ][ wgTileExtent.x ];" << std::endl;
+			clKernWrite( transKernel, 3 ) << "local " << dtComplex << " lds[ " << ldsSize.x << " ][ " << ldsSize.y << " ];" << std::endl << std::endl;
+			break;
+		case CLFFT_HERMITIAN_INTERLEAVED:
+		case CLFFT_HERMITIAN_PLANAR:
+		case CLFFT_REAL:
+			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+		}
+
+
+		clKernWrite( transKernel, 3 ) << "size_t currDimSize;" << std::endl ;
+		clKernWrite( transKernel, 3 ) << "size_t rowSizeinUnits;" << std::endl << std::endl ;
+
+
+		// Offset of this work group's tile in the input buffer(s)
+		OffsetCalc(transKernel, params, true);
+
+
+		switch( params.fft_inputLayout )
+		{
+		case CLFFT_COMPLEX_INTERLEAVED:
+			clKernWrite( transKernel, 3 ) << "global " << dtInput << "* tileIn = " << pmComplexIn << " + iOffset;" << std::endl;
+			break;
+		case CLFFT_COMPLEX_PLANAR:
+			clKernWrite( transKernel, 3 ) << "global " << dtInput << "* realTileIn = " << pmRealIn << " + iOffset;" << std::endl;
+			clKernWrite( transKernel, 3 ) << "global " << dtInput << "* imagTileIn = " << pmImagIn << " + iOffset;" << std::endl;
+			break;
+		case CLFFT_HERMITIAN_INTERLEAVED:
+		case CLFFT_HERMITIAN_PLANAR:
+		case CLFFT_REAL:
+			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+		}
+
+		// This is the loop reading through the Tile
+		clKernWrite( transKernel, 3 ) << dtComplex << " tmp;" << std::endl;
+		clKernWrite( transKernel, 3 ) << "rowSizeinUnits = " << params.fft_inStride[ 1 ] << ";" << std::endl;
+		clKernWrite( transKernel, 3 ) << "for( uint t=0; t < wgUnroll; t++ )" << std::endl;
+		clKernWrite( transKernel, 3 ) << "{" << std::endl;
+
+		clKernWrite( transKernel, 6 ) << "size_t xInd = localIndex.x + localExtent.x * ( localIndex.y % wgTileExtent.y ); " << std::endl;
+		clKernWrite( transKernel, 6 ) << "size_t yInd = localIndex.y/wgTileExtent.y + t * wgTileExtent.y; " << std::endl;
+
+		// Calculating the index separately enables easier debugging through tools
+		clKernWrite( transKernel, 6 ) << "size_t gInd = xInd + rowSizeinUnits * yInd;" << std::endl;
+
+		switch( params.fft_inputLayout )
+		{
+		case CLFFT_COMPLEX_INTERLEAVED:
+			clKernWrite( transKernel, 6 ) << "tmp = tileIn[ gInd ];" << std::endl;
+			break;
+		case CLFFT_COMPLEX_PLANAR:
+			clKernWrite( transKernel, 6 ) << "tmp.s0 = realTileIn[ gInd ];" << std::endl;
+			clKernWrite( transKernel, 6 ) << "tmp.s1 = imagTileIn[ gInd ];" << std::endl;
+			break;
+		case CLFFT_HERMITIAN_INTERLEAVED:
+		case CLFFT_HERMITIAN_PLANAR:
+		case CLFFT_REAL:
+			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+		}
+
+		clKernWrite( transKernel, 6 ) << "// Transpose of Tile data happens here" << std::endl;
+
+
+		// If requested, generate the Twiddle math to multiply constant values
+		if( params.fft_3StepTwiddle )
+		{
+			const clfftStatus twiddleStatus = genTwiddleMath( params, transKernel, dtComplex, fwd );
+			if( twiddleStatus != CLFFT_SUCCESS )
+				return twiddleStatus;
+		}
+
+		clKernWrite( transKernel, 6 ) << "lds[ xInd ][ yInd ] = tmp; " << std::endl;
+		clKernWrite( transKernel, 3 ) << "}" << std::endl;
+		clKernWrite( transKernel, 3 ) << std::endl;
+		clKernWrite( transKernel, 3 ) << "barrier( CLK_LOCAL_MEM_FENCE );" << std::endl;
+		clKernWrite( transKernel, 3 ) << std::endl;
+
+		// Offset of this work group's tile in the output buffer(s)
+		OffsetCalc(transKernel, params, false);
+
+
+		switch( params.fft_outputLayout )
+		{
+		case CLFFT_COMPLEX_INTERLEAVED:
+			clKernWrite( transKernel, 3 ) << "global " << dtOutput << "* tileOut = " << pmComplexOut << " + oOffset;" << std::endl << std::endl;
+			break;
+		case CLFFT_COMPLEX_PLANAR:
+			clKernWrite( transKernel, 3 ) << "global " << dtOutput << "* realTileOut = " << pmRealOut << " + oOffset;" << std::endl;
+			clKernWrite( transKernel, 3 ) << "global " << dtOutput << "* imagTileOut = " << pmImagOut << " + oOffset;" << std::endl;
+			break;
+		case CLFFT_HERMITIAN_INTERLEAVED:
+		case CLFFT_HERMITIAN_PLANAR:
+		case CLFFT_REAL:
+			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+		}
+
+		// Write the transposed values from LDS into global memory
+		clKernWrite( transKernel, 3 ) << "rowSizeinUnits = " << params.fft_outStride[ 1 ] << ";" << std::endl;
+		clKernWrite( transKernel, 3 ) << "const size_t transposeRatio = wgTileExtent.x / ( wgTileExtent.y * wgUnroll );" << std::endl;
+		clKernWrite( transKernel, 3 ) << "const size_t groupingPerY = wgUnroll / wgTileExtent.y;" << std::endl;
+		clKernWrite( transKernel, 3 ) << "for( uint t=0; t < wgUnroll; t++ )" << std::endl;
+		clKernWrite( transKernel, 3 ) << "{" << std::endl;
+		clKernWrite( transKernel, 6 ) << "size_t xInd = localIndex.x + localExtent.x * ( localIndex.y % groupingPerY ); " << std::endl;
+		clKernWrite( transKernel, 6 ) << "size_t yInd = localIndex.y/groupingPerY + t * (wgTileExtent.y * transposeRatio); " << std::endl;
+		clKernWrite( transKernel, 6 ) << "tmp = lds[ yInd ][ xInd ]; " << std::endl;
+		clKernWrite( transKernel, 6 ) << "size_t gInd = xInd + rowSizeinUnits * yInd;" << std::endl;
+
+		switch( params.fft_outputLayout )
+		{
+		case CLFFT_COMPLEX_INTERLEAVED:
+			clKernWrite( transKernel, 6 ) << "tileOut[ gInd ] = tmp;" << std::endl;
+			break;
+		case CLFFT_COMPLEX_PLANAR:
+			clKernWrite( transKernel, 6 ) << "realTileOut[ gInd ] = tmp.s0;" << std::endl;
+			clKernWrite( transKernel, 6 ) << "imagTileOut[ gInd ] = tmp.s1;" << std::endl;
+			break;
+		case CLFFT_HERMITIAN_INTERLEAVED:
+		case CLFFT_HERMITIAN_PLANAR:
+		case CLFFT_REAL:
+			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+		}
+
+		clKernWrite( transKernel, 3 ) << "}" << std::endl;
+		clKernWrite( transKernel, 3 ) << std::endl;
+
+		clKernWrite( transKernel, 0 ) << "}\n" << std::endl;
+
+		strKernel = transKernel.str( );
+		//std::cout << strKernel;
+
+		// Only twiddle plans need the second (backward) kernel
+		if(!params.fft_3StepTwiddle)
+			break;
+	}
+
+    return CLFFT_SUCCESS;
+}
+
+// Fills params with the kernel generation key of the GCN transpose for this plan: precision,
+// placeness, layouts, lengths, strides (with iDist/oDist stored after the last dimension),
+// the 3-step twiddle flag and the work-group size limit of the plan's envelope.
+// The argument and envelope checks are made through the ARG_CHECK / OPENCL_V / BUG_CHECK macros.
+template<>
+clfftStatus FFTPlan::GetKernelGenKeyPvt<Transpose_GCN> (FFTKernelGenKeyParams & params) const
+{
+    ::memset( &params, 0, sizeof( params ) );
+
+    params.fft_precision      = this->precision;
+    params.fft_placeness      = this->placeness;
+    params.fft_inputLayout    = this->inputLayout;
+    params.fft_outputLayout   = this->outputLayout;
+    params.fft_3StepTwiddle   = false;
+
+    // Carry over the plan's choice of the horizontal-write variant of the kernel
+    params.transOutHorizontal = this->transOutHorizontal;
+
+    ARG_CHECK( this->inStride.size( ) == this->outStride.size( ) );
+
+    if( CLFFT_INPLACE == params.fft_placeness )
+    {
+        //	An in-place transform requires identical input and output
+        //	layout and strides.
+        ARG_CHECK( params.fft_inputLayout == params.fft_outputLayout );
+
+        const size_t numStrides = this->inStride.size( );
+        for( size_t s = 0; s < numStrides; ++s )
+        {
+            ARG_CHECK( this->inStride[s] == this->outStride[s] );
+        }
+    }
+
+    // One entry per plan length, plus one extra stride slot that holds the batch distance
+    params.fft_DataDim = this->length.size() + 1;
+    const size_t numLengths = params.fft_DataDim - 1;
+    for( size_t dim = 0; dim < numLengths; ++dim )
+    {
+        params.fft_N[dim]         = this->length[dim];
+        params.fft_inStride[dim]  = this->inStride[dim];
+        params.fft_outStride[dim] = this->outStride[dim];
+    }
+    params.fft_inStride[numLengths]  = this->iDist;
+    params.fft_outStride[numLengths] = this->oDist;
+
+    // A non-zero large1D requests the twiddle multiply; it must equal fft_N[1] * fft_N[0]
+    if( this->large1D != 0 )
+    {
+        ARG_CHECK( params.fft_N[0] != 0 );
+        ARG_CHECK( (this->large1D % params.fft_N[0]) == 0 );
+        params.fft_3StepTwiddle = true;
+        ARG_CHECK( this->large1D == (params.fft_N[1] * params.fft_N[0]) );
+    }
+
+    //	Fetch the envelope of the plan; only its work-group size limit is used here.
+    const FFTEnvelope * pEnvelope = NULL;
+    OPENCL_V( this->GetEnvelope( &pEnvelope ), _T( "GetEnvelope failed" ) );
+    BUG_CHECK( NULL != pEnvelope );
+
+    // TODO:  With a 2D workgroup size, a better check than this 1D limit is needed
+    // Check:  CL_DEVICE_MAX_WORK_GROUP_SIZE/CL_KERNEL_WORK_GROUP_SIZE
+    // CL_DEVICE_MAX_WORK_ITEM_SIZES
+    params.fft_R    = 1;
+    params.fft_SIMD = pEnvelope->limit_WorkGroupSize;
+
+    return CLFFT_SUCCESS;
+}
+
+// Constants that specify the bounding sizes of the block that each workgroup will transpose.
+// All three are passed to genTransposeKernel by GenerateKernelPvt<Transpose_GCN>.
+const tile lwSize = { 16, 16 };   // local work-group size as { x, y }
+const size_t reShapeFactor = 4;   // wgTileSize = { lwSize.x * reShapeFactor, lwSize.y / reShapeFactor }
+const size_t outRowPadding = 0;   // forwarded to genTransposeKernel, which does not reference it
+
+static size_t NumBlocksX(size_t N)
+{
+	return DivRoundingUp( N, reShapeFactor * lwSize.x );	// tiles along X covering N; one tile spans lwSize.x * reShapeFactor
+}
+
+// This is global, but should be considered for membership in FFTPlan; GenerateKernelPvt<Transpose_GCN> assigns it per precision
+size_t loopCount = 0;
+
+//	Generates the OpenCL source of the GCN transpose kernel for this plan and stores it, with its entry points, in fftRepo.
+//	OpenCL does not take unicode strings as input, so the generated program is kept as an ASCII std::string.
+template<>
+clfftStatus FFTPlan::GenerateKernelPvt<Transpose_GCN> ( FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
+{
+    FFTKernelGenKeyParams params;
+    OPENCL_V( this->GetKernelGenKeyPvt<Transpose_GCN>( params ), _T( "GetKernelGenKey() failed!" ) );
+
+    // The loop count handed to the generator depends only on the precision stored in the key;
+    // it is written to the file-scope loopCount variable.
+    const bool singlePrecision = ( params.fft_precision == CLFFT_SINGLE ) || ( params.fft_precision == CLFFT_SINGLE_FAST );
+    const bool doublePrecision = ( params.fft_precision == CLFFT_DOUBLE ) || ( params.fft_precision == CLFFT_DOUBLE_FAST );
+    if( singlePrecision )
+    {
+        loopCount = 16;
+    }
+    else if( doublePrecision )
+    {
+        // Double precision uses half the loop count of single precision
+        loopCount = 8;
+    }
+    else
+    {
+        // Any other precision value is rejected before any code is generated
+        return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+    }
+
+    // Emit the kernel source into programCode
+    std::string programCode;
+    OPENCL_V( genTransposeKernel( params, programCode, lwSize, reShapeFactor, loopCount, outRowPadding ), _T( "GenerateTransposeKernel() failed!" ) );
+
+    // The repo calls below take the device and the context that belong to the command queue
+    cl_device_id Device = NULL;
+    cl_int status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
+    OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
+
+    cl_context QueueContext = NULL;
+    status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
+    OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
+
+
+    // Store the source first, then the names of the forward and backward kernels
+    OPENCL_V( fftRepo.setProgramCode( Transpose_GCN, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+
+    // Note:  See genFunctionPrototype( ); only the 3-step twiddle variant uses distinct forward and backward names
+    const char* const fwdEntry  = params.fft_3StepTwiddle ? "transpose_gcn_tw_fwd"  : "transpose_gcn";
+    const char* const backEntry = params.fft_3StepTwiddle ? "transpose_gcn_tw_back" : "transpose_gcn";
+    OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_GCN, params, fwdEntry, backEntry, Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+
+    return CLFFT_SUCCESS;
+}
+
+//	Computes the 2D global and local work sizes used to launch the GCN transpose kernel of this plan.
+template<>
+clfftStatus FFTPlan::GetWorkSizesPvt<Transpose_GCN>( std::vector< size_t >& globalWS, std::vector< size_t >& localWS ) const
+{
+	FFTKernelGenKeyParams parameters;
+    OPENCL_V( this->GetKernelGenKeyPvt<Transpose_GCN>( parameters ), _T( "GetKernelGenKey() failed!" ) );
+    // Derive the loop count from this plan's precision, not the global loopCount: that is 0 until GenerateKernelPvt runs (Y tile height 0) and may be stale
+    const bool isSingle = ( parameters.fft_precision == CLFFT_SINGLE ) || ( parameters.fft_precision == CLFFT_SINGLE_FAST );
+    const bool isDouble = ( parameters.fft_precision == CLFFT_DOUBLE ) || ( parameters.fft_precision == CLFFT_DOUBLE_FAST );
+    if( !isSingle && !isDouble ) return CLFFT_TRANSPOSED_NOTIMPLEMENTED;	// same rejection as GenerateKernelPvt<Transpose_GCN>
+    const size_t planLoopCount = isSingle ? 16 : 8;
+    // Our transpose works in tiles, so divide tiles in each dimension to get count of blocks, rounding up for remainder items
+    size_t numBlocksX = NumBlocksX(parameters.fft_N[ 0 ]);
+    size_t numBlocksY = DivRoundingUp( parameters.fft_N[ 1 ], lwSize.y / reShapeFactor * planLoopCount );
+    size_t numWIX = numBlocksX * lwSize.x;
+    // Batches of matrices are lined up along the Y axis, 1 after the other
+	size_t numWIY = numBlocksY * lwSize.y * this->batchsize;
+	// fft_DataDim counts one extra dimension devoted to batch; dims 2 to fft_DataDim - 2 are lined up along the Y axis
+	for(int i = 2; i < parameters.fft_DataDim - 1; i++)
+	{
+		numWIY *= parameters.fft_N[i];
+	}
+
+    globalWS.clear( );
+    globalWS.push_back( numWIX );
+    globalWS.push_back( numWIY );
+    localWS.clear( );
+    localWS.push_back( lwSize.x );
+    localWS.push_back( lwSize.y );
+    return CLFFT_SUCCESS;
+}
diff --git a/src/library/generator.transpose.h b/src/library/generator.transpose.gcn.h
similarity index 100%
copy from src/library/generator.transpose.h
copy to src/library/generator.transpose.gcn.h
diff --git a/src/library/generator.transpose.cpp b/src/library/generator.transpose.vliw.cpp
similarity index 94%
rename from src/library/generator.transpose.cpp
rename to src/library/generator.transpose.vliw.cpp
index 0615b99..f715c7d 100644
--- a/src/library/generator.transpose.cpp
+++ b/src/library/generator.transpose.vliw.cpp
@@ -22,7 +22,7 @@
 
 #include "stdafx.h"
 #include <math.h>
-#include "generator.transpose.h"
+#include "generator.transpose.vliw.h"
 
 #define QUOTEMARK(x) #x
 
@@ -737,7 +737,7 @@ static clfftStatus GenerateTransposeKernel (FFTKernelGenKeyParams & params,
 }
 
 template<>
-clfftStatus FFTPlan::GetKernelGenKeyPvt<Transpose> (FFTKernelGenKeyParams & params) const
+clfftStatus FFTPlan::GetKernelGenKeyPvt<Transpose_VLIW> (FFTKernelGenKeyParams & params) const
 {
 
 	//	Query the devices in this context for their local memory sizes
@@ -797,12 +797,12 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Transpose> (FFTKernelGenKeyParams & para
 }
 
 template<>
-clfftStatus FFTPlan::GetWorkSizesPvt<Transpose> (std::vector<size_t> & globalWS, std::vector<size_t> & localWS) const
+clfftStatus FFTPlan::GetWorkSizesPvt<Transpose_VLIW> (std::vector<size_t> & globalWS, std::vector<size_t> & localWS) const
 {
 	//	How many numbers per workitem in the generated kernel?
 	FFTKernelGenKeyParams fftParams;
 	//	Translate the user plan into the structure that we use to map plans to clPrograms
-	OPENCL_V( this->GetKernelGenKeyPvt<Transpose>( fftParams ), _T("GetKernelGenKey() failed!") );
+	OPENCL_V( this->GetKernelGenKeyPvt<Transpose_VLIW>( fftParams ), _T("GetKernelGenKey() failed!") );
 
 	unsigned long long count, count0, count1;
 	count0 = DivRoundingUp<unsigned long long> (this->length[0], fftParams.fft_R);
@@ -822,22 +822,25 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Transpose> (std::vector<size_t> & globalWS,
 //	OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
 //	Feed this generator the FFTPlan, and it returns the generated program as a string
 template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Transpose> ( FFTRepo& fftRepo, const cl_command_queue commQueueFFT ) const
+clfftStatus FFTPlan::GenerateKernelPvt<Transpose_VLIW> ( FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
 {
 	FFTKernelGenKeyParams params;
-	OPENCL_V( this->GetKernelGenKeyPvt<Transpose> (params), _T("GetKernelGenKey() failed!") );
+	OPENCL_V( this->GetKernelGenKeyPvt<Transpose_VLIW> (params), _T("GetKernelGenKey() failed!") );
 
 	std::string programCode;
 	OPENCL_V( GenerateTransposeKernel( params, programCode ), _T( "GenerateTransposeKernel() failed!" ) );
 
-  cl_int status = CL_SUCCESS;
-  cl_context QueueContext = NULL;
-  status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
+    cl_int status = CL_SUCCESS;
+    cl_device_id Device = NULL;
+    status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
+    OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
 
-  OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
+    cl_context QueueContext = NULL;
+    status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
+    OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
 
-  OPENCL_V( fftRepo.setProgramCode( Transpose, params, programCode, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
-	OPENCL_V( fftRepo.setProgramEntryPoints( Transpose, params, "fft_trans", "fft_trans",QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+	OPENCL_V( fftRepo.setProgramCode( Transpose_VLIW, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+	OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_VLIW, params, "fft_trans", "fft_trans", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
 
 	return CLFFT_SUCCESS;
 }
diff --git a/src/library/generator.transpose.h b/src/library/generator.transpose.vliw.h
similarity index 100%
rename from src/library/generator.transpose.h
rename to src/library/generator.transpose.vliw.h
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 5a750d1..60389ad 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -44,6 +44,26 @@ bool operator<( const FFTKernelGenKeyParams& lhs, const FFTKernelGenKeyParams& r
 	return false;
 }
 
+// Returns CLFFT_SUCCESS if 'ext' occurs in the device's CL_DEVICE_EXTENSIONS string, CLFFT_DEVICE_NO_DOUBLE if it does not.
+clfftStatus checkDevExt( std::string ext, const cl_device_id &device )
+{
+	size_t deviceExtSize	= 0;
+	OPENCL_V( ::clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
+		"Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
+	//	A zero-length extension list cannot contain 'ext'; return before indexing an empty buffer
+	if( deviceExtSize == 0 )
+		return CLFFT_DEVICE_NO_DOUBLE;
+
+	std::vector< char > szDeviceExt( deviceExtSize );
+	OPENCL_V( ::clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
+		"Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
+	//	Build the string from the reported byte count so a missing terminator cannot cause an over-read
+	const std::string strDeviceExt( &szDeviceExt[ 0 ], deviceExtSize );
+	if( strDeviceExt.find( ext ) == std::string::npos )
+		return CLFFT_DEVICE_NO_DOUBLE;
+	return CLFFT_SUCCESS;
+}
+
 clfftStatus	clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context context, const clfftDim dim,
 						const size_t* clLengths )
 {
@@ -119,6 +139,7 @@ clfftStatus	clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context contex
 	fftPlan->forwardScale	= 1.0;
 	fftPlan->backwardScale	= 1.0 / static_cast< double >( lenX * lenY * lenZ );
 	fftPlan->batchsize		= 1;
+	fftPlan->userPlan		= true;
 
 	fftPlan->gen			= Stockham; //default setting
 
@@ -126,6 +147,7 @@ clfftStatus	clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context contex
 
 	clRetainContext( fftPlan->context );
 
+#if 0
 	/////////////////////////////////////////////////////////////////
 	// Detect OpenCL devices
 	/////////////////////////////////////////////////////////////////
@@ -140,6 +162,7 @@ clfftStatus	clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context contex
 	/* Now, get the device list data */
 	OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, &fftPlan->devices[ 0 ], NULL ),
 		"Getting device array ( ::clGetContextInfo() )" );
+#endif
 
 	//	Need to devise a way to generate better names
 	tstringstream	tstream;
@@ -193,39 +216,7 @@ clfftStatus	clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context contex
 	return	CLFFT_SUCCESS;
 }
 
-//	Read the kernels that this plan uses from file, and store into the plan
-clfftStatus WriteKernel( const clfftPlanHandle plHandle, const clfftGenerators gen, const FFTKernelGenKeyParams& fftParams, const cl_context& context )
-{
-	FFTRepo& fftRepo	= FFTRepo::getInstance( );
-
-	//	Logic to define a sensible filename
-	const std::string kernelPrefix( "clfft.kernel." );
-	std::string generatorName;
-	std::stringstream kernelPath;
-
-	switch( gen )
-	{
-		case Stockham:		generatorName = "Stockham"; break;
-		case Transpose:		generatorName = "Transpose"; break;
-	}
-
-	kernelPath << kernelPrefix << generatorName << plHandle << ".cl";
 
-	//	Logic to write string contents out to file
-	tofstreamRAII< std::ofstream, std::string > kernelFile( kernelPath.str( ) );
-	if( !kernelFile.get( ) )
-	{
-		std::cerr << "Failed to open kernel file for writing: " << kernelPath.str( ) << std::endl;
-		return CLFFT_FILE_CREATE_FAILURE;
-	}
-
-	std::string kernel;
-	OPENCL_V( fftRepo.getProgramCode( gen, fftParams, kernel, context ), _T( "fftRepo.getProgramCode failed." ) );
-
-	kernelFile.get( ) << kernel << std::endl;
-
-	return	CLFFT_SUCCESS;
-}
 
 // **************** TODO TODO TODO ***********************
 // Making CompileKernels function take in command queue parameter so we can build for 1 particular device only;
@@ -243,23 +234,18 @@ clfftStatus CompileKernels( const cl_command_queue commQueueFFT, const clfftPlan
 
 	// create a cl program executable for the device associated with command queue
 	// Get the device
-	cl_device_id q_device;
-	clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &q_device, NULL);
+	cl_device_id &q_device = fftPlan->bakeDevice;
+	//clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &q_device, NULL);
 
 	FFTKernelGenKeyParams fftParams;
 	OPENCL_V( fftPlan->GetKernelGenKey( fftParams ), _T("GetKernelGenKey() failed!") );
 
 	cl_program program;
-  if( fftRepo.getclProgram( gen, fftParams, program, fftPlan->context ) == CLFFT_INVALID_PROGRAM )
+	if( fftRepo.getclProgram( gen, fftParams, program, q_device, fftPlan->context ) == CLFFT_INVALID_PROGRAM )
 	{
-		//	If the user wishes us to write the kernels out to disk, we do so
-		if( fftRepo.setupData.debugFlags & CLFFT_DUMP_PROGRAMS )
-		{
-			OPENCL_V( WriteKernel( plHandle, gen, fftParams, fftPlan->context ), _T( "WriteKernel failed." ) );
-		}
 
 		std::string programCode;
-		OPENCL_V( fftRepo.getProgramCode( gen, fftParams, programCode, fftPlan->context  ), _T( "fftRepo.getProgramCode failed." ) );
+		OPENCL_V( fftRepo.getProgramCode( gen, fftParams, programCode, q_device, fftPlan->context  ), _T( "fftRepo.getProgramCode failed." ) );
 
 		const char* source = programCode.c_str();
 		program = clCreateProgramWithSource( fftPlan->context, 1, &source, NULL, &status );
@@ -301,23 +287,24 @@ clfftStatus CompileKernels( const cl_command_queue commQueueFFT, const clfftPlan
 			OPENCL_V( status, _T( "clBuildProgram failed" ) );
 		}
 
-		fftRepo.setclProgram( gen, fftParams, program );
+		fftRepo.setclProgram( gen, fftParams, program, q_device, fftPlan->context );
 
 		// For real transforms we comppile either forward or backward kernel
 		bool r2c_transform = (fftParams.fft_inputLayout == CLFFT_REAL);
 		bool c2r_transform = (fftParams.fft_outputLayout == CLFFT_REAL);
-		bool real_transform = (gen == Copy) ? true : (r2c_transform || c2r_transform);
 		bool h2c = (gen == Copy) && ((fftParams.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) || (fftParams.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED));
 		bool c2h = (gen == Copy) && ((fftParams.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) || (fftParams.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED));
+		bool generalCopy = !(h2c || c2h) && (gen == Copy);
+		bool complexTransform = ( !(r2c_transform || c2r_transform) && (gen != Copy) );
 
 		// get a kernel object handle for a kernel with the given name
 		cl_kernel kernel;
-		if( (!real_transform) || r2c_transform || c2h )
+		if( complexTransform || r2c_transform || c2h || generalCopy)
 		{
 			if( fftRepo.getclKernel( program, CLFFT_FORWARD, kernel ) == CLFFT_INVALID_KERNEL )
 			{
 				std::string entryPoint;
-				OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_FORWARD, entryPoint, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
+				OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_FORWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
 
 				kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
 				OPENCL_V( status, _T( "clCreateKernel failed" ) );
@@ -326,12 +313,12 @@ clfftStatus CompileKernels( const cl_command_queue commQueueFFT, const clfftPlan
 			}
 		}
 
-		if( (!real_transform) || c2r_transform || h2c )
+		if( complexTransform || c2r_transform || h2c || generalCopy)
 		{
 			if( fftRepo.getclKernel( program, CLFFT_BACKWARD, kernel ) == CLFFT_INVALID_KERNEL )
 			{
 				std::string entryPoint;
-				OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_BACKWARD, entryPoint, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
+				OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_BACKWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
 
 				kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
 				OPENCL_V( status, _T( "clCreateKernel failed" ) );
@@ -341,130 +328,22 @@ clfftStatus CompileKernels( const cl_command_queue commQueueFFT, const clfftPlan
 		}
 	}
 
-//TODO caching kernel binaries for later reload
-#if 0
-	// figure out number of devices and the sizes of the binary for each device.
-	OPENCL_V( clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, sizeof(fftPlan->number_of_devices), &(fftPlan->number_of_devices), NULL ), _T("CompileKernels(): error getting number of devices") );
-
-
-	// get the sizes of the different binaries
-	fftPlan->ResetBinarySizes();
-	OPENCL_V( clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * fftPlan->number_of_devices, fftPlan->binary_sizes.get(), NULL ), _T("CompileKernels(): error getting binary sizes") );
-
-	// we need a list of naked pointers to all of the binaries for OpenCL
-	std::unique_ptr<char*[]> naked_binary_pointers( new char*[fftPlan->number_of_devices] );
-
-	// make space for all of the generated binaries
-	for( int i = 0; i < fftPlan->number_of_devices; i++ )
-	{
-		// this is our permanent storage place for the binaries
-		fftPlan->binaries.push_back( std::unique_ptr<char[]>(new char[fftPlan->binary_sizes[i]] ) );
-		// and we need this second copy of it for OpenCL
-		naked_binary_pointers[i] = fftPlan->binaries[i].get();
-	}
-
-	// copy all of the generated binaries over
-	OPENCL_V( clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(char*) * fftPlan->number_of_devices, naked_binary_pointers.get(), NULL ), _T("CompileKernels(): error getting program binaries") );
-#endif
 
 	return	CLFFT_SUCCESS;
 }
 
-//TODO caching kernel binaries for later reload
-#if 0
-//	Compile the kernels that this plan uses, and store into the plan
-clfftStatus LoadCompiledKernels( const clfftPlanHandle plHandle, const clfftGenerators gen, FFTPlan* plan )
-{
-	// if there are no devices, there are not any kernels to load
-	if( plan->number_of_devices == 0 )
-		return CLFFT_SUCCESS;
 
-	FFTRepo& repo = FFTRepo::getInstance( );
 
-	FFTKernelGenKeyParams fftParams;
-	OPENCL_V( plan->GetKernelGenKey( fftParams ), _T("GetKernelGenKey() failed!") );
 
-	cl_program program;
-	if( repo.getclProgram( gen, fftParams, program ) == CLFFT_INVALID_PROGRAM )
+inline size_t PrecisionWidth(clfftPrecision pr)	// maps a precision to a width factor: 1 for CLFFT_SINGLE, 2 for CLFFT_DOUBLE
+{
+	switch(pr)
 	{
-		//if( repo.setupData.debugFlags & CLFFT_DUMP_PROGRAMS )
-		//{
-		//	OPENCL_V( WriteKernel( plHandle, gen, fftParams ), _T( "WriteKernel failed." ) );
-		//	//TODO there's no source to spit out, but we should consider giving the user a helpful message
-		//	// such as "there's no source to output -- kernel binaries loaded from file"
-		//}
-
-		std::unique_ptr<cl_int[]> binary_status( new cl_int[plan->number_of_devices] );
-		cl_int error_code;
-
-		std::unique_ptr<const unsigned char*[]> binaries( new const unsigned char*[plan->number_of_devices] );
-		for( int i = 0; i < plan->number_of_devices; i++ )
-		{
-			binaries[i] = reinterpret_cast<const unsigned char*>(plan->binaries[0].get());
-		}
-
-		if( plan->number_of_devices > 0 )
-		{
-			program = clCreateProgramWithBinary( plan->context,
-				(cl_uint)plan->number_of_devices, &plan->devices[0], &plan->binary_sizes[0], &binaries[0],
-				binary_status.get(), &error_code);
-
-			cl_int status = 0;
-			// create a cl program executable for all the devices specified
-			status = clBuildProgram( program, 1, &plan->devices[0], NULL, NULL, NULL);
-
-			if( status != CL_SUCCESS )
-			{
-				if( status == CL_BUILD_PROGRAM_FAILURE )
-				{
-					size_t buildLogSize = 0;
-					OPENCL_V( clGetProgramBuildInfo( program, plan->devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
-						_T( "clGetProgramBuildInfo failed" ) );
-
-					vector< char > buildLog( buildLogSize );
-					::memset( &buildLog[ 0 ], 0x0, buildLogSize );
-
-					OPENCL_V( clGetProgramBuildInfo( program, plan->devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
-						_T( "clGetProgramBuildInfo failed" ) );
-
-					std::cerr << " \n\t\t\tBUILD LOG\n";
-					std::cerr << " ************************************************\n";
-					std::cerr << &buildLog[ 0 ] << std::endl;
-					std::cerr << " ************************************************\n";
-				}
-
-				OPENCL_V( status, _T( "clBuildProgram failed" ) );
-			}
-
-			repo.setclProgram( gen, fftParams, program );
-
-			// get a kernel object handle for a kernel with the given name
-			cl_kernel kernel;
-			if( repo.getclKernel( program, CLFFT_FORWARD, kernel ) == CLFFT_INVALID_KERNEL )
-			{
-				kernel = clCreateKernel( program, "fft_fwd", &status );
-				OPENCL_V( status, _T( "clCreateKernel failed" ) );
-
-				repo.setclKernel( program, CLFFT_FORWARD, kernel );
-			}
-
-			if( repo.getclKernel( program, CLFFT_BACKWARD, kernel ) == CLFFT_INVALID_KERNEL )
-			{
-				kernel = clCreateKernel( program, "fft_back", &status );
-				OPENCL_V( status, _T( "clCreateKernel failed" ) );
-
-				repo.setclKernel( program, CLFFT_BACKWARD, kernel );
-			}
-
-			FFTKernelGenKeyParams params;
-			plan->GetKernelGenKey( params );
-			OPENCL_V( repo.setProgramEntryPoints( Stockham, params, "fft_fwd", "fft_back" ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
-		}
+	case CLFFT_SINGLE:	return 1;
+	case CLFFT_DOUBLE:	return 2;
+	default:		assert(false); return 1;	// any other precision value: assert, then fall back to 1 when asserts are compiled out
 	}
-
-	return CLFFT_SUCCESS;
 }
-#endif
 
 clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_command_queue* commQueueFFT,
 							void (CL_CALLBACK *pfn_notify)( clfftPlanHandle plHandle, void *user_data ), void* user_data )
@@ -506,11 +385,13 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 		case CLFFT_1D: pLength *= fftPlan->length[DimX];
 	}
 
+	const bool rc = (fftPlan->inputLayout == CLFFT_REAL) || (fftPlan->outputLayout == CLFFT_REAL);
+
 	// upper bounds on transfrom lengths - address this in the next release
 	size_t SP_MAX_LEN = 1 << 24;
 	size_t DP_MAX_LEN = 1 << 22;
-	if((fftPlan->precision == CLFFT_SINGLE) && (pLength > SP_MAX_LEN)) return CLFFT_NOTIMPLEMENTED;
-	if((fftPlan->precision == CLFFT_DOUBLE) && (pLength > DP_MAX_LEN)) return CLFFT_NOTIMPLEMENTED;
+	if((fftPlan->precision == CLFFT_SINGLE) && (pLength > SP_MAX_LEN) && rc) return CLFFT_NOTIMPLEMENTED;
+	if((fftPlan->precision == CLFFT_DOUBLE) && (pLength > DP_MAX_LEN) && rc) return CLFFT_NOTIMPLEMENTED;
 
 
 	// release buffers, as these will be created only in EnqueueTransform
@@ -519,7 +400,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 	if( NULL != fftPlan->intBufferC2R ) { OPENCL_V( clReleaseMemObject( fftPlan->intBufferC2R ), _T( "Failed to release internal temporary buffer" ) ); fftPlan->intBufferC2R = NULL; }
 
 
-	if(fftPlan->dim == fftPlan->length.size() && fftPlan->gen != Transpose && fftPlan->gen != Copy) // confirm it is top-level plan (user plan)
+    if( fftPlan->dim == fftPlan->length.size( ) && ( fftPlan->gen != Transpose_VLIW ) && ( fftPlan->gen != Transpose_GCN ) && ( fftPlan->gen != Copy ) ) // confirm it is top-level plan (user plan)
 	{
 		if(fftPlan->placeness == CLFFT_INPLACE)
 		{
@@ -552,42 +433,26 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 		return	CLFFT_SUCCESS;
 	}
 
-//TODO caching kernel binaries for later reload
-#if 0
-	if( fftPlan->readFromFile == true )
-	{
-		OPENCL_V( LoadCompiledKernels( plHandle, fftPlan->gen, fftPlan ), _T( "LoadCompiledKernels() failed" ) );
-
-		// all of the plan compressing and subplan making should be done already,
-		// but we still need to make constant buffers
-		OPENCL_V( fftPlan->AllocateBuffers(), _T("AllocateBuffers() failed"));
-		fftPlan->ConstructAndEnqueueConstantBuffers( commQueueFFT );
-
-		if( fftPlan->planX )
-		{
-			OPENCL_V( clfftBakePlan( fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), "clfftBakePlan failed (planX)" );
-		}
-
-		if( fftPlan->planY )
-		{
-			OPENCL_V( clfftBakePlan( fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), "clfftBakePlan failed (planY)" );
-		}
 
-		if( fftPlan->planZ )
+	if( fftPlan->userPlan )
+	{
+		//	If the user specifies double precision, check that the device supports double precision first
+		if( fftPlan->precision == CLFFT_DOUBLE || fftPlan->precision == CLFFT_DOUBLE_FAST )
 		{
-			OPENCL_V( clfftBakePlan( fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), "clfftBakePlan failed (planZ)" );
+			clfftStatus retAmdFp64 = checkDevExt( "cl_amd_fp64", fftPlan->bakeDevice );
+			if( retAmdFp64 != CLFFT_SUCCESS )
+			{
+				//	If AMD's extension is not supported, check for the Khronos extension
+				clfftStatus retKhrFp64 = checkDevExt( "cl_khr_fp64", fftPlan->bakeDevice );
+				if( retKhrFp64 != CLFFT_SUCCESS )
+					return retKhrFp64;
+			}
 		}
-
-		fftPlan->baked = true;
-		return CLFFT_SUCCESS;
 	}
-#endif
-
-	bool rc = (fftPlan->inputLayout == CLFFT_REAL) || (fftPlan->outputLayout == CLFFT_REAL);
 
 	// Compress the plan by discarding length '1' dimensions
 	// decision to pick generator
-	if(fftPlan->dim == fftPlan->length.size() && fftPlan->gen != Transpose && !rc) // confirm it is top-level plan (user plan)
+	if( fftPlan->userPlan && !rc ) // confirm it is top-level plan (user plan)
 	{
 		size_t dmnsn = fftPlan->dim;
 		bool pow2flag = true;
@@ -651,39 +516,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 	//
 	size_t Large1DThreshold = 0;
 
-	//First time check or see if LDS paramters are set-up.
-	if (fftPlan->uLdsFraction == 0)
-	{
-		switch( fftPlan->dim )
-		{
-		case CLFFT_1D:
-			{
-				if (fftPlan->length[0] < 32768 || fftPlan->length[0] > 1048576)
-					fftPlan->uLdsFraction = 8;
-				else
-					fftPlan->uLdsFraction = 4;
 
-				if (fftPlan->length[0] < 1024 )
-					fftPlan->bLdsComplex = true;
-				else
-					fftPlan->bLdsComplex = false;
-			}
-			break;
-		case CLFFT_2D:
-			{
-				fftPlan->uLdsFraction = 4;
-				fftPlan->bLdsComplex = false;
-			}
-			break;
-		case CLFFT_3D:
-			{
-				//for case 128*128*128 and 1024*128*128, fraction = 8 is faster.
-				fftPlan->uLdsFraction = 4;
-				fftPlan->bLdsComplex = false;
-			}
-			break;
-		}
-	}
 	OPENCL_V(fftPlan->GetMax1DLength (&Large1DThreshold), _T("GetMax1DLength failed"));
 	BUG_CHECK (Large1DThreshold > 1);
 
@@ -698,17 +531,59 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				size_t in_1d, in_x, count;
 
 				BUG_CHECK (IsPo2 (Large1DThreshold))
-					//ARG_CHECK (IsPo2 (fftPlan->length[0]))
 
-					// see whether large1D_Xfactor are fixed or not
-					if (fftPlan->large1D_Xfactor == 0 )
+
+				if( IsPo2(fftPlan->length[0]) )
+				{
+					// Enable block compute under these conditions
+					if( (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1) && !rc
+						&& (fftPlan->length[0] <= 1048576/PrecisionWidth(fftPlan->precision)) )
+					{
+						fftPlan->blockCompute = true;
+
+						if(1 == PrecisionWidth(fftPlan->precision))
+						{
+							switch(fftPlan->length[0])
+							{
+							case 8192:		clLengths[1] = 64;	break;
+							case 16384:		clLengths[1] = 64;	break;
+							case 32768:		clLengths[1] = 128;	break;
+							case 65536:		clLengths[1] = 256;	break;
+							case 131072:	clLengths[1] = 64;	break;
+							case 262144:	clLengths[1] = 64;	break;
+							case 524288:	clLengths[1] = 256; break;
+							case 1048576:	clLengths[1] = 256; break;
+							default:		assert(false);
+							}
+						}
+						else
+						{
+							switch(fftPlan->length[0])
+							{
+							case 4096:		clLengths[1] = 64;	break;
+							case 8192:		clLengths[1] = 64;	break;
+							case 16384:		clLengths[1] = 64;	break;
+							case 32768:		clLengths[1] = 128;	break;
+							case 65536:		clLengths[1] = 64;	break;
+							case 131072:	clLengths[1] = 64;	break;
+							case 262144:	clLengths[1] = 128;	break;
+							case 524288:	clLengths[1] = 256; break;
+							default:		assert(false);
+							}
+						}
+					}
+					else
 					{
-						if( IsPo2(fftPlan->length[0]) )
+						if(fftPlan->length[0] > (Large1DThreshold * Large1DThreshold) )
+						{
+							clLengths[1] = fftPlan->length[0] / Large1DThreshold;
+						}
+						else
 						{
 							in_1d = BitScanF (Large1DThreshold);	// this is log2(LARGE1D_THRESHOLD)
 							in_x  = BitScanF (fftPlan->length[0]);	// this is log2(length)
 							BUG_CHECK (in_1d > 0)
-								count = in_x/in_1d;
+							count = in_x/in_1d;
 							if (count*in_1d < in_x)
 							{
 								count++;
@@ -716,74 +591,570 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 								if (in_1d * count < in_x) in_1d++;
 							}
 							clLengths[1] = (size_t)1 << in_1d;
+						}
+					}
+				}
+				else
+				{
+					// This array must be kept sorted in the ascending order
+					size_t supported[] = {	1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 27, 30, 32, 36, 40,
+											45, 48, 50, 54, 60, 64, 72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135,
+											144, 150, 160, 162, 180, 192, 200, 216, 225, 240, 243, 250, 256, 270, 288,
+											300, 320, 324, 360, 375, 384, 400, 405, 432, 450, 480, 486, 500, 512, 540,
+											576, 600, 625, 640, 648, 675, 720, 729, 750, 768, 800, 810, 864, 900, 960,
+											972, 1000, 1024, 1080, 1125, 1152, 1200, 1215, 1250, 1280, 1296, 1350, 1440,
+											1458, 1500, 1536, 1600, 1620, 1728, 1800, 1875, 1920, 1944, 2000, 2025, 2048,
+											2160, 2187, 2250, 2304, 2400, 2430, 2500, 2560, 2592, 2700, 2880, 2916, 3000,
+											3072, 3125, 3200, 3240, 3375, 3456, 3600, 3645, 3750, 3840, 3888, 4000, 4050, 4096 };
+
+					size_t lenSupported = sizeof(supported)/sizeof(supported[0]);
+					size_t maxFactoredLength = (supported[lenSupported-1] < Large1DThreshold) ? supported[lenSupported-1] : Large1DThreshold;
+
+					size_t halfPowerLength = (size_t)1 << ( (StockhamGenerator::CeilPo2(fftPlan->length[0]) + 1) / 2 );
+					size_t factoredLengthStart =  (halfPowerLength < maxFactoredLength) ? halfPowerLength : maxFactoredLength;
+
+					size_t indexStart = 0;
+					while(supported[indexStart] < factoredLengthStart) indexStart++;
+
+					for(size_t i = indexStart; i >= 1; i--)
+					{
+						if( fftPlan->length[0] % supported[i] == 0 )
+						{
+							clLengths[1] = supported[i];
+							break;
+						}
+					}
+				}
+
+				clLengths[0] = fftPlan->length[0]/clLengths[1];
+
+
+                // Start of block where transposes are generated; 1D FFT
+				while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
+				{
+					if (!IsPo2(fftPlan->length[0])) break;
+
+					//TBD, only one dimension?
+					if (fftPlan->length.size() > 1) break;
+					if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break;
+
+					if (fftPlan->length[0] <= 1048576/PrecisionWidth(fftPlan->precision)) break;
 
+
+					ARG_CHECK(clLengths[0] <= Large1DThreshold);
+					ARG_CHECK(clLengths[0]>=32 && clLengths[1]>=32);
+
+					size_t padding = 64;
+					size_t biggerDim = clLengths[0] > clLengths[1] ? clLengths[0] : clLengths[1];
+					size_t smallerDim = biggerDim == clLengths[0] ? clLengths[1] : clLengths[0];
+
+					if (fftPlan->tmpBufSize==0 )
+					{
+						fftPlan->tmpBufSize = (smallerDim + padding) * biggerDim *
+							fftPlan->batchsize * fftPlan->ElementSize();
+					}
+
+					//Transpose
+					//Input --> tmp buffer
+					OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
+						_T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
+
+					FFTPlan* trans1Plan	= NULL;
+					lockRAII* trans1Lock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
+
+					trans1Plan->placeness     = CLFFT_OUTOFPLACE;
+					trans1Plan->precision     = fftPlan->precision;
+					trans1Plan->tmpBufSize    = 0;
+					trans1Plan->batchsize     = fftPlan->batchsize;
+					trans1Plan->envelope	  = fftPlan->envelope;
+					trans1Plan->inputLayout   = fftPlan->inputLayout;
+					trans1Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+					trans1Plan->inStride[0]   = fftPlan->inStride[0];
+					trans1Plan->inStride[1]   = clLengths[0];
+					trans1Plan->outStride[0]  = 1;
+					trans1Plan->outStride[1]  = clLengths[1] + padding;
+					trans1Plan->iDist         = fftPlan->iDist;
+					trans1Plan->oDist         = clLengths[0] * trans1Plan->outStride[1];
+					trans1Plan->gen           = Transpose_GCN;
+					trans1Plan->transflag     = true;
+
+					OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
+						_T( "BakePlan large1d trans1 plan failed" ) );
+
+					//Row transform
+					//tmp->output
+					//size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
+					OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
+						_T( "CreateDefaultPlan Large1d column failed" ) );
+
+					FFTPlan* row1Plan	= NULL;
+					lockRAII* row1Lock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planX, row1Plan, row1Lock ), _T( "fftRepo.getPlan failed" ) );
+
+					row1Plan->placeness     = CLFFT_OUTOFPLACE;
+					row1Plan->precision     = fftPlan->precision;
+					row1Plan->forwardScale  = 1.0f;
+					row1Plan->backwardScale = 1.0f;
+					row1Plan->tmpBufSize    = 0;
+					row1Plan->batchsize     = fftPlan->batchsize;
+
+					row1Plan->gen			= fftPlan->gen;
+					row1Plan->envelope		= fftPlan->envelope;
+
+					// large1D is cleared for this row pass; it is set on the second transpose plan (trans2Plan) instead
+					row1Plan->large1D		= 0;
+
+					row1Plan->length.push_back(clLengths[0]);
+					row1Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+					row1Plan->outputLayout  = fftPlan->outputLayout;
+					row1Plan->inStride[0]   = 1;
+					row1Plan->outStride[0]  = fftPlan->outStride[0];
+					row1Plan->inStride.push_back(clLengths[1]+padding);
+					row1Plan->outStride.push_back(clLengths[1]);
+					row1Plan->iDist         = clLengths[0] * row1Plan->inStride[1];
+					row1Plan->oDist         = fftPlan->oDist;
+
+
+					OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
+						_T( "BakePlan large1d first row plan failed" ) );
+
+					//Transpose 2
+					//Output --> tmp buffer
+					clLengths[2] = clLengths[0];
+					OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTY, fftPlan->context, CLFFT_2D, &clLengths[1] ),
+						_T( "CreateDefaultPlan Large1d transpose 2 failed" ) );
+
+					FFTPlan* trans2Plan	= NULL;
+					lockRAII* trans2Lock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
+
+					trans2Plan->placeness     = CLFFT_OUTOFPLACE;
+					trans2Plan->precision     = fftPlan->precision;
+					trans2Plan->tmpBufSize    = 0;
+					trans2Plan->batchsize     = fftPlan->batchsize;
+					trans2Plan->envelope	  = fftPlan->envelope;
+					trans2Plan->inputLayout   = fftPlan->outputLayout;
+					trans2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+					trans2Plan->inStride[0]   = fftPlan->outStride[0];
+					trans2Plan->inStride[1]   = clLengths[1];
+					trans2Plan->outStride[0]  = 1;
+					trans2Plan->outStride[1]  = clLengths[0] + padding;
+					trans2Plan->iDist         = fftPlan->oDist;
+					trans2Plan->oDist         = clLengths[1] * trans2Plan->outStride[1];
+                    trans2Plan->gen           = Transpose_GCN;
+					trans2Plan->large1D			= fftPlan->length[0];
+					trans2Plan->transflag     = true;
+
+					OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
+						_T( "BakePlan large1d trans2 plan failed" ) );
+
+					//Row transform 2
+					//tmp->tmp
+					//size clLengths[0], batch clLengths[1]
+					OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
+						_T( "CreateDefaultPlan Large1d second row plan failed" ) );
+
+					FFTPlan* row2Plan	= NULL;
+					lockRAII* row2Lock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planY, row2Plan, row2Lock ), _T( "fftRepo.getPlan failed" ) );
+
+					row2Plan->placeness     = CLFFT_INPLACE;
+					row2Plan->precision     = fftPlan->precision;
+					row2Plan->forwardScale  = fftPlan->forwardScale;
+					row2Plan->backwardScale = fftPlan->backwardScale;
+					row2Plan->tmpBufSize    = 0;
+					row2Plan->batchsize     = fftPlan->batchsize;
+
+					row2Plan->gen			= fftPlan->gen;
+					row2Plan->envelope		= fftPlan->envelope;
+
+
+					row2Plan->length.push_back(clLengths[1]);
+					row2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+					row2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+					row2Plan->inStride[0]   = 1;
+					row2Plan->outStride[0]  = 1;
+					row2Plan->inStride.push_back(clLengths[0] + padding);
+					row2Plan->outStride.push_back(clLengths[0] + padding);
+					row2Plan->iDist         = clLengths[1] * row2Plan->inStride[1];
+					row2Plan->oDist         = clLengths[1] * row2Plan->outStride[1];
+
+
+					OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
+						_T( "BakePlan large1d second row plan failed" ) );
+
+					//Transpose 3
+					//tmp --> output
+					OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
+						_T( "CreateDefaultPlan Large1d transpose 3 failed" ) );
+
+					FFTPlan* trans3Plan	= NULL;
+					lockRAII* trans3Lock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
+
+					trans3Plan->placeness     = CLFFT_OUTOFPLACE;
+					trans3Plan->precision     = fftPlan->precision;
+					trans3Plan->tmpBufSize    = 0;
+					trans3Plan->batchsize     = fftPlan->batchsize;
+					trans3Plan->envelope	  = fftPlan->envelope;
+					trans3Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+					trans3Plan->outputLayout  = fftPlan->outputLayout;
+					trans3Plan->inStride[0]   = 1;
+					trans3Plan->inStride[1]   = clLengths[0] + padding;
+					trans3Plan->outStride[0]  = fftPlan->outStride[0];
+					trans3Plan->outStride[1]  = clLengths[1];
+					trans3Plan->iDist         = clLengths[1] * trans3Plan->inStride[1];
+					trans3Plan->oDist         = fftPlan->oDist;
+                    trans3Plan->gen           = Transpose_GCN;
+					trans3Plan->transflag     = true;
+					trans3Plan->transOutHorizontal = true;
+
+					OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
+						_T( "BakePlan large1d trans3 plan failed" ) );
+
+					fftPlan->transflag = true;
+					fftPlan->baked = true;
+					return	CLFFT_SUCCESS;
+				}
+
+				size_t length0 = clLengths[0];
+				size_t length1 = clLengths[1];
+
+				if(fftPlan->inputLayout == CLFFT_REAL)
+				{
+					if (fftPlan->tmpBufSizeRC==0 )
+					{
+						fftPlan->tmpBufSizeRC = length0 * length1 *
+							fftPlan->batchsize * fftPlan->ElementSize();
+						for (size_t index=1; index < fftPlan->length.size(); index++)
+						{
+							fftPlan->tmpBufSizeRC *= fftPlan->length[index];
 						}
-						else
+					}
+
+					// column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
+					// transposed output
+					OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
+						_T( "CreateDefaultPlan Large1d column failed" ) );
+
+					FFTPlan* colTPlan	= NULL;
+					lockRAII* colLock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+					// current plan is to create intermediate buffer, packed and interleave
+					// This is a column FFT, the first elements distance between each FFT is the distance of the first two
+					// elements in the original buffer. Like a transpose of the matrix
+					// we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
+
+					// this part is common to both passes
+					colTPlan->placeness     = CLFFT_OUTOFPLACE;
+					colTPlan->precision     = fftPlan->precision;
+					colTPlan->forwardScale  = 1.0f;
+					colTPlan->backwardScale = 1.0f;
+					colTPlan->tmpBufSize    = 0;
+					colTPlan->batchsize     = fftPlan->batchsize;
+
+					colTPlan->gen			= fftPlan->gen;
+					colTPlan->envelope			= fftPlan->envelope;
+
+					//Pass large1D flag to confirm we need multiply twiddle factor
+					colTPlan->large1D       = fftPlan->length[0];
+					colTPlan->RCsimple		= true;
+
+					colTPlan->length.push_back(clLengths[0]);
+
+					// first Pass
+					colTPlan->inputLayout   = fftPlan->inputLayout;
+					colTPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+					colTPlan->inStride[0]   = fftPlan->inStride[0] * clLengths[0];
+					colTPlan->outStride[0]  = 1;
+					colTPlan->iDist         = fftPlan->iDist;
+					colTPlan->oDist         = length0 * length1;//fftPlan->length[0];
+					colTPlan->inStride.push_back(fftPlan->inStride[0]);
+					colTPlan->outStride.push_back(length1);//clLengths[1]);
+
+					for (size_t index=1; index < fftPlan->length.size(); index++)
+					{
+						colTPlan->length.push_back(fftPlan->length[index]);
+						colTPlan->inStride.push_back(fftPlan->inStride[index]);
+						// tmp buffer is tightly packed
+						colTPlan->outStride.push_back(colTPlan->oDist);
+						colTPlan->oDist        *= fftPlan->length[index];
+					}
+
+					OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
+
+					//another column FFT, size clLengths[0], batch clLengths[1], output without transpose
+					OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D,  &clLengths[0] ),
+						_T( "CreateDefaultPlan large1D row failed" ) );
+
+					FFTPlan* col2Plan	= NULL;
+					lockRAII* rowLock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+					// This is second column fft, intermediate buffer is packed and interleaved
+					// we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+
+					// common part for both passes
+					col2Plan->placeness     = CLFFT_INPLACE;
+					col2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+					col2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+
+					col2Plan->precision     = fftPlan->precision;
+					col2Plan->forwardScale  = fftPlan->forwardScale;
+					col2Plan->backwardScale = fftPlan->backwardScale;
+					col2Plan->tmpBufSize    = 0;
+					col2Plan->batchsize     = fftPlan->batchsize;
+
+					col2Plan->gen			= fftPlan->gen;
+					col2Plan->envelope			= fftPlan->envelope;
+
+					col2Plan->length.push_back(length1);
+
+					col2Plan->inStride[0]  = length1;
+					col2Plan->inStride.push_back(1);
+					col2Plan->iDist        = length0 * length1;
+
+					col2Plan->outStride[0] = length1;
+					col2Plan->outStride.push_back(1);
+					col2Plan->oDist         = length0 * length1;
+
+					for (size_t index=1; index < fftPlan->length.size(); index++)
+					{
+						col2Plan->length.push_back(fftPlan->length[index]);
+						col2Plan->inStride.push_back(col2Plan->iDist);
+						col2Plan->outStride.push_back(col2Plan->oDist);
+						col2Plan->iDist   *= fftPlan->length[index];
+						col2Plan->oDist   *= fftPlan->length[index];
+					}
+
+					OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
+
+
+					// copy plan to get back to hermitian
+					OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D,  &fftPlan->length[0] ),
+						_T( "CreateDefaultPlan RC copy failed" ) );
+
+					FFTPlan* copyPlan	= NULL;
+					lockRAII* copyLock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
+
+					// This is the copy plan: its input is the packed, interleaved intermediate buffer (unit input stride);
+					// its output takes the layout, stride and distance from fftPlan
+
+					// common part for both passes
+					copyPlan->placeness     = CLFFT_OUTOFPLACE;
+					copyPlan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+					copyPlan->outputLayout  = fftPlan->outputLayout;
+
+					copyPlan->precision     = fftPlan->precision;
+					copyPlan->forwardScale  = 1.0f;
+					copyPlan->backwardScale = 1.0f;
+					copyPlan->tmpBufSize    = 0;
+					copyPlan->batchsize     = fftPlan->batchsize;
+
+					copyPlan->gen			= Copy;
+					copyPlan->envelope		= fftPlan->envelope;
+
+
+					copyPlan->inStride[0]  = 1;
+					copyPlan->iDist        = fftPlan->length[0];
+
+					copyPlan->outStride[0] = fftPlan->outStride[0];
+					copyPlan->oDist         = fftPlan->oDist;
+
+					for (size_t index=1; index < fftPlan->length.size(); index++)
+					{
+						copyPlan->length.push_back(fftPlan->length[index]);
+						copyPlan->inStride.push_back(copyPlan->inStride[index-1] * fftPlan->length[index-1]);
+						copyPlan->iDist   *= fftPlan->length[index];
+						copyPlan->outStride.push_back(fftPlan->outStride[index]);
+					}
+
+					OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
+
+				}
+				else if(fftPlan->outputLayout == CLFFT_REAL)
+				{
+					if (fftPlan->tmpBufSizeRC==0 )
+					{
+						fftPlan->tmpBufSizeRC = length0 * length1 *
+							fftPlan->batchsize * fftPlan->ElementSize();
+						for (size_t index=1; index < fftPlan->length.size(); index++)
 						{
-							// This array must be kept sorted in the ascending order
-							size_t supported[] = {	1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 27, 30, 32, 36, 40,
-													45, 48, 50, 54, 60, 64, 72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135,
-													144, 150, 160, 162, 180, 192, 200, 216, 225, 240, 243, 250, 256, 270, 288,
-													300, 320, 324, 360, 375, 384, 400, 405, 432, 450, 480, 486, 500, 512, 540,
-													576, 600, 625, 640, 648, 675, 720, 729, 750, 768, 800, 810, 864, 900, 960,
-													972, 1000, 1024, 1080, 1125, 1152, 1200, 1215, 1250, 1280, 1296, 1350, 1440,
-													1458, 1500, 1536, 1600, 1620, 1728, 1800, 1875, 1920, 1944, 2000, 2025, 2048,
-													2160, 2187, 2250, 2304, 2400, 2430, 2500, 2560, 2592, 2700, 2880, 2916, 3000,
-													3072, 3125, 3200, 3240, 3375, 3456, 3600, 3645, 3750, 3840, 3888, 4000, 4050, 4096 };
-
-							size_t lenSupported = sizeof(supported)/sizeof(supported[0]);
-							size_t maxFactoredLength = (supported[lenSupported-1] < Large1DThreshold) ? supported[lenSupported-1] : Large1DThreshold;
-
-							size_t halfPowerLength = (size_t)1 << ( (StockhamGenerator::CeilPo2(fftPlan->length[0]) + 1) / 2 );
-							size_t factoredLengthStart =  (halfPowerLength < maxFactoredLength) ? halfPowerLength : maxFactoredLength;
-
-							size_t indexStart = 0;
-							while(supported[indexStart] < factoredLengthStart) indexStart++;
-
-							for(size_t i = indexStart; i >= 1; i--)
-							{
-								if( fftPlan->length[0] % supported[i] == 0 )
-								{
-									clLengths[1] = supported[i];
-									break;
-								}
-							}
+							fftPlan->tmpBufSizeRC *= fftPlan->length[index];
 						}
+					}
+
+					// copy plan to go from hermitian to full complex
+					OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D,  &fftPlan->length[0] ),
+						_T( "CreateDefaultPlan RC copy failed" ) );
+
+					FFTPlan* copyPlan	= NULL;
+					lockRAII* copyLock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
 
-						clLengths[0] = fftPlan->length[0]/clLengths[1];
+					// This is the copy plan: its input takes the layout, stride and distance from fftPlan;
+					// its output is a packed, interleaved complex buffer (unit output stride)
+
+					// common part for both passes
+					copyPlan->placeness     = CLFFT_OUTOFPLACE;
+					copyPlan->inputLayout   = fftPlan->inputLayout;
+					copyPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+
+					copyPlan->precision     = fftPlan->precision;
+					copyPlan->forwardScale  = 1.0f;
+					copyPlan->backwardScale = 1.0f;
+					copyPlan->tmpBufSize    = 0;
+					copyPlan->batchsize     = fftPlan->batchsize;
+
+					copyPlan->gen			= Copy;
+					copyPlan->envelope		= fftPlan->envelope;
+
+					copyPlan->inStride[0]  = fftPlan->inStride[0];
+					copyPlan->iDist        = fftPlan->iDist;
+
+					copyPlan->outStride[0]  = 1;
+					copyPlan->oDist        = fftPlan->length[0];
+
+					for (size_t index=1; index < fftPlan->length.size(); index++)
+					{
+						copyPlan->length.push_back(fftPlan->length[index]);
+						copyPlan->outStride.push_back(copyPlan->outStride[index-1] * fftPlan->length[index-1]);
+						copyPlan->oDist   *= fftPlan->length[index];
+						copyPlan->inStride.push_back(fftPlan->inStride[index]);
 					}
-					else
+
+					OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
+
+					// column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
+					// transposed output
+					OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
+						_T( "CreateDefaultPlan Large1d column failed" ) );
+
+					FFTPlan* colTPlan	= NULL;
+					lockRAII* colLock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+
+					// current plan is to create intermediate buffer, packed and interleave
+					// This is a column FFT, the first elements distance between each FFT is the distance of the first two
+					// elements in the original buffer. Like a transpose of the matrix
+					// we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
+
+					// this part is common to both passes
+					colTPlan->placeness     = CLFFT_INPLACE;
+					colTPlan->precision     = fftPlan->precision;
+					colTPlan->forwardScale  = 1.0f;
+					colTPlan->backwardScale = 1.0f;
+					colTPlan->tmpBufSize    = 0;
+					colTPlan->batchsize     = fftPlan->batchsize;
+
+					colTPlan->gen			= fftPlan->gen;
+					colTPlan->envelope			= fftPlan->envelope;
+
+					//Pass large1D flag to confirm we need multiply twiddle factor
+					colTPlan->large1D       = fftPlan->length[0];
+
+					colTPlan->length.push_back(clLengths[0]);
+
+					// first Pass
+					colTPlan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+					colTPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+
+
+					colTPlan->inStride[0]  = length0;
+					colTPlan->inStride.push_back(1);
+					colTPlan->iDist        = length0 * length1;
+
+					colTPlan->outStride[0] = length0;
+					colTPlan->outStride.push_back(1);
+					colTPlan->oDist         = length0 * length1;
+
+					for (size_t index=1; index < fftPlan->length.size(); index++)
 					{
-						//large1D_Xfactor will not pass to the second level of call
-						clLengths[0] = fftPlan->large1D_Xfactor;
-						clLengths[1] = fftPlan->length[0]/clLengths[0];
-						ARG_CHECK (fftPlan->length[0] == clLengths[0] * clLengths[1]);
+						colTPlan->length.push_back(fftPlan->length[index]);
+						colTPlan->inStride.push_back(colTPlan->iDist);
+						colTPlan->outStride.push_back(colTPlan->oDist);
+						colTPlan->iDist   *= fftPlan->length[index];
+						colTPlan->oDist   *= fftPlan->length[index];
 					}
 
-					while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
+
+					OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
+
+					//another column FFT, size clLengths[0], batch clLengths[1], output without transpose
+					OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D,  &clLengths[0] ),
+						_T( "CreateDefaultPlan large1D row failed" ) );
+
+					FFTPlan* col2Plan	= NULL;
+					lockRAII* rowLock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+
+					// This is second column fft, intermediate buffer is packed and interleaved
+					// we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+
+					// common part for both passes
+					col2Plan->placeness     = CLFFT_OUTOFPLACE;
+					col2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+					col2Plan->outputLayout  = fftPlan->outputLayout;
+
+					col2Plan->precision     = fftPlan->precision;
+					col2Plan->forwardScale  = fftPlan->forwardScale;
+					col2Plan->backwardScale = fftPlan->backwardScale;
+					col2Plan->tmpBufSize    = 0;
+					col2Plan->batchsize     = fftPlan->batchsize;
+
+					col2Plan->gen			= fftPlan->gen;
+					col2Plan->envelope			= fftPlan->envelope;
+
+					col2Plan->RCsimple = true;
+					col2Plan->length.push_back(length1);
+
+					col2Plan->inStride[0]  = 1;
+					col2Plan->inStride.push_back(length0);
+					col2Plan->iDist        = length0 * length1;
+
+					col2Plan->outStride[0] = length1 * fftPlan->outStride[0];
+					col2Plan->outStride.push_back(fftPlan->outStride[0]);
+					col2Plan->oDist         = fftPlan->oDist;
+
+					for (size_t index=1; index < fftPlan->length.size(); index++)
+					{
+						col2Plan->length.push_back(fftPlan->length[index]);
+						col2Plan->inStride.push_back(col2Plan->iDist);
+						col2Plan->iDist   *= fftPlan->length[index];
+						col2Plan->outStride.push_back(fftPlan->outStride[index]);
+					}
+
+					OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
+				}
+				else
+				{
+
+					if( (fftPlan->length[0] > 262144/PrecisionWidth(fftPlan->precision)) && fftPlan->blockCompute )
 					{
-						if (!IsPo2(fftPlan->length[0])) break;
-						//if (fftPlan->precision != CLFFT_SINGLE) break;
-						//TBD, only one dimension?
-						if (fftPlan->length.size() > 1) break;
-						if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break;
-						//This length is good for using transpose
-						if (fftPlan->length[0] < 131072) break;
-
-						//first version not support huge1D, TBD
-						if (clLengths[0] > Large1DThreshold) break;
-						ARG_CHECK(clLengths[0]>=32 && clLengths[1]>=32);
+						assert(fftPlan->length[0] <= 1048576);
 
+
+						size_t padding = 64;	
 						if (fftPlan->tmpBufSize==0 )
 						{
-							fftPlan->tmpBufSize = clLengths[0] * clLengths[1] *
-								fftPlan->batchsize * fftPlan->ElementSize();
+							fftPlan->tmpBufSize = (length1 + padding) * length0 *
+									fftPlan->batchsize * fftPlan->ElementSize();
+							for (size_t index=1; index < fftPlan->length.size(); index++)
+							{
+								fftPlan->tmpBufSize *= fftPlan->length[index];
+							}
 						}
 
-						//Transpose
-						//Input --> tmp buffer
-						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
-							_T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
+						// Algorithm in this case is 
+						// T(with pad, out_of_place), R (in_place), C(in_place), Unpad(out_of_place)
+
+						size_t len[3] = { clLengths[1], clLengths[0], 1 };
+
+						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTX, fftPlan->context, CLFFT_2D, len ),
+						_T( "CreateDefaultPlan Large1d trans1 failed" ) );
 
 						FFTPlan* trans1Plan	= NULL;
 						lockRAII* trans1Lock	= NULL;
@@ -797,260 +1168,99 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						trans1Plan->inputLayout   = fftPlan->inputLayout;
 						trans1Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
 						trans1Plan->inStride[0]   = fftPlan->inStride[0];
-						trans1Plan->inStride[1]   = clLengths[0];
+						trans1Plan->inStride[1]   = length1;
 						trans1Plan->outStride[0]  = 1;
-						trans1Plan->outStride[1]  = clLengths[1];
+						trans1Plan->outStride[1]  = length0 + padding;
 						trans1Plan->iDist         = fftPlan->iDist;
-						trans1Plan->oDist         = fftPlan->length[0];
-						trans1Plan->gen           = Transpose;
+						trans1Plan->oDist         = length1 * trans1Plan->outStride[1];
+						trans1Plan->gen           = Transpose_GCN;
 						trans1Plan->transflag     = true;
 
 						OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
 							_T( "BakePlan large1d trans1 plan failed" ) );
 
-						//Row transform
-						//tmp->output
-						//size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
-						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
-							_T( "CreateDefaultPlan Large1d column failed" ) );
 
-						FFTPlan* row1Plan	= NULL;
-						lockRAII* row1Lock	= NULL;
-						OPENCL_V( fftRepo.getPlan( fftPlan->planX, row1Plan, row1Lock ), _T( "fftRepo.getPlan failed" ) );
-
-						row1Plan->placeness     = CLFFT_OUTOFPLACE;
-						row1Plan->precision     = fftPlan->precision;
-						row1Plan->forwardScale  = 1.0f;
-						row1Plan->backwardScale = 1.0f;
-						row1Plan->tmpBufSize    = 0;
-						row1Plan->batchsize     = fftPlan->batchsize;
-						row1Plan->bLdsComplex   = fftPlan->bLdsComplex;
-						row1Plan->uLdsFraction  = fftPlan->uLdsFraction;
-						row1Plan->ldsPadding    = fftPlan->ldsPadding;
-						row1Plan->gen			= fftPlan->gen;
-						row1Plan->envelope		= fftPlan->envelope;
-
-						//Pass large1D flag to confirm we need multiply twiddle factor
-						row1Plan->large1D       = fftPlan->length[0];
-
-						row1Plan->length.push_back(clLengths[0]);
-						row1Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
-						row1Plan->outputLayout  = fftPlan->outputLayout;
-						row1Plan->inStride[0]   = 1;
-						row1Plan->outStride[0]  = fftPlan->outStride[0];
-						row1Plan->iDist         = fftPlan->length[0];
-						row1Plan->oDist         = fftPlan->oDist;
-						row1Plan->inStride.push_back(clLengths[1]);
-						row1Plan->outStride.push_back(clLengths[1]);
-
-						OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
-							_T( "BakePlan large1d first row plan failed" ) );
-
-						//Transpose 2
-						//Output --> tmp buffer
-						clLengths[2] = clLengths[0];
-						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTY, fftPlan->context, CLFFT_2D, &clLengths[1] ),
-							_T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
-
-						FFTPlan* trans2Plan	= NULL;
-						lockRAII* trans2Lock	= NULL;
-						OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
-
-						trans2Plan->placeness     = CLFFT_OUTOFPLACE;
-						trans2Plan->precision     = fftPlan->precision;
-						trans2Plan->tmpBufSize    = 0;
-						trans2Plan->batchsize     = fftPlan->batchsize;
-						trans2Plan->envelope	  = fftPlan->envelope;
-						trans2Plan->inputLayout   = fftPlan->outputLayout;
-						trans2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
-						trans2Plan->inStride[0]   = fftPlan->outStride[0];
-						trans2Plan->inStride[1]   = clLengths[1];
-						trans2Plan->outStride[0]  = 1;
-						trans2Plan->outStride[1]  = clLengths[0];
-						trans2Plan->iDist         = fftPlan->oDist;
-						trans2Plan->oDist         = fftPlan->length[0];
-						trans2Plan->gen           = Transpose;
-						trans2Plan->transflag     = true;
-
-						OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
-							_T( "BakePlan large1d trans2 plan failed" ) );
-
-						//Row transform 2
-						//tmp->tmp
-						//size clLengths[0], batch clLengths[1]
-						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
+						// row FFT
+						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[0] ),
 							_T( "CreateDefaultPlan Large1d column failed" ) );
 
-						FFTPlan* row2Plan	= NULL;
-						lockRAII* row2Lock	= NULL;
-						OPENCL_V( fftRepo.getPlan( fftPlan->planY, row2Plan, row2Lock ), _T( "fftRepo.getPlan failed" ) );
-
-						row2Plan->placeness     = CLFFT_INPLACE;
-						row2Plan->precision     = fftPlan->precision;
-						row2Plan->forwardScale  = fftPlan->forwardScale;
-						row2Plan->backwardScale = fftPlan->backwardScale;
-						row2Plan->tmpBufSize    = 0;
-						row2Plan->batchsize     = fftPlan->batchsize;
-						row2Plan->bLdsComplex   = fftPlan->bLdsComplex;
-						row2Plan->uLdsFraction  = fftPlan->uLdsFraction;
-						row2Plan->ldsPadding    = fftPlan->ldsPadding;
-						row2Plan->gen			= fftPlan->gen;
-						row2Plan->envelope		= fftPlan->envelope;
-
-						//No twiddle factor is needed.
-						row2Plan->large1D       = 0;
-
-						row2Plan->length.push_back(clLengths[1]);
-						row2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
-						row2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
-						row2Plan->inStride[0]   = 1;
-						row2Plan->outStride[0]  = 1;
-						row2Plan->iDist         = fftPlan->length[0];
-						row2Plan->oDist         = fftPlan->length[0];
-						row2Plan->inStride.push_back(clLengths[0]);
-						row2Plan->outStride.push_back(clLengths[0]);
-
-						OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
-							_T( "BakePlan large1d first row plan failed" ) );
-
-						//Transpose 3
-						//tmp --> output
-						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
-							_T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
-
-						FFTPlan* trans3Plan	= NULL;
-						lockRAII* trans3Lock	= NULL;
-						OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
-
-						trans3Plan->placeness     = CLFFT_OUTOFPLACE;
-						trans3Plan->precision     = fftPlan->precision;
-						trans3Plan->tmpBufSize    = 0;
-						trans3Plan->batchsize     = fftPlan->batchsize;
-						trans3Plan->envelope	  = fftPlan->envelope;
-						trans3Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
-						trans3Plan->outputLayout  = fftPlan->outputLayout;
-						trans3Plan->inStride[0]   = 1;
-						trans3Plan->inStride[1]   = clLengths[0];
-						trans3Plan->outStride[0]  = fftPlan->outStride[0];
-						trans3Plan->outStride[1]  = clLengths[1];
-						trans3Plan->iDist         = fftPlan->length[0];
-						trans3Plan->oDist         = fftPlan->oDist;
-						trans3Plan->gen           = Transpose;
-						trans3Plan->transflag     = true;
-
-						OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
-							_T( "BakePlan large1d trans3 plan failed" ) );
-
-						fftPlan->transflag = true;
-						fftPlan->baked = true;
-						return	CLFFT_SUCCESS;
-					}
+						FFTPlan* rowPlan	= NULL;
+						lockRAII* rowLock	= NULL;
+						OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
 
-					size_t length0 = clLengths[0];
-					size_t length1 = clLengths[1];
+						assert(fftPlan->large1D == 0);
 
-					if(fftPlan->inputLayout == CLFFT_REAL)
-					{
-						if (fftPlan->tmpBufSizeRC==0 )
-						{
-							fftPlan->tmpBufSizeRC = length0 * length1 *
-								fftPlan->batchsize * fftPlan->ElementSize();
-							for (size_t index=1; index < fftPlan->length.size(); index++)
-							{
-								fftPlan->tmpBufSizeRC *= fftPlan->length[index];
-							}
-						}
+						rowPlan->placeness     = CLFFT_INPLACE;
+						rowPlan->precision     = fftPlan->precision;
+						rowPlan->forwardScale  = 1.0f;
+						rowPlan->backwardScale = 1.0f;
+						rowPlan->tmpBufSize    = 0;
+						rowPlan->batchsize     = fftPlan->batchsize;
 
-						// column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
-						// transposed output
-						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
-							_T( "CreateDefaultPlan Large1d column failed" ) );
+						rowPlan->gen			= fftPlan->gen;
+						rowPlan->envelope		= fftPlan->envelope;
 
-						FFTPlan* colTPlan	= NULL;
-						lockRAII* colLock	= NULL;
-						OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+						rowPlan->length.push_back(length1);
 
-						// current plan is to create intermediate buffer, packed and interleave
-						// This is a column FFT, the first elements distance between each FFT is the distance of the first two
-						// elements in the original buffer. Like a transpose of the matrix
-						// we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
 
-						//this part are common for both passes
-						colTPlan->placeness     = CLFFT_OUTOFPLACE;
-						colTPlan->precision     = fftPlan->precision;
-						colTPlan->forwardScale  = 1.0f;
-						colTPlan->backwardScale = 1.0f;
-						colTPlan->tmpBufSize    = 0;
-						colTPlan->batchsize     = fftPlan->batchsize;
-						colTPlan->bLdsComplex   = fftPlan->bLdsComplex;
-						colTPlan->uLdsFraction  = fftPlan->uLdsFraction;
-						colTPlan->ldsPadding    = fftPlan->ldsPadding;
-						colTPlan->gen			= fftPlan->gen;
-						colTPlan->envelope			= fftPlan->envelope;
-
-						//Pass large1D flag to confirm we need multiply twiddle factor
-						colTPlan->large1D       = fftPlan->length[0];
-						colTPlan->RCsimple		= true;
-
-						colTPlan->length.push_back(clLengths[0]);
-
-						// first Pass
-						colTPlan->inputLayout   = fftPlan->inputLayout;
-						colTPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
-						colTPlan->inStride[0]   = fftPlan->inStride[0] * clLengths[0];
-						colTPlan->outStride[0]  = 1;
-						colTPlan->iDist         = fftPlan->iDist;
-						colTPlan->oDist         = length0 * length1;//fftPlan->length[0];
-						colTPlan->inStride.push_back(fftPlan->inStride[0]);
-						colTPlan->outStride.push_back(length1);//clLengths[1]);
+						rowPlan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+						rowPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+						rowPlan->inStride[0]   = 1;
+						rowPlan->outStride[0]  = 1;
+						rowPlan->inStride.push_back(length0+padding);
+						rowPlan->outStride.push_back(length0+padding);
+						rowPlan->iDist         = (length0+padding)*length1;
+						rowPlan->oDist         = (length0+padding)*length1;
 
 						for (size_t index=1; index < fftPlan->length.size(); index++)
 						{
-							colTPlan->length.push_back(fftPlan->length[index]);
-							colTPlan->inStride.push_back(fftPlan->inStride[index]);
-							// tmp buffer is tightly packed
-							colTPlan->outStride.push_back(colTPlan->oDist);
-							colTPlan->oDist        *= fftPlan->length[index];
+							rowPlan->length.push_back(fftPlan->length[index]);
+							rowPlan->inStride.push_back(rowPlan->iDist);
+							rowPlan->iDist *= fftPlan->length[index];
+							rowPlan->outStride.push_back(rowPlan->oDist);
+							rowPlan->oDist *= fftPlan->length[index];
 						}
 
-						OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
 
-						//another column FFT, size clLengths[0], batch clLengths[1], output without transpose
-						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D,  &clLengths[0] ),
-							_T( "CreateDefaultPlan large1D row failed" ) );
+						OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first row plan failed" ) );
 
-						FFTPlan* col2Plan	= NULL;
-						lockRAII* rowLock	= NULL;
-						OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+						//column FFT
+						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D,  &clLengths[1] ),
+							_T( "CreateDefaultPlan large1D column failed" ) );
 
-						// This is second column fft, intermediate buffer is packed and interleaved
-						// we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+						FFTPlan* col2Plan	= NULL;
+						lockRAII* colLock	= NULL;
+						OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, colLock ), _T( "fftRepo.getPlan failed" ) );
 
-						// common part for both passes
 						col2Plan->placeness     = CLFFT_INPLACE;
 						col2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
 						col2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
-
 						col2Plan->precision     = fftPlan->precision;
 						col2Plan->forwardScale  = fftPlan->forwardScale;
 						col2Plan->backwardScale = fftPlan->backwardScale;
 						col2Plan->tmpBufSize    = 0;
 						col2Plan->batchsize     = fftPlan->batchsize;
-						col2Plan->bLdsComplex   = fftPlan->bLdsComplex;
-						col2Plan->uLdsFraction  = fftPlan->uLdsFraction;
-						col2Plan->ldsPadding    = fftPlan->ldsPadding;
+
 						col2Plan->gen			= fftPlan->gen;
-						col2Plan->envelope			= fftPlan->envelope;
+						col2Plan->envelope		= fftPlan->envelope;
 
-						col2Plan->length.push_back(length1);
+						col2Plan->large1D       = fftPlan->length[0];
+						col2Plan->twiddleFront	= true;
+
+						col2Plan->length.push_back(clLengths[0]);
 
-						col2Plan->inStride[0]  = length1;
-						col2Plan->inStride.push_back(1);
-						col2Plan->iDist        = length0 * length1;
 
-						col2Plan->outStride[0] = length1;
+
+						col2Plan->blockCompute = true;
+						col2Plan->blockComputeType = BCT_C2C;
+
+						col2Plan->inStride[0]  = length0+padding;
+						col2Plan->outStride[0] = length0+padding;
+						col2Plan->iDist        = (length0+padding) * length1;
+						col2Plan->oDist        = (length0+padding) * length1;
+						col2Plan->inStride.push_back(1);
 						col2Plan->outStride.push_back(1);
-						col2Plan->oDist         = length0 * length1;
+
 
 						for (size_t index=1; index < fftPlan->length.size(); index++)
 						{
@@ -1061,21 +1271,19 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 							col2Plan->oDist   *= fftPlan->length[index];
 						}
 
+
 						OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
 
 
-						// copy plan to get back to hermitian
-						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D,  &fftPlan->length[0] ),
-							_T( "CreateDefaultPlan RC copy failed" ) );
+						// copy plan to get results back to packed output
+						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planCopy, fftPlan->context, CLFFT_1D,  &clLengths[0] ),
+							_T( "CreateDefaultPlan Copy failed" ) );
 
 						FFTPlan* copyPlan	= NULL;
 						lockRAII* copyLock	= NULL;
-						OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
+						OPENCL_V( fftRepo.getPlan( fftPlan->planCopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
 
-						// This is second column fft, intermediate buffer is packed and interleaved
-						// we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
 
-						// common part for both passes
 						copyPlan->placeness     = CLFFT_OUTOFPLACE;
 						copyPlan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
 						copyPlan->outputLayout  = fftPlan->outputLayout;
@@ -1085,85 +1293,43 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						copyPlan->backwardScale = 1.0f;
 						copyPlan->tmpBufSize    = 0;
 						copyPlan->batchsize     = fftPlan->batchsize;
-						copyPlan->bLdsComplex   = fftPlan->bLdsComplex;
-						copyPlan->uLdsFraction  = fftPlan->uLdsFraction;
-						copyPlan->ldsPadding    = fftPlan->ldsPadding;
+
 						copyPlan->gen			= Copy;
 						copyPlan->envelope		= fftPlan->envelope;
 
+						copyPlan->length.push_back(length1);
 
 						copyPlan->inStride[0]  = 1;
-						copyPlan->iDist        = fftPlan->length[0];
+						copyPlan->inStride.push_back(length0+padding);
+						copyPlan->iDist        = length1*(length0+padding);
 
 						copyPlan->outStride[0] = fftPlan->outStride[0];
+						copyPlan->outStride.push_back(length0);
 						copyPlan->oDist         = fftPlan->oDist;
 
 						for (size_t index=1; index < fftPlan->length.size(); index++)
 						{
 							copyPlan->length.push_back(fftPlan->length[index]);
-							copyPlan->inStride.push_back(copyPlan->inStride[index-1] * fftPlan->length[index-1]);
+							copyPlan->inStride.push_back(copyPlan->inStride[index] * copyPlan->length[index]);
 							copyPlan->iDist   *= fftPlan->length[index];
 							copyPlan->outStride.push_back(fftPlan->outStride[index]);
 						}
 
-						OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
-
+						OPENCL_V(clfftBakePlan(fftPlan->planCopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d copy plan failed" ) );
 					}
-					else if(fftPlan->outputLayout == CLFFT_REAL)
+					else
 					{
-						if (fftPlan->tmpBufSizeRC==0 )
+
+						if (fftPlan->tmpBufSize==0 )
 						{
-							fftPlan->tmpBufSizeRC = length0 * length1 *
+							fftPlan->tmpBufSize = length0 * length1 *
 								fftPlan->batchsize * fftPlan->ElementSize();
 							for (size_t index=1; index < fftPlan->length.size(); index++)
 							{
-								fftPlan->tmpBufSizeRC *= fftPlan->length[index];
+								fftPlan->tmpBufSize *= fftPlan->length[index];
 							}
 						}
 
-						// copy plan to from hermitian to full complex
-						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D,  &fftPlan->length[0] ),
-							_T( "CreateDefaultPlan RC copy failed" ) );
-
-						FFTPlan* copyPlan	= NULL;
-						lockRAII* copyLock	= NULL;
-						OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
-
-						// This is second column fft, intermediate buffer is packed and interleaved
-						// we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
-
-						// common part for both passes
-						copyPlan->placeness     = CLFFT_OUTOFPLACE;
-						copyPlan->inputLayout   = fftPlan->inputLayout;
-						copyPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
-
-						copyPlan->precision     = fftPlan->precision;
-						copyPlan->forwardScale  = 1.0f;
-						copyPlan->backwardScale = 1.0f;
-						copyPlan->tmpBufSize    = 0;
-						copyPlan->batchsize     = fftPlan->batchsize;
-						copyPlan->bLdsComplex   = fftPlan->bLdsComplex;
-						copyPlan->uLdsFraction  = fftPlan->uLdsFraction;
-						copyPlan->ldsPadding    = fftPlan->ldsPadding;
-						copyPlan->gen			= Copy;
-						copyPlan->envelope		= fftPlan->envelope;
-
-						copyPlan->inStride[0]  = fftPlan->inStride[0];
-						copyPlan->iDist        = fftPlan->iDist;
-
-						copyPlan->outStride[0]  = 1;
-						copyPlan->oDist        = fftPlan->length[0];
-
-						for (size_t index=1; index < fftPlan->length.size(); index++)
-						{
-							copyPlan->length.push_back(fftPlan->length[index]);
-							copyPlan->outStride.push_back(copyPlan->outStride[index-1] * fftPlan->length[index-1]);
-							copyPlan->oDist   *= fftPlan->length[index];
-							copyPlan->inStride.push_back(fftPlan->inStride[index]);
-						}
-
-						OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
-
 						// column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
 						// transposed output
 						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
@@ -1173,49 +1339,53 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						lockRAII* colLock	= NULL;
 						OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
 
+						assert(fftPlan->large1D == 0);
+
 						// current plan is to create intermediate buffer, packed and interleave
 						// This is a column FFT, the first elements distance between each FFT is the distance of the first two
 						// elements in the original buffer. Like a transpose of the matrix
 						// we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
 
 						//this part are common for both passes
-						colTPlan->placeness     = CLFFT_INPLACE;
+						colTPlan->placeness     = CLFFT_OUTOFPLACE;
 						colTPlan->precision     = fftPlan->precision;
 						colTPlan->forwardScale  = 1.0f;
 						colTPlan->backwardScale = 1.0f;
 						colTPlan->tmpBufSize    = 0;
 						colTPlan->batchsize     = fftPlan->batchsize;
-						colTPlan->bLdsComplex   = fftPlan->bLdsComplex;
-						colTPlan->uLdsFraction  = fftPlan->uLdsFraction;
-						colTPlan->ldsPadding    = fftPlan->ldsPadding;
+
 						colTPlan->gen			= fftPlan->gen;
 						colTPlan->envelope			= fftPlan->envelope;
 
 						//Pass large1D flag to confirm we need multiply twiddle factor
 						colTPlan->large1D       = fftPlan->length[0];
 
-						colTPlan->length.push_back(clLengths[0]);
-
-						// first Pass
-						colTPlan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
-						colTPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+						colTPlan->length.push_back(length0);
 
 
-						colTPlan->inStride[0]  = length0;
-						colTPlan->inStride.push_back(1);
-						colTPlan->iDist        = length0 * length1;
-
-						colTPlan->outStride[0] = length0;
-						colTPlan->outStride.push_back(1);
+						colTPlan->inputLayout   = fftPlan->inputLayout;
+						colTPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+						colTPlan->inStride[0]   = fftPlan->inStride[0] * length0;
+						colTPlan->outStride[0]  = length0;
+						colTPlan->iDist         = fftPlan->iDist;
 						colTPlan->oDist         = length0 * length1;
+						colTPlan->inStride.push_back(fftPlan->inStride[0]);
+						colTPlan->outStride.push_back(1);
+
+						// Enabling block column compute
+						if( (colTPlan->inStride[0] == length0) && IsPo2(fftPlan->length[0]) && (fftPlan->length[0] < 524288) )
+						{
+							colTPlan->blockCompute = true;
+							colTPlan->blockComputeType = BCT_C2C;
+						}
 
 						for (size_t index=1; index < fftPlan->length.size(); index++)
 						{
 							colTPlan->length.push_back(fftPlan->length[index]);
-							colTPlan->inStride.push_back(colTPlan->iDist);
+							colTPlan->inStride.push_back(fftPlan->inStride[index]);
+							// tmp buffer is tightly packed
 							colTPlan->outStride.push_back(colTPlan->oDist);
-							colTPlan->iDist   *= fftPlan->length[index];
-							colTPlan->oDist   *= fftPlan->length[index];
+							colTPlan->oDist        *= fftPlan->length[index];
 						}
 
 
@@ -1233,229 +1403,122 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						// we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
 
 						// common part for both passes
-						col2Plan->placeness     = CLFFT_OUTOFPLACE;
-						col2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
 						col2Plan->outputLayout  = fftPlan->outputLayout;
-
 						col2Plan->precision     = fftPlan->precision;
 						col2Plan->forwardScale  = fftPlan->forwardScale;
 						col2Plan->backwardScale = fftPlan->backwardScale;
 						col2Plan->tmpBufSize    = 0;
 						col2Plan->batchsize     = fftPlan->batchsize;
-						col2Plan->bLdsComplex   = fftPlan->bLdsComplex;
-						col2Plan->uLdsFraction  = fftPlan->uLdsFraction;
-						col2Plan->ldsPadding    = fftPlan->ldsPadding;
-						col2Plan->gen			= fftPlan->gen;
-						col2Plan->envelope			= fftPlan->envelope;
-
-						col2Plan->RCsimple = true;
-						col2Plan->length.push_back(length1);
-
-						col2Plan->inStride[0]  = 1;
-						col2Plan->inStride.push_back(length0);
-						col2Plan->iDist        = length0 * length1;
-
-						col2Plan->outStride[0] = length1 * fftPlan->outStride[0];
-						col2Plan->outStride.push_back(fftPlan->outStride[0]);
 						col2Plan->oDist         = fftPlan->oDist;
 
-						for (size_t index=1; index < fftPlan->length.size(); index++)
-						{
-							col2Plan->length.push_back(fftPlan->length[index]);
-							col2Plan->inStride.push_back(col2Plan->iDist);
-							col2Plan->iDist   *= fftPlan->length[index];
-							col2Plan->outStride.push_back(fftPlan->outStride[index]);
-						}
-
-						OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
-					}
-					else
-					{
-						if (fftPlan->cacheSize) {
-							length0 += fftPlan->cacheSize & 0xFF;
-							length1 += (fftPlan->cacheSize >> 8) & 0xFF;
-							if (length0 * length1 > 2 * fftPlan->length[0])
-							{
-								length0 = clLengths[0];
-								length1 = clLengths[1];
-							}
-						}
-						else
-						{
-							if (fftPlan->length[0] == 131072) length1 += 1;     //x0=0, y0=1 good for Cayman card
-							else if (fftPlan->length[0] == 65536) length1 += 8; //x0=0, y0=8 good for Cypress card
-						}
-
-						if (clLengths[0] > Large1DThreshold)
-						{//make no change for Huge 1D case
-							length0 = clLengths[0];
-							length1 = clLengths[1];
-						}
-
-						if (fftPlan->tmpBufSize==0 )
-						{
-							fftPlan->tmpBufSize = length0 * length1 *
-								fftPlan->batchsize * fftPlan->ElementSize();
-							for (size_t index=1; index < fftPlan->length.size(); index++)
-							{
-								fftPlan->tmpBufSize *= fftPlan->length[index];
-							}
-						}
-						else
-						{//make no change for cases passed from higher dimension
-							length0 = clLengths[0];
-							length1 = clLengths[1];
-						}
-
-						// column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
-						// transposed output
-						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
-							_T( "CreateDefaultPlan Large1d column failed" ) );
-
-						FFTPlan* colTPlan	= NULL;
-						lockRAII* colLock	= NULL;
-						OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+						col2Plan->gen			= fftPlan->gen;
+						col2Plan->envelope		= fftPlan->envelope;
 
-						// current plan is to create intermediate buffer, packed and interleave
-						// This is a column FFT, the first elements distance between each FFT is the distance of the first two
-						// elements in the original buffer. Like a transpose of the matrix
-						// we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
 
-						//this part are common for both passes
-						colTPlan->placeness     = CLFFT_OUTOFPLACE;
-						colTPlan->precision     = fftPlan->precision;
-						colTPlan->forwardScale  = 1.0f;
-						colTPlan->backwardScale = 1.0f;
-						colTPlan->tmpBufSize    = 0;
-						colTPlan->batchsize     = fftPlan->batchsize;
-						colTPlan->bLdsComplex   = fftPlan->bLdsComplex;
-						colTPlan->uLdsFraction  = fftPlan->uLdsFraction;
-						colTPlan->ldsPadding    = fftPlan->ldsPadding;
-						colTPlan->gen			= fftPlan->gen;
-						colTPlan->envelope			= fftPlan->envelope;
+						col2Plan->length.push_back(clLengths[1]);
 
-						//Pass large1D flag to confirm we need multiply twiddle factor
-						colTPlan->large1D       = fftPlan->length[0];
+						bool integratedTranposes = true;
 
-						colTPlan->length.push_back(clLengths[0]);
 
-						if (fftPlan->large1D == 0)
+						if( colTPlan->blockCompute && (fftPlan->outStride[0] == 1) && clLengths[0] <= 256)
 						{
-							// first Pass
-							colTPlan->inputLayout   = fftPlan->inputLayout;
-							colTPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
-							colTPlan->inStride[0]   = fftPlan->inStride[0] * clLengths[0];
-							colTPlan->outStride[0]  = 1;
-							colTPlan->iDist         = fftPlan->iDist;
-							colTPlan->oDist         = length0 * length1;//fftPlan->length[0];
-							colTPlan->inStride.push_back(fftPlan->inStride[0]);
-							colTPlan->outStride.push_back(length1);//clLengths[1]);
+							col2Plan->blockCompute = true;
+							col2Plan->blockComputeType = BCT_R2C;
 
-							for (size_t index=1; index < fftPlan->length.size(); index++)
-							{
-								colTPlan->length.push_back(fftPlan->length[index]);
-								colTPlan->inStride.push_back(fftPlan->inStride[index]);
-								// tmp buffer is tightly packed
-								colTPlan->outStride.push_back(colTPlan->oDist);
-								colTPlan->oDist        *= fftPlan->length[index];
-							}
+							col2Plan->placeness    = CLFFT_OUTOFPLACE;
+							col2Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+							col2Plan->inStride[0]  = 1;
+							col2Plan->outStride[0] = length1;
+							col2Plan->iDist        = length0 * length1;
+							col2Plan->inStride.push_back(length0);
+							col2Plan->outStride.push_back(1);
 						}
-						else
+						else if( colTPlan->blockCompute && (fftPlan->outStride[0] == 1) )
 						{
-							// second pass for huge 1D
-							colTPlan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
-							colTPlan->outputLayout  = fftPlan->outputLayout;
-							colTPlan->inStride[0]   = fftPlan->length[1]*clLengths[0];
-							colTPlan->outStride[0]  = fftPlan->outStride[0];
-							colTPlan->iDist         = fftPlan->length[0];
-							colTPlan->oDist         = fftPlan->oDist;
-							colTPlan->inStride.push_back(fftPlan->length[1]);
-							colTPlan->outStride.push_back(fftPlan->outStride[0]*clLengths[1]);
+							integratedTranposes = false;
 
-							for (size_t index=1; index < fftPlan->length.size(); index++)
-							{
-								colTPlan->length.push_back(fftPlan->length[index]);
-								colTPlan->inStride.push_back(fftPlan->inStride[index]);
-								colTPlan->outStride.push_back(fftPlan->outStride[index]);
-								colTPlan->iDist        *= fftPlan->length[index];
-							}
+							col2Plan->placeness    = CLFFT_INPLACE;
+							col2Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+							col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+							col2Plan->inStride[0]  = 1;
+							col2Plan->outStride[0] = 1;
+							col2Plan->iDist        = length0 * length1;
+							col2Plan->oDist        = length0 * length1;
+							col2Plan->inStride.push_back(length0);
+							col2Plan->outStride.push_back(length0);
 						}
-
-						OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
-
-						//another column FFT, size clLengths[0], batch clLengths[1], output without transpose
-						OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planY, fftPlan->context, CLFFT_1D,  &clLengths[0] ),
-							_T( "CreateDefaultPlan large1D row failed" ) );
-
-						FFTPlan* col2Plan	= NULL;
-						lockRAII* rowLock	= NULL;
-						OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
-
-						// This is second column fft, intermediate buffer is packed and interleaved
-						// we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
-
-						// common part for both passes
-						col2Plan->outputLayout  = fftPlan->outputLayout;
-						col2Plan->precision     = fftPlan->precision;
-						col2Plan->forwardScale  = fftPlan->forwardScale;
-						col2Plan->backwardScale = fftPlan->backwardScale;
-						col2Plan->tmpBufSize    = 0;
-						col2Plan->batchsize     = fftPlan->batchsize;
-						col2Plan->oDist         = fftPlan->oDist;
-						col2Plan->bLdsComplex   = fftPlan->bLdsComplex;
-						col2Plan->uLdsFraction  = fftPlan->uLdsFraction;
-						col2Plan->ldsPadding    = fftPlan->ldsPadding;
-						col2Plan->gen			= fftPlan->gen;
-						col2Plan->envelope			= fftPlan->envelope;
-
-						if (clLengths[0] > Large1DThreshold)
-							//prepare for huge 1D
-							col2Plan->large1D   = fftPlan->length[0];
-
-						col2Plan->length.push_back(clLengths[1]);
-						col2Plan->outStride.push_back(fftPlan->outStride[0]);
-
-						if (fftPlan->large1D == 0)
+						else
 						{
 							//first layer, large 1D from tmp buffer to output buffer
 							col2Plan->placeness    = CLFFT_OUTOFPLACE;
 							col2Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
-							col2Plan->inStride[0]  = length1;//clLengths[1];
+							col2Plan->inStride[0]  = 1;
 							col2Plan->outStride[0] = fftPlan->outStride[0] * clLengths[1];
 							col2Plan->iDist        = length0 * length1; //fftPlan->length[0];
-							col2Plan->inStride.push_back(1);
+							col2Plan->inStride.push_back(length0);
+							col2Plan->outStride.push_back(fftPlan->outStride[0]);
+						}
 
+						if(!integratedTranposes)
+						{
 							for (size_t index=1; index < fftPlan->length.size(); index++)
 							{
 								col2Plan->length.push_back(fftPlan->length[index]);
 								col2Plan->inStride.push_back(col2Plan->iDist);
-								col2Plan->outStride.push_back(fftPlan->outStride[index]);
-								col2Plan->iDist   *= fftPlan->length[index];
+								col2Plan->outStride.push_back(col2Plan->oDist);
+								col2Plan->iDist        *= fftPlan->length[index];
+								col2Plan->oDist        *= fftPlan->length[index];
 							}
 						}
 						else
 						{
-							//second layer, huge 1D from output buffer to output buffer
-							col2Plan->placeness    = CLFFT_INPLACE;
-							col2Plan->inputLayout  = fftPlan->outputLayout;
-							col2Plan->inStride[0]  = fftPlan->outStride[0] * clLengths[1];
-							col2Plan->outStride[0] = col2Plan->inStride[0];
-							col2Plan->iDist        = fftPlan->oDist;
-							col2Plan->inStride.push_back(fftPlan->outStride[0]);
-
 							for (size_t index=1; index < fftPlan->length.size(); index++)
 							{
 								col2Plan->length.push_back(fftPlan->length[index]);
-								col2Plan->inStride.push_back(fftPlan->outStride[index]);
+								col2Plan->inStride.push_back(col2Plan->iDist);
 								col2Plan->outStride.push_back(fftPlan->outStride[index]);
+								col2Plan->iDist   *= fftPlan->length[index];
 							}
 						}
 
+
 						OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
+
+						if(!integratedTranposes)
+						{
+							//Transpose 
+							//tmp --> output
+							OPENCL_V(clfftCreateDefaultPlan( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
+								_T( "CreateDefaultPlan Large1d transpose failed" ) );
+
+							FFTPlan* trans3Plan	= NULL;
+							lockRAII* trans3Lock	= NULL;
+							OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
+
+							trans3Plan->placeness     = CLFFT_OUTOFPLACE;
+							trans3Plan->precision     = fftPlan->precision;
+							trans3Plan->tmpBufSize    = 0;
+							trans3Plan->batchsize     = fftPlan->batchsize;
+							trans3Plan->envelope	  = fftPlan->envelope;
+							trans3Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+							trans3Plan->outputLayout  = fftPlan->outputLayout;
+							trans3Plan->inStride[0]   = 1;
+							trans3Plan->inStride[1]   = clLengths[0];
+							trans3Plan->outStride[0]  = fftPlan->outStride[0];
+							trans3Plan->outStride[1]  = clLengths[1] * fftPlan->outStride[0];
+							trans3Plan->iDist         = fftPlan->length[0];
+							trans3Plan->oDist         = fftPlan->oDist;
+							trans3Plan->gen           = Transpose_GCN;
+							trans3Plan->transflag     = true;
+
+							OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
+								_T( "BakePlan large1d trans plan failed" ) );
+						}
 					}
+				}
 
-					fftPlan->baked = true;
-					return	CLFFT_SUCCESS;
+				fftPlan->baked = true;
+				return	CLFFT_SUCCESS;
 			}
 		}
 		break;
@@ -1465,40 +1528,28 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 			size_t length1 = fftPlan->length[1];
 
 
-			if (fftPlan->cacheSize)
+			if (fftPlan->length[0]==256 && fftPlan->length[1]==256)
 			{
-				length0 += fftPlan->cacheSize & 0xFF;
-				length1 += (fftPlan->cacheSize >> 8) & 0xFF;
-				if (length0 * length1 > 2 * fftPlan->length[0] * fftPlan->length[1])
-				{
-					length0 = fftPlan->length[0];
-					length1 = fftPlan->length[1];
-				}
+				length0 += 8;
+				length1 += 1;
 			}
-			else
+			else if (fftPlan->length[0]==512 && fftPlan->length[1]==512)
 			{
-				if (fftPlan->length[0]==256 && fftPlan->length[1]==256)
-				{
-					length0 += 8;
-					length1 += 1;
-				}
-				else if (fftPlan->length[0]==512 && fftPlan->length[1]==512)
-				{
-					length0 += 1;
-					length1 += 1;//length1 += 0;
-				}
-				else if (fftPlan->length[0]==1024 && fftPlan->length[1]==512)
-				{
-					length0 += 2;
-					length1 += 2;//length1 += 0;
-				}
-				else if (fftPlan->length[0]==1024 && fftPlan->length[1]==1024)
-				{
-					length0 += 1;
-					length1 += 1;//length1 += 0;
-				}
+				length0 += 1;
+				length1 += 1;//length1 += 0;
+			}
+			else if (fftPlan->length[0]==1024 && fftPlan->length[1]==512)
+			{
+				length0 += 2;
+				length1 += 2;//length1 += 0;
+			}
+			else if (fftPlan->length[0]==1024 && fftPlan->length[1]==1024)
+			{
+				length0 += 1;
+				length1 += 1;//length1 += 0;
 			}
 
+
 			if (fftPlan->length[0] > Large1DThreshold ||
 				fftPlan->length[1] > Large1DThreshold)
 				fftPlan->large2D = true;
@@ -1571,9 +1622,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				rowPlan->forwardScale    = 1.0f;
 				rowPlan->backwardScale   = 1.0f;
 				rowPlan->tmpBufSize      = 0;
-				rowPlan->bLdsComplex     = fftPlan->bLdsComplex;
-				rowPlan->uLdsFraction    = fftPlan->uLdsFraction;
-				rowPlan->ldsPadding      = fftPlan->ldsPadding;
+
 				rowPlan->gen			 = fftPlan->gen;
 				rowPlan->envelope		 = fftPlan->envelope;
 				rowPlan->batchsize       = fftPlan->batchsize;
@@ -1610,7 +1659,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				transPlanX->inputLayout     = fftPlan->outputLayout;
 				transPlanX->precision       = fftPlan->precision;
 				transPlanX->tmpBufSize      = 0;
-				transPlanX->gen			    = Transpose;
+				transPlanX->gen			    = Transpose_VLIW;
 				transPlanX->envelope		= fftPlan->envelope;
 				transPlanX->batchsize       = fftPlan->batchsize;
 				transPlanX->inStride[0]     = fftPlan->outStride[0];
@@ -1689,9 +1738,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				colPlan->forwardScale    = fftPlan->forwardScale;
 				colPlan->backwardScale   = fftPlan->backwardScale;
 				colPlan->tmpBufSize      = 0;
-				colPlan->bLdsComplex     = fftPlan->bLdsComplex;
-				colPlan->uLdsFraction    = fftPlan->uLdsFraction;
-				colPlan->ldsPadding      = fftPlan->ldsPadding;
+
 				colPlan->gen			 = fftPlan->gen;
 				colPlan->envelope		 = fftPlan->envelope;
 				colPlan->batchsize       = fftPlan->batchsize;
@@ -1739,7 +1786,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				transPlanY->oDist           = fftPlan->oDist;
 				transPlanY->precision       = fftPlan->precision;
 				transPlanY->tmpBufSize      = 0;
-				transPlanY->gen			    = Transpose;
+				transPlanY->gen			    = Transpose_VLIW;
 				transPlanY->envelope		= fftPlan->envelope;
 				transPlanY->batchsize       = fftPlan->batchsize;
 				transPlanY->transflag       = true;
@@ -1793,9 +1840,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				rowPlan->forwardScale  = 1.0f;
 				rowPlan->backwardScale = 1.0f;
 				rowPlan->tmpBufSize    = fftPlan->tmpBufSize;
-				rowPlan->bLdsComplex   = fftPlan->bLdsComplex;
-				rowPlan->uLdsFraction  = fftPlan->uLdsFraction;
-				rowPlan->ldsPadding    = fftPlan->ldsPadding;
+
 				rowPlan->gen			= fftPlan->gen;
 				rowPlan->envelope			= fftPlan->envelope;
 
@@ -1855,9 +1900,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				colPlan->forwardScale  = fftPlan->forwardScale;
 				colPlan->backwardScale = fftPlan->backwardScale;
 				colPlan->tmpBufSize    = fftPlan->tmpBufSize;
-				colPlan->bLdsComplex   = fftPlan->bLdsComplex;
-				colPlan->uLdsFraction  = fftPlan->uLdsFraction;
-				colPlan->ldsPadding    = fftPlan->ldsPadding;
+
 				colPlan->gen			= fftPlan->gen;
 				colPlan->envelope			= fftPlan->envelope;
 
@@ -1951,9 +1994,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				colPlan->forwardScale  = 1.0f;
 				colPlan->backwardScale = 1.0f;
 				colPlan->tmpBufSize    = fftPlan->tmpBufSize;
-				colPlan->bLdsComplex   = fftPlan->bLdsComplex;
-				colPlan->uLdsFraction  = fftPlan->uLdsFraction;
-				colPlan->ldsPadding    = fftPlan->ldsPadding;
+
 				colPlan->gen			= fftPlan->gen;
 				colPlan->envelope			= fftPlan->envelope;
 
@@ -1992,9 +2033,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				rowPlan->forwardScale  = fftPlan->forwardScale;
 				rowPlan->backwardScale = fftPlan->backwardScale;
 				rowPlan->tmpBufSize    = fftPlan->tmpBufSize;
-				rowPlan->bLdsComplex   = fftPlan->bLdsComplex;
-				rowPlan->uLdsFraction  = fftPlan->uLdsFraction;
-				rowPlan->ldsPadding    = fftPlan->ldsPadding;
+
 				rowPlan->gen			= fftPlan->gen;
 				rowPlan->envelope			= fftPlan->envelope;
 
@@ -2052,9 +2091,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				rowPlan->forwardScale  = 1.0f;
 				rowPlan->backwardScale = 1.0f;
 				rowPlan->tmpBufSize    = fftPlan->tmpBufSize;
-				rowPlan->bLdsComplex   = fftPlan->bLdsComplex;
-				rowPlan->uLdsFraction  = fftPlan->uLdsFraction;
-				rowPlan->ldsPadding    = fftPlan->ldsPadding;
+
 				rowPlan->gen			= fftPlan->gen;
 				rowPlan->envelope			= fftPlan->envelope;
 
@@ -2110,9 +2147,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				colPlan->forwardScale  = fftPlan->forwardScale;
 				colPlan->backwardScale = fftPlan->backwardScale;
 				colPlan->tmpBufSize    = fftPlan->tmpBufSize;
-				colPlan->bLdsComplex   = fftPlan->bLdsComplex;
-				colPlan->uLdsFraction  = fftPlan->uLdsFraction;
-				colPlan->ldsPadding    = fftPlan->ldsPadding;
+
 				colPlan->gen			= fftPlan->gen;
 				colPlan->envelope			= fftPlan->envelope;
 
@@ -2164,9 +2199,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				xyPlan->forwardScale  = 1.0f;
 				xyPlan->backwardScale = 1.0f;
 				xyPlan->tmpBufSize    = fftPlan->tmpBufSize;
-				xyPlan->bLdsComplex   = fftPlan->bLdsComplex;
-				xyPlan->uLdsFraction  = fftPlan->uLdsFraction;
-				xyPlan->ldsPadding    = fftPlan->ldsPadding;
+
 				xyPlan->gen			 = fftPlan->gen;
 				xyPlan->envelope			 = fftPlan->envelope;
 
@@ -2219,9 +2252,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				colPlan->forwardScale  = fftPlan->forwardScale;
 				colPlan->backwardScale = fftPlan->backwardScale;
 				colPlan->tmpBufSize    = fftPlan->tmpBufSize;
-				colPlan->bLdsComplex   = fftPlan->bLdsComplex;
-				colPlan->uLdsFraction  = fftPlan->uLdsFraction;
-				colPlan->ldsPadding    = fftPlan->ldsPadding;
+
 				colPlan->gen			 = fftPlan->gen;
 				colPlan->envelope			 = fftPlan->envelope;
 
@@ -2287,9 +2318,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				colPlan->forwardScale  = 1.0f;
 				colPlan->backwardScale = 1.0f;
 				colPlan->tmpBufSize    = fftPlan->tmpBufSize;
-				colPlan->bLdsComplex   = fftPlan->bLdsComplex;
-				colPlan->uLdsFraction  = fftPlan->uLdsFraction;
-				colPlan->ldsPadding    = fftPlan->ldsPadding;
+
 				colPlan->gen			 = fftPlan->gen;
 				colPlan->envelope			 = fftPlan->envelope;
 
@@ -2337,9 +2366,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				xyPlan->forwardScale  = fftPlan->forwardScale;
 				xyPlan->backwardScale = fftPlan->backwardScale;
 				xyPlan->tmpBufSize    = fftPlan->tmpBufSize;
-				xyPlan->bLdsComplex   = fftPlan->bLdsComplex;
-				xyPlan->uLdsFraction  = fftPlan->uLdsFraction;
-				xyPlan->ldsPadding    = fftPlan->ldsPadding;
+
 				xyPlan->gen			 = fftPlan->gen;
 				xyPlan->envelope			 = fftPlan->envelope;
 
@@ -2391,9 +2418,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				xyPlan->forwardScale  = 1.0f;
 				xyPlan->backwardScale = 1.0f;
 				xyPlan->tmpBufSize    = fftPlan->tmpBufSize;
-				xyPlan->bLdsComplex   = fftPlan->bLdsComplex;
-				xyPlan->uLdsFraction  = fftPlan->uLdsFraction;
-				xyPlan->ldsPadding    = fftPlan->ldsPadding;
+
 				xyPlan->gen			 = fftPlan->gen;
 				xyPlan->envelope			 = fftPlan->envelope;
 
@@ -2431,9 +2456,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				colPlan->forwardScale  = fftPlan->forwardScale;
 				colPlan->backwardScale = fftPlan->backwardScale;
 				colPlan->tmpBufSize    = fftPlan->tmpBufSize;
-				colPlan->bLdsComplex   = fftPlan->bLdsComplex;
-				colPlan->uLdsFraction  = fftPlan->uLdsFraction;
-				colPlan->ldsPadding    = fftPlan->ldsPadding;
+
 				colPlan->gen			 = fftPlan->gen;
 				colPlan->envelope			 = fftPlan->envelope;
 
@@ -2521,97 +2544,8 @@ clfftStatus FFTPlan::ConstructAndEnqueueConstantBuffers( cl_command_queue* commQ
 	cb_t ConstantBufferParams [CLFFT_CB_SIZE];
 	memset (& ConstantBufferParams, 0, sizeof (ConstantBufferParams));
 
-	cl_uint nY = 1;
-	cl_uint nZ = 0;
-	cl_uint nW = 0;
-	cl_uint n5 = 0;
-
-	switch( /*fftPlan->*/length.size() )
-	{
-	case 1:
-		nY = std::max<cl_uint> (1, cl_uint (/*fftPlan->*/batchsize));
-		break;
-
-	case 2:
-		nY = (cl_uint)/*fftPlan->*/length[DimY];
-		nZ = std::max<cl_uint> (1, cl_uint (/*fftPlan->*/batchsize));
-		break;
-
-	case 3:
-		nY = (cl_uint)/*fftPlan->*/length[DimY];
-		nZ = (cl_uint)/*fftPlan->*/length[DimZ];
-		nW = std::max<cl_uint> (1, cl_uint (/*fftPlan->*/batchsize));
-		break;
-
-	case 4:
-		nY = (cl_uint)/*fftPlan->*/length[DimY];
-		nZ = (cl_uint)/*fftPlan->*/length[DimZ];
-		nW = (cl_uint)/*fftPlan->*/length[DimW];
-		n5 = std::max<cl_uint> (1, cl_uint (/*fftPlan->*/batchsize));
-		break;
-	}
-	ConstantBufferParams[CLFFT_CB_NY ].u = nY;
-	ConstantBufferParams[CLFFT_CB_NZ ].u = nZ;
-	ConstantBufferParams[CLFFT_CB_NW ].u = nW;
-	ConstantBufferParams[CLFFT_CB_N5 ].u = n5;
-
-	assert (/*fftPlan->*/inStride.size() == /*fftPlan->*/outStride.size());
-
-	switch (/*fftPlan->*/inStride.size()) {
-	case 1:
-		ConstantBufferParams[CLFFT_CB_ISX].u = cl_uint (/*fftPlan->*/inStride[0]);
-		ConstantBufferParams[CLFFT_CB_ISY].u = cl_uint (/*fftPlan->*/iDist);
-		break;
-
-	case 2:
-		ConstantBufferParams[CLFFT_CB_ISX].u = cl_uint (/*fftPlan->*/inStride[0]);
-		ConstantBufferParams[CLFFT_CB_ISY].u = cl_uint (/*fftPlan->*/inStride[1]);
-		ConstantBufferParams[CLFFT_CB_ISZ].u = cl_uint (/*fftPlan->*/iDist);
-		break;
-
-	case 3:
-		ConstantBufferParams[CLFFT_CB_ISX].u = cl_uint (/*fftPlan->*/inStride[0]);
-		ConstantBufferParams[CLFFT_CB_ISY].u = cl_uint (/*fftPlan->*/inStride[1]);
-		ConstantBufferParams[CLFFT_CB_ISZ].u = cl_uint (/*fftPlan->*/inStride[2]);
-		ConstantBufferParams[CLFFT_CB_ISW].u = cl_uint (/*fftPlan->*/iDist);
-		break;
-
-	case 4:
-		ConstantBufferParams[CLFFT_CB_ISX].u = cl_uint (/*fftPlan->*/inStride[0]);
-		ConstantBufferParams[CLFFT_CB_ISY].u = cl_uint (/*fftPlan->*/inStride[1]);
-		ConstantBufferParams[CLFFT_CB_ISZ].u = cl_uint (/*fftPlan->*/inStride[2]);
-		ConstantBufferParams[CLFFT_CB_ISW].u = cl_uint (/*fftPlan->*/inStride[3]);
-		ConstantBufferParams[CLFFT_CB_IS5].u = cl_uint (/*fftPlan->*/iDist);
-		break;
-	}
+	ConstantBufferParams[0].u = std::max<cl_uint> (1, cl_uint (/*fftPlan->*/batchsize));
 
-	switch (/*fftPlan->*/outStride.size()) {
-	case 1:
-		ConstantBufferParams[CLFFT_CB_OSX].u = cl_uint (/*fftPlan->*/outStride[0]);
-		ConstantBufferParams[CLFFT_CB_OSY].u = cl_uint (/*fftPlan->*/oDist);
-		break;
-
-	case 2:
-		ConstantBufferParams[CLFFT_CB_OSX].u = cl_uint (/*fftPlan->*/outStride[0]);
-		ConstantBufferParams[CLFFT_CB_OSY].u = cl_uint (/*fftPlan->*/outStride[1]);
-		ConstantBufferParams[CLFFT_CB_OSZ].u = cl_uint (/*fftPlan->*/oDist);
-		break;
-
-	case 3:
-		ConstantBufferParams[CLFFT_CB_OSX].u = cl_uint (/*fftPlan->*/outStride[0]);
-		ConstantBufferParams[CLFFT_CB_OSY].u = cl_uint (/*fftPlan->*/outStride[1]);
-		ConstantBufferParams[CLFFT_CB_OSZ].u = cl_uint (/*fftPlan->*/outStride[2]);
-		ConstantBufferParams[CLFFT_CB_OSW].u = cl_uint (/*fftPlan->*/oDist);
-		break;
-
-	case 4:
-		ConstantBufferParams[CLFFT_CB_OSX].u = cl_uint (/*fftPlan->*/outStride[0]);
-		ConstantBufferParams[CLFFT_CB_OSY].u = cl_uint (/*fftPlan->*/outStride[1]);
-		ConstantBufferParams[CLFFT_CB_OSZ].u = cl_uint (/*fftPlan->*/outStride[2]);
-		ConstantBufferParams[CLFFT_CB_OSW].u = cl_uint (/*fftPlan->*/outStride[3]);
-		ConstantBufferParams[CLFFT_CB_OS5].u = cl_uint (/*fftPlan->*/oDist);
-		break;
-	}
 
 	OPENCL_V(clEnqueueWriteBuffer( *commQueueFFT,
 		/*fftPlan->*/const_buffer,
@@ -2720,11 +2654,6 @@ clfftStatus clfftWritePlanToDisk( clfftPlanHandle plan_handle, const char* filen
 		// clfftReadPlanFromDisk will read the hex back in as float
 
 		planfile << " gen " << plan->gen;
-		planfile << " bLdsComplex " << plan->bLdsComplex;
-		planfile << " ldsPadding " << plan->ldsPadding;
-		planfile << " uLdsFraction " << plan->uLdsFraction;
-		planfile << " large1D_Xfactor " << plan->large1D_Xfactor;
-		planfile << " cacheSize " << plan->cacheSize;
 		planfile << " tmpBufSize " << plan->tmpBufSize;
 		planfile << " large1D " << plan->large1D;
 		planfile << " large2D " << plan->large2D;
@@ -2994,26 +2923,6 @@ clfftStatus clfftReadPlanFromDisk( clfftPlanHandle plan_handle, const char* file
 			planfile >> gen_read;
 			plan->gen = static_cast<clfftGenerators>(gen_read);
 		}
-		else if( next_word == "bLdsComplex" )
-		{
-			planfile >> plan->bLdsComplex;
-		}
-		else if( next_word == "ldsPadding" )
-		{
-			planfile >> plan->ldsPadding;
-		}
-		else if( next_word == "uLdsFraction" )
-		{
-			planfile >> plan->uLdsFraction;
-		}
-		else if( next_word == "large1D_Xfactor" )
-		{
-			planfile >> plan->large1D_Xfactor;
-		}
-		else if( next_word == "cacheSize" )
-		{
-			planfile >> plan->cacheSize;
-		}
 		else if( next_word == "tmpBufSize" )
 		{
 			planfile >> plan->tmpBufSize;
@@ -3097,6 +3006,8 @@ clfftStatus	clfftDestroyPlan( clfftPlanHandle* plHandle )
 		clfftDestroyPlan( &fftPlan->planTZ );
 	if( fftPlan->planRCcopy )
 		clfftDestroyPlan( &fftPlan->planRCcopy );
+	if( fftPlan->planCopy )
+		clfftDestroyPlan( &fftPlan->planCopy );
 
 	fftRepo.deletePlan( plHandle );
 
@@ -3241,7 +3152,7 @@ clfftStatus FFTPlan::ReleaseBuffers ()
 			result = tmp;
 	}
 
-	if( NULL != intBuffer )
+	if( (NULL != intBuffer) && libCreatedIntBuffer )
 	{
 		tmp = static_cast< clfftStatus >( clReleaseMemObject( intBuffer ) );
 		intBuffer = NULL;
@@ -3256,6 +3167,14 @@ clfftStatus FFTPlan::ReleaseBuffers ()
 		if( CLFFT_SUCCESS == result )
 			result = tmp;
 	}
+	
+	if( NULL != intBufferC2R )
+	{
+		tmp = static_cast< clfftStatus >( clReleaseMemObject( intBufferC2R ) );
+		intBufferC2R = NULL;
+		if( CLFFT_SUCCESS == result )
+			result = tmp;
+	}
 
 	return	result;
 }
@@ -3264,10 +3183,11 @@ clfftStatus  FFTPlan::GetWorkSizes (std::vector<size_t> & globalws, std::vector<
 {
 	switch(gen)
 	{
-	case Stockham:		return GetWorkSizesPvt<Stockham>(globalws, localws);
-	case Transpose:		return GetWorkSizesPvt<Transpose>(globalws, localws);
-	case Copy:			return GetWorkSizesPvt<Copy>(globalws, localws);
-	default:			assert(false); return CLFFT_NOTIMPLEMENTED;
+    case Stockham:		return GetWorkSizesPvt<Stockham>( globalws, localws );
+    case Transpose_VLIW:		return GetWorkSizesPvt<Transpose_VLIW>( globalws, localws );
+    case Transpose_GCN:		return GetWorkSizesPvt<Transpose_GCN>( globalws, localws );
+    case Copy:			return GetWorkSizesPvt<Copy>( globalws, localws );
+    default:			assert( false ); return CLFFT_NOTIMPLEMENTED;
 	}
 }
 
@@ -3276,8 +3196,9 @@ clfftStatus  FFTPlan::GetKernelGenKey (FFTKernelGenKeyParams & params) const
 	switch(gen)
 	{
 	case Stockham:		return GetKernelGenKeyPvt<Stockham>(params);
-	case Transpose:		return GetKernelGenKeyPvt<Transpose>(params);
-	case Copy:			return GetKernelGenKeyPvt<Copy>(params);
+	case Transpose_VLIW:		return GetKernelGenKeyPvt<Transpose_VLIW>(params);
+    case Transpose_GCN:		return GetKernelGenKeyPvt<Transpose_GCN>( params );
+    case Copy:			return GetKernelGenKeyPvt<Copy>( params );
 	default:			assert(false); return CLFFT_NOTIMPLEMENTED;
 	}
 }
@@ -3287,8 +3208,9 @@ clfftStatus  FFTPlan::GenerateKernel (FFTRepo & fftRepo, const cl_command_queue
 	switch(gen)
 	{
 	case Stockham:		return GenerateKernelPvt<Stockham>(fftRepo, commQueueFFT);
-	case Transpose:		return GenerateKernelPvt<Transpose>(fftRepo, commQueueFFT);
-	case Copy:			return GenerateKernelPvt<Copy>(fftRepo, commQueueFFT);
+	case Transpose_VLIW:		return GenerateKernelPvt<Transpose_VLIW>(fftRepo, commQueueFFT);
+    case Transpose_GCN:		return GenerateKernelPvt<Transpose_GCN>( fftRepo, commQueueFFT );
+    case Copy:			return GenerateKernelPvt<Copy>( fftRepo, commQueueFFT );
 	default:			assert(false); return CLFFT_NOTIMPLEMENTED;
 	}
 }
@@ -3298,16 +3220,22 @@ clfftStatus FFTPlan::GetMax1DLength (size_t *longest ) const
 	switch(gen)
 	{
 	case Stockham:		return GetMax1DLengthPvt<Stockham>(longest);
-	//No restriction for transpose kernel
-	case Transpose:     *longest = 4096; return CLFFT_SUCCESS;
-	case Copy:			*longest = 4096; return CLFFT_SUCCESS;
+	//No restriction for the transpose kernels (Transpose_VLIW and Transpose_GCN)
+	case Transpose_VLIW:     *longest = 4096; return CLFFT_SUCCESS;
+    case Transpose_GCN:     *longest = 4096; return CLFFT_SUCCESS;
+    case Copy:			*longest = 4096; return CLFFT_SUCCESS;
 	default:			assert(false); return CLFFT_NOTIMPLEMENTED;
 	}
 }
 
 clfftStatus FFTPlan::GetEnvelope (const FFTEnvelope ** ppEnvelope) const
 {
-	if(&envelope == NULL) assert(false);
+	if( &envelope == NULL )
+    { 
+        assert( false );
+        return CLFFT_NOTIMPLEMENTED;
+    }
+
 	*ppEnvelope = &envelope;
 	return CLFFT_SUCCESS;
 }
diff --git a/src/library/plan.h b/src/library/plan.h
index acafa07..56f5df4 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -70,57 +70,25 @@ namespace ARBITRARY {
 			//  The latter uses half as much LDS space, so twice as many wavefronts can be run
 			//  in parallel.
 
-		TWIDDLE_DEE = 4,
-			//  4 bits per row of matrix.
+		TWIDDLE_DEE = 8,
+			//  number of bits per row of matrix.
 	};
+
 };
 
-enum eConstantBuffer {
-	/*	Layout of a constant buffer passed to the generated kernel
-	 *	This needs to be know by the kernel generator and by the
-	 *	framework code that creates the buffer and fills it at execution time.
-	*/
-
-	//	 [0] uint  NY   This is the batchsize for a 1D Array,
-	//                    or the 2nd (Y dimension) for a 2D.
-	//	 [1] uint  NZ   This is the batchsize for a 2D Array,
-	//                    or the 3rd (Z dimension) for a 3D.
-	//	 [2] uint  NW   This is the batchsize for a 3D Array,
-	//                    or the 4th (W dimension) for a 4D.
-	//	 [3] uint  N5   This is the batchsize for a 4D Array,
-	//
-	CLFFT_CB_NY = 0,
-	CLFFT_CB_NZ,
-	CLFFT_CB_NW,
-	CLFFT_CB_N5,
-
-	//	 [4] uint  ISX  Input data X stride (== 1 for row-major compact data)
-	//	 [5] uint  ISY  Input data Y stride (== X for row-major compact data)
-	//	 [6] uint  ISZ  Input data Z stride (== X*Y for row-major compact data)
-	//	 [7] uint  ISW  Input data W stride (== X*Y*Z for row-major compact data)
-	//	 [8] uint  IS5  Input data 5th stride
-	//
-	CLFFT_CB_ISX,
-	CLFFT_CB_ISY,
-	CLFFT_CB_ISZ,
-	CLFFT_CB_ISW,
-	CLFFT_CB_IS5,
-
-	//	 [9] uint  OSX  Output data X stride
-	//	[10] uint  OSY  Output data Y stride
-	//	[11] uint  OSZ  Output data Z stride
-	//	[12] uint  OSW  Output data W stride
-	//	[13] uint  OS5  Output data 5th stride
-	//
-	CLFFT_CB_OSX,
-	CLFFT_CB_OSY,
-	CLFFT_CB_OSZ,
-	CLFFT_CB_OSW,
-	CLFFT_CB_OS5,
 
-	CLFFT_CB_SIZE  = 32,
+enum BlockComputeType
+{
+	BCT_C2C,	// Column to column
+	BCT_C2R,	// Column to row
+	BCT_R2C,	// Row to column
 };
 
+
+
+#define CLFFT_CB_SIZE 32
+#define CLFFT_MAX_INTERNAL_DIM 16
+
 struct FFTKernelGenKeyParams {
 	/*
 	 *	This structure distills a subset of the fftPlan data,
@@ -129,10 +97,10 @@ struct FFTKernelGenKeyParams {
 	 *	been compiled.
 	 */
 	size_t                   fft_DataDim;       // Dimensionality of the data
-	size_t                   fft_N[5];          // [0] is FFT size, e.g. 1024
+	size_t                   fft_N[CLFFT_MAX_INTERNAL_DIM];          // [0] is FFT size, e.g. 1024
 	                                            // This must be <= size of LDS!
-	size_t                   fft_inStride [5];  // input strides
-	size_t                   fft_outStride[5];  // output strides
+	size_t                   fft_inStride [CLFFT_MAX_INTERNAL_DIM];  // input strides
+	size_t                   fft_outStride[CLFFT_MAX_INTERNAL_DIM];  // output strides
 
 	clfftResultLocation   fft_placeness;
 	clfftLayout           fft_inputLayout;
@@ -145,18 +113,55 @@ struct FFTKernelGenKeyParams {
 	size_t                   fft_LDSsize;       // Limit the use of LDS to this many bytes.
 	size_t                   fft_R;             // # of complex values to keep in working registers
 	                                            // SIMD size * R must be <= size of LDS!
-	size_t                   fft_MaxRadix;      // Limit the radix to this value.
+
 	size_t					 fft_MaxWorkGroupSize; // Limit for work group size
-	bool                     fft_LdsComplex;    // If true, store complex values in LDS memory
-	                                            // If false, store scalare values in LDS.
-	                                            // Generally, false will provide more efficient kernels,
-	                                            // but not always.
-	                                            // see FFTPlan::bLdsComplex and ARBITRARY::LDS_COMPLEX
-	bool                     fft_ldsPadding;    // default padding is false
+
 	bool                     fft_3StepTwiddle;  // This is one pass of the "3-step" algorithm;
 	                                            // so extra twiddles are applied on output.
-	bool                     fft_UseFMA;        // *** TODO
+	bool					 fft_twiddleFront;	// do twiddle scaling at the beginning pass
+
+
 	bool                     fft_RCsimple;
+
+	bool					 transOutHorizontal;	// tiles traverse the output buffer in horizontal direction
+
+	bool					 blockCompute;
+	BlockComputeType		 blockComputeType;
+	size_t					 blockSIMD;
+	size_t					 blockLDS;
+
+
+	// Default constructor
+	FFTKernelGenKeyParams()
+	{
+		fft_DataDim = 0;
+		for(int i=0; i<CLFFT_MAX_INTERNAL_DIM; i++)
+		{
+			fft_N[i] = 0;
+			fft_inStride[i] = 0;
+			fft_outStride[i] = 0;
+		}
+
+		fft_placeness = CLFFT_OUTOFPLACE;
+		fft_inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+		fft_outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+		fft_precision = CLFFT_SINGLE;
+		fft_fwdScale = fft_backScale = 0.0;
+		fft_SIMD = 0;
+		fft_LDSsize = 0;
+		fft_R = 0;
+		fft_MaxWorkGroupSize = 0;
+		fft_3StepTwiddle = false;
+		fft_twiddleFront = false;
+
+		transOutHorizontal = false;
+
+		fft_RCsimple = false;
+		blockCompute = false;
+		blockComputeType = BCT_C2C;
+		blockSIMD = 0;
+		blockLDS = 0;
+	}
 };
 
 
@@ -185,7 +190,7 @@ struct FFTEnvelope {
 	,	limit_Dimensions (0)
 	,	limit_WorkGroupSize (0)
 	{
-		::memset (& limit_Size, 0, sizeof (limit_Size));
+		::memset( &limit_Size, 0, sizeof( limit_Size ) );
 	}
 };
 
@@ -202,14 +207,14 @@ class	FFTPlan
 	clfftStatus GetKernelGenKeyPvt (FFTKernelGenKeyParams & params) const;
 
 	template <clfftGenerators G>
-	clfftStatus GenerateKernelPvt (FFTRepo& fftRepo,  const cl_command_queue commQueueFFT ) const;
+	clfftStatus GenerateKernelPvt (FFTRepo& fftRepo,  const cl_command_queue& commQueueFFT ) const;
 
 	template <clfftGenerators G>
 	clfftStatus GetMax1DLengthPvt (size_t *longest ) const;
 
 public:
+
 	bool baked;
-	bool readFromFile;
 
 	//	Properties provided by the user.
 	clfftDim             dim;
@@ -227,8 +232,9 @@ public:
 	// TODO, change this logic for handling multiple GPUs/devices
 	cl_device_id bakeDevice;
 
+	// The devices member is disabled: a plan has a one-to-one mapping with the single device identified by bakeDevice
 	//	Devices that the user specified in the context passed to the create function
-	std::vector< cl_device_id > devices;
+	// std::vector< cl_device_id > devices;
 
 	//	Length of the FFT in each dimension
 	std::vector< size_t >	length;
@@ -239,14 +245,11 @@ public:
 	//	Hardware Limits
 	FFTEnvelope                 envelope;
 
-	//	Performance Tuning parameters
-	bool                    bLdsComplex;	// see ARBITRARY::LDS_COMPLEX
-	bool                    ldsPadding;     // see ARBITRARY::LDS_PADDING
-	unsigned                uLdsFraction;	// see ARBITRARY::LDS_FRACTION_IDEAL
 
 	// Reserved copy for large 1d, 2d, and 3d plan
 	size_t tmpBufSize;
 	cl_mem intBuffer;
+	bool libCreatedIntBuffer;
 
 	// for RC copies
 	size_t	tmpBufSizeRC;
@@ -256,21 +259,23 @@ public:
 	size_t  tmpBufSizeC2R;
 	cl_mem  intBufferC2R;
 
-	//extra cache size for 2d and 3d
-	size_t  cacheSize;
+
 	size_t  large1D;
 	bool    large2D;
-	size_t  large1D_Xfactor;
+	bool	twiddleFront;
+
 	clfftPlanHandle planX;
 	clfftPlanHandle planY;
 	clfftPlanHandle planZ;
 
 	bool transflag;
+	bool transOutHorizontal;
 	clfftPlanHandle planTX;
 	clfftPlanHandle planTY;
 	clfftPlanHandle planTZ; //reserve for 3D transpose
 
 	clfftPlanHandle planRCcopy;
+	clfftPlanHandle planCopy;
 
 	// Plan resources
 	//
@@ -279,23 +284,25 @@ public:
 	// Generator type
 	clfftGenerators gen;
 
-	// stored binaries
-	size_t number_of_devices;
-
-//TODO caching kernel binaries for later reload
-#if 0
-	std::unique_ptr<size_t[]> binary_sizes;
-	std::vector< std::unique_ptr<char[]> > binaries;
-#endif
 
 	// Real-Complex simple flag
 	// if this is set we do real to-and-from full complex using simple algorithm
 	// where imaginary of input is set to zero in forward and imaginary not written in backward
 	bool RCsimple;
 
+
+	// User created plan
+	bool userPlan;
+
+	// A flag to say that blocked FFTs are going to be performed
+	// It can only be one of these: column to row, row to column or column to column
+	// row to row is just the normal case where blocking is not needed
+	bool blockCompute;
+	BlockComputeType blockComputeType;
+
+
 	FFTPlan ()
 	:	baked (false)
-	,	readFromFile (false)
 	,	dim (CLFFT_1D)
 	,	inputLayout (CLFFT_COMPLEX_INTERLEAVED)
 	,	outputLayout (CLFFT_COMPLEX_INTERLEAVED)
@@ -309,28 +316,29 @@ public:
 	,	batchsize (1)
 	,   tmpBufSize (0)
 	,	intBuffer( NULL )
+	,	libCreatedIntBuffer(false)
 	,	tmpBufSizeRC (0)
 	,	intBufferRC( NULL )
 	,	tmpBufSizeC2R (0)
 	,	intBufferC2R( NULL )
 	,   large1D(0)
 	,   large2D(false)
+	,	twiddleFront(false)
 	,   planX( 0 )
 	,   planY( 0 )
 	,   planZ( 0 )
 	,   transflag(false)
+	,	transOutHorizontal(false)
 	,	RCsimple(false)
+	,	userPlan(false)
+	,	blockCompute(false)
+	,	blockComputeType(BCT_C2C)
 	,   planTX( 0 )
 	,   planTY( 0 )
 	,   planTZ( 0 )
 	,	planRCcopy(0)
+	,	planCopy(0)
 	,	const_buffer( NULL )
-	,	bLdsComplex (ARBITRARY::LDS_COMPLEX)
-	,   ldsPadding  (ARBITRARY::LDS_PADDING)
-	,	uLdsFraction (0/*ARBITRARY::LDS_FRACTION_IDEAL*/)
-	,   large1D_Xfactor(0)
-	,   cacheSize(0)
-	,	number_of_devices(0)
 	,	gen(Stockham)
 	{};
 
diff --git a/src/library/private.h b/src/library/private.h
index 7c00ca3..000ab65 100644
--- a/src/library/private.h
+++ b/src/library/private.h
@@ -270,8 +270,6 @@ inline tstring clfftErrorStatusAsString( const cl_int& status )
 
 //	This is used to either wrap an OpenCL function call, or to explicitly check a variable for an OpenCL error condition.
 //	If an error occurs, we issue a return statement to exit the calling function.
-#if defined( _DEBUG )
-
 #define OPENCL_V( fn, msg ) \
 { \
 	clfftStatus vclStatus = static_cast< clfftStatus >( fn ); \
@@ -290,23 +288,6 @@ inline tstring clfftErrorStatusAsString( const cl_int& status )
 	} \
 }
 
-#else
-
-#define OPENCL_V( fn, msg ) \
-{ \
-	clfftStatus vclStatus = static_cast< clfftStatus >( fn ); \
-	switch( vclStatus ) \
-	{ \
-		case	CL_SUCCESS:		/**< No error */ \
-			break; \
-		default: \
-		{ \
-			return	vclStatus; \
-		} \
-	} \
-}
-#endif
-
 static inline bool IsPo2 (size_t u) {
 	return (u != 0) &&  (0 == (u & (u-1)));
 }
@@ -353,8 +334,7 @@ CLFFTAPI clfftStatus	clfftWritePlanToDisk( clfftPlanHandle plHandle, const char*
 */
 CLFFTAPI clfftStatus	clfftReadPlanFromDisk( clfftPlanHandle plHandle, const char* filename );
 
-/* internal api to set up some plan paramters */
-CLFFTAPI clfftStatus clfftSetInternal( const clfftPlanHandle plHandle, void* dataInternal );
+
 
 #ifdef __cplusplus
 }
diff --git a/src/library/repo.cpp b/src/library/repo.cpp
index 0b6e532..6d44985 100644
--- a/src/library/repo.cpp
+++ b/src/library/repo.cpp
@@ -91,11 +91,12 @@ clfftStatus FFTRepo::releaseResources( )
 	return	CLFFT_SUCCESS;
 }
 
-clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const std::string& kernel, const cl_context& context )
+clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const std::string& kernel, const cl_device_id &device, const cl_context& planContext )
 {
 	scopedLock sLock( lockRepo, _T( "setProgramCode" ) );
 
-  std::pair<FFTKernelGenKeyParams, cl_context> Params = std::make_pair(fftParam, context);
+	ClPair clPair = std::make_pair(planContext, device);
+	std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
 	fftRepoKey key = std::make_pair( gen, Params );
 
 
@@ -126,11 +127,12 @@ clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelG
 	return	CLFFT_SUCCESS;
 }
 
-clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, std::string& kernel, const cl_context& context )
+clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, std::string& kernel, const cl_device_id &device, const cl_context& planContext )
 {
 	scopedLock sLock( lockRepo, _T( "getProgramCode" ) );
 
-  std::pair<FFTKernelGenKeyParams, cl_context> Params = std::make_pair(fftParam, context);
+	ClPair clPair = std::make_pair(planContext, device);
+	std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
 	fftRepoKey key = std::make_pair( gen, Params );
 
 	fftRepo_iterator pos = mapFFTs.find( key);
@@ -142,11 +144,12 @@ clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelG
 }
 
 clfftStatus FFTRepo::setProgramEntryPoints( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam,
-	const char * kernel_fwd, const char * kernel_back, const cl_context& context  )
+	const char * kernel_fwd, const char * kernel_back, const cl_device_id &device, const cl_context& planContext  )
 {
 	scopedLock sLock( lockRepo, _T( "setProgramEntryPoints" ) );
 
-  std::pair<FFTKernelGenKeyParams, cl_context> Params = std::make_pair(fftParam, context);
+	ClPair clPair = std::make_pair(planContext, device);
+	std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
 	fftRepoKey key = std::make_pair( gen, Params );
 
 	fftRepoValue& fft  = mapFFTs[ key ];
@@ -157,11 +160,12 @@ clfftStatus FFTRepo::setProgramEntryPoints( const clfftGenerators gen, const FFT
 }
 
 clfftStatus FFTRepo::getProgramEntryPoint( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam,
-			clfftDirection dir, std::string& kernel, const cl_context& context )
+			clfftDirection dir, std::string& kernel, const cl_device_id &device, const cl_context& planContext )
 {
 	scopedLock sLock( lockRepo, _T( "getProgramEntryPoint" ) );
 
-  std::pair<FFTKernelGenKeyParams, cl_context> Params = std::make_pair(fftParam, context);
+	ClPair clPair = std::make_pair(planContext, device);
+	std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
 	fftRepoKey key = std::make_pair( gen, Params );
 
 	fftRepo_iterator pos = mapFFTs.find( key );
@@ -186,7 +190,7 @@ clfftStatus FFTRepo::getProgramEntryPoint( const clfftGenerators gen, const FFTK
 	return	CLFFT_SUCCESS;
 }
 
-clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& prog )
+clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& prog, const cl_device_id &device, const cl_context& planContext )
 {
 	scopedLock sLock( lockRepo, _T( "setclProgram" ) );
 
@@ -196,7 +200,8 @@ clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGen
 
   OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
 
-  std::pair<FFTKernelGenKeyParams, cl_context> Params = std::make_pair(fftParam, ProgramContext);
+	ClPair clPair = std::make_pair(planContext, device);
+	std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
 	fftRepoKey key = std::make_pair( gen, Params );
 
 	fftRepo_iterator pos = mapFFTs.find( key );
@@ -213,11 +218,12 @@ clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGen
 	return	CLFFT_SUCCESS;
 }
 
-clfftStatus FFTRepo::getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& prog, const cl_context& PlanContext  )
+clfftStatus FFTRepo::getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& prog, const cl_device_id &device, const cl_context& planContext  )
 {
 	scopedLock sLock( lockRepo, _T( "getclProgram" ) );
 
-  std::pair<FFTKernelGenKeyParams, cl_context> Params = std::make_pair(fftParam, PlanContext);
+	ClPair clPair = std::make_pair(planContext, device);
+	std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
 	fftRepoKey key = std::make_pair( gen, Params );
 
 	fftRepo_iterator pos = mapFFTs.find( key );
@@ -227,9 +233,9 @@ clfftStatus FFTRepo::getclProgram( const clfftGenerators gen, const FFTKernelGen
 	if (NULL == prog)
 		return	CLFFT_INVALID_PROGRAM;
   
-  cl_context ProgContext;
-  clGetProgramInfo(prog, CL_PROGRAM_CONTEXT, sizeof(cl_context), &ProgContext, NULL);
-  if (PlanContext!=ProgContext)
+  cl_context progContext;
+  clGetProgramInfo(prog, CL_PROGRAM_CONTEXT, sizeof(cl_context), &progContext, NULL);
+  if (planContext!=progContext)
     return	CLFFT_INVALID_PROGRAM;
 
 	return	CLFFT_SUCCESS;
diff --git a/src/library/repo.h b/src/library/repo.h
index f2619e7..9adc349 100644
--- a/src/library/repo.h
+++ b/src/library/repo.h
@@ -51,7 +51,8 @@ class	FFTRepo
 	//	has created
 	//typedef std::pair< clfftGenerators, FFTKernelGenKeyParams > fftRepoKey;
 
-  typedef std::pair< clfftGenerators, std::pair<FFTKernelGenKeyParams, cl_context> > fftRepoKey;
+	typedef std::pair< cl_context, cl_device_id > ClPair;
+	typedef std::pair< clfftGenerators, std::pair<FFTKernelGenKeyParams, ClPair> > fftRepoKey;
 	typedef std::map< fftRepoKey, fftRepoValue > fftRepoType;
 	typedef fftRepoType::iterator fftRepo_iterator;
 
@@ -139,15 +140,14 @@ public:
 
 	clfftStatus releaseResources( );
 
-	clfftStatus setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams&, const std::string& kernel, const cl_context& context);
-	clfftStatus getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams&, std::string& kernel, const cl_context& context );
+	clfftStatus setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams&, const std::string& kernel, const cl_device_id &device, const cl_context& planContext );
+	clfftStatus getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams&, std::string& kernel, const cl_device_id &device, const cl_context& planContext );
 
-	clfftStatus setProgramEntryPoints( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam,
-		const char * kernel_fwd, const char * kernel_back, const cl_context& context );
-	clfftStatus getProgramEntryPoint( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, clfftDirection dir, std::string& kernel , const cl_context& context);
+	clfftStatus setProgramEntryPoints( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const char * kernel_fwd, const char * kernel_back, const cl_device_id &device, const cl_context& planContext );
+	clfftStatus getProgramEntryPoint( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, clfftDirection dir, std::string& kernel , const cl_device_id &device, const cl_context& planContext );
 
-	clfftStatus setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& kernel );
-	clfftStatus getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& kernel, const cl_context& PlanContext );
+	clfftStatus setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& prog, const cl_device_id &device, const cl_context& planContext );
+	clfftStatus getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& prog, const cl_device_id &device, const cl_context& planContext );
 
 	clfftStatus setclKernel ( cl_program prog, clfftDirection dir, const cl_kernel& kernel );
 	clfftStatus getclKernel ( cl_program prog, clfftDirection dir, cl_kernel& kernel );
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index 56a50dd..8ab7efa 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -89,6 +89,7 @@ clfftStatus clfftEnqueueTransform(
 		fftPlan->intBuffer = clCreateBuffer( fftPlan->context, CL_MEM_READ_WRITE,
 			fftPlan->tmpBufSize, 0, &status);
 		OPENCL_V( status, _T("Creating the intermediate buffer for large1D Failed") );
+		fftPlan->libCreatedIntBuffer = true;
 
 #if defined(DEBUGGING)
 		std::cout << "One intermediate buffer is created" << std::endl;
@@ -155,8 +156,6 @@ clfftStatus clfftEnqueueTransform(
 					_T("clfftEnqueueTransform large1D RC copy failed"));
 				clReleaseEvent(copyInEvents);
 
-				return	CLFFT_SUCCESS;
-
 			}
 			else if( fftPlan->outputLayout == CLFFT_REAL )
 			{
@@ -184,8 +183,6 @@ clfftStatus clfftEnqueueTransform(
 					_T("clfftEnqueueTransform large1D second column failed"));
 				clReleaseEvent(colOutEvents);
 
-
-				return	CLFFT_SUCCESS;
 			}
 			else
 			{
@@ -207,7 +204,7 @@ clfftStatus clfftEnqueueTransform(
 					//First time usage, we can initialize tmp buffer
 					OPENCL_V(clEnqueueWriteBuffer( *commQueues,
 						localIntBuffer,
-						1,		// blocking write
+						CL_TRUE,		// blocking write
 						0,
 						buffSizeBytes_complex,
 						&temp[0],
@@ -232,6 +229,15 @@ clfftStatus clfftEnqueueTransform(
 					else
 						mybuffers = clOutputBuffers;
 
+#if defined(DEBUGGING)
+								//  For debugging interleave data only,
+								//  read the input buffer back into memory.
+						clFinish(*commQueues);
+								OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+									NULL, NULL ),
+									_T("Reading the result buffer failed") );
+#endif
+
 					//First Row
 					//tmp->output
 					cl_event rowXOutEvents = NULL;
@@ -240,6 +246,16 @@ clfftStatus clfftEnqueueTransform(
 						_T("clfftEnqueueTransform for large1D rowX failed"));
 					clReleaseEvent(transTXOutEvents);
 
+
+#if defined(DEBUGGING)
+								//  For debugging interleave data only,
+								//  read the input buffer back into memory.
+						clFinish(*commQueues);
+								OPENCL_V( clEnqueueReadBuffer( *commQueues, *mybuffers, CL_TRUE, 0, 536870912, &temp[ 0 ], 0,
+									NULL, NULL ),
+									_T("Reading the result buffer failed") );
+#endif
+
 					//Second Transpose
 					// output->tmp
 					cl_event transTYOutEvents = NULL;
@@ -248,6 +264,16 @@ clfftStatus clfftEnqueueTransform(
 						_T("clfftEnqueueTransform for large1D transTY failed"));
 					clReleaseEvent(rowXOutEvents);
 
+
+#if defined(DEBUGGING)
+								//  For debugging interleave data only,
+								//  read the input buffer back into memory.
+						clFinish(*commQueues);
+								OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+									NULL, NULL ),
+									_T("Reading the result buffer failed") );
+#endif
+
 					//Second Row
 					//tmp->tmp, inplace
 					cl_event rowYOutEvents = NULL;
@@ -256,6 +282,15 @@ clfftStatus clfftEnqueueTransform(
 						_T("clfftEnqueueTransform for large1D rowY failed"));
 					clReleaseEvent(transTYOutEvents);
 
+#if defined(DEBUGGING)
+								//  For debugging interleave data only,
+								//  read the input buffer back into memory.
+						clFinish(*commQueues);
+								OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+									NULL, NULL ),
+									_T("Reading the result buffer failed") );
+#endif
+
 					//Third Transpose
 					// tmp->output
 					OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1,
@@ -263,117 +298,185 @@ clfftStatus clfftEnqueueTransform(
 						_T("clfftEnqueueTransform for large1D transTZ failed"));
 					clReleaseEvent(rowYOutEvents);
 
-					if( fftRepo.pStatTimer )
+				}
+				else
+				{
+					if (fftPlan->large1D == 0)
 					{
-						fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ) );
-					}
+						if(fftPlan->planCopy)
+						{
+							// Transpose OUTOFPLACE
+							cl_event transTXOutEvents = NULL;
+							OPENCL_V( clfftEnqueueTransform( fftPlan->planTX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+								waitEvents, &transTXOutEvents, clInputBuffers, &localIntBuffer, NULL ),
+								_T("clfftEnqueueTransform for large1D transTX failed"));
 
-					return	CLFFT_SUCCESS;
-				}
+#if defined(DEBUGGING)
+									//  For debugging interleave data only,
+									//  read the input buffer back into memory.
+							clFinish(*commQueues);
+									OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+										NULL, NULL ),
+										_T("Reading the result buffer failed") );
+#endif
 
-				cl_event colOutEvents = NULL;
-				if (fftPlan->large1D == 0)
-				{
-					// First pass
-					// column with twiddle first, OUTOFPLACE, + transpose
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
-						waitEvents, &colOutEvents, clInputBuffers, &localIntBuffer, localIntBuffer),
-						_T("clfftEnqueueTransform large1D col pass failed"));
+							// FFT INPLACE
+							cl_event rowXOutEvents = NULL;
+							OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, 1,
+								&transTXOutEvents, &rowXOutEvents, &localIntBuffer, NULL, NULL),
+								_T("clfftEnqueueTransform large1D first row pass failed"));
+							clReleaseEvent(transTXOutEvents);
 
 #if defined(DEBUGGING)
-					// debug purpose, interleave input <-> interleave output
-					// read the intermediate buffer and print part of it.
-					OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
-						&colOutEvents, NULL ),
-						_T("Reading the result buffer failed") );
-					{
-						FFTPlan* fftPlanX	= NULL;
-						lockRAII* planLockX	= NULL;
-						OPENCL_V( fftRepo.getPlan( fftPlan->planX, fftPlanX, planLockX ), _T( "fftRepo.getPlan failed" ) );
-
-						size_t rows = fftPlanX->length[0];
-						size_t cols = fftPlanX->batchsize;
-						BUG_CHECK (rows * cols <= temp.size())
-						size_t print_cols = std::min<size_t> (4, cols);
-						size_t print_rows = std::min<size_t> (4, rows);
-						//std::cout << std::endl << "Intermediate buffer:" << std::endl;
-						//for (size_t jrow = 0; jrow < print_rows; ++jrow) {
-						//	for (size_t icol = 0; icol < print_cols; ++icol) {
-						//		size_t index = jrow *cols + icol;
-						//		std::complex<float> data = temp[index];
-						//		std::cout << data;
-						//	}
-						//	std::cout << std::endl;
-						//}
-					}
+									//  For debugging interleave data only,
+									//  read the input buffer back into memory.
+							clFinish(*commQueues);
+									OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+										NULL, NULL ),
+										_T("Reading the result buffer failed") );
 #endif
 
-					//another column FFT output, OUTOFPLACE
-					if (fftPlan->placeness == CLFFT_INPLACE)
-					{
-						OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
-							outEvents, &localIntBuffer, clInputBuffers, localIntBuffer ),
-							_T("clfftEnqueueTransform large1D second column failed"));
+							// FFT INPLACE
+							cl_event colYOutEvents = NULL;
+							OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowXOutEvents,
+								&colYOutEvents, &localIntBuffer, NULL, NULL ),
+								_T("clfftEnqueueTransform large1D second column failed"));
+							clReleaseEvent(rowXOutEvents);
+									
+#if defined(DEBUGGING)
+									//  For debugging interleave data only,
+									//  read the input buffer back into memory.
+							clFinish(*commQueues);
+									OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+										NULL, NULL ),
+										_T("Reading the result buffer failed") );
+#endif
+
+							cl_mem *mybuffers;
+							if (fftPlan->placeness==CLFFT_INPLACE)
+								mybuffers = clInputBuffers;
+							else
+								mybuffers = clOutputBuffers;
+						
+							// Copy kernel
+							OPENCL_V( clfftEnqueueTransform( fftPlan->planCopy, dir, numQueuesAndEvents, commQueues, 1, &colYOutEvents,
+								outEvents, &localIntBuffer, mybuffers, NULL ),
+								_T("clfftEnqueueTransform large1D copy failed"));
+							clReleaseEvent(colYOutEvents);
+						}
+						else
+						{
+							cl_event colOutEvents = NULL;
+							// First pass
+							// column with twiddle first, OUTOFPLACE, + transpose
+							OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+								waitEvents, &colOutEvents, clInputBuffers, &localIntBuffer, localIntBuffer),
+								_T("clfftEnqueueTransform large1D col pass failed"));
 
 #if defined(DEBUGGING)
-						//  For debugging interleave data only,
-						//  read the input buffer back into memory.
-						OPENCL_V( clEnqueueReadBuffer( *commQueues, clInputBuffers[0], CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
-							outEvents, NULL ),
-							_T("Reading the result buffer failed") );
+							// debug purpose, interleave input <-> interleave output
+							// read the intermediate buffer and print part of it.
+							OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
+								&colOutEvents, NULL ),
+								_T("Reading the result buffer failed") );
 #endif
-					}
-					else
-					{
+							if(fftPlan->planTZ)
+							{
+								cl_event rowYOutEvents = NULL;
+								OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
+									&rowYOutEvents, &localIntBuffer, NULL, NULL ),
+									_T("clfftEnqueueTransform large1D second row failed"));
+
+								if (fftPlan->placeness == CLFFT_INPLACE)
+								{
+									OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1, &rowYOutEvents,
+										outEvents, &localIntBuffer, clInputBuffers, NULL ),
+										_T("clfftEnqueueTransform large1D trans3 failed"));
+								}
+								else
+								{
+									OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1, &rowYOutEvents,
+										outEvents, &localIntBuffer, clOutputBuffers, NULL ),
+										_T("clfftEnqueueTransform large1D trans3 failed"));
+								}
+						
+								clReleaseEvent(rowYOutEvents);
+
+							}
+							else
+							{
+								//another column FFT output, OUTOFPLACE
+								if (fftPlan->placeness == CLFFT_INPLACE)
+								{
+									OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
+										outEvents, &localIntBuffer, clInputBuffers, localIntBuffer ),
+										_T("clfftEnqueueTransform large1D second column failed"));
+
 #if defined(DEBUGGING)
-					// debug purpose, interleave input <-> interleave output
-					OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
-						&colOutEvents, NULL ),
-						_T("Reading the result buffer failed") );
+									//  For debugging interleave data only,
+									//  read the input buffer back into memory.
+									OPENCL_V( clEnqueueReadBuffer( *commQueues, clInputBuffers[0], CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
+										outEvents, NULL ),
+										_T("Reading the result buffer failed") );
 #endif
-						OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
-							outEvents, &localIntBuffer, clOutputBuffers, localIntBuffer ),
-							_T("clfftEnqueueTransform large1D second column failed"));
+								}
+								else
+								{
+#if defined(DEBUGGING)
+								// debug purpose, interleave input <-> interleave output
+								OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
+									&colOutEvents, NULL ),
+									_T("Reading the result buffer failed") );
+#endif
+									OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
+										outEvents, &localIntBuffer, clOutputBuffers, localIntBuffer ),
+										_T("clfftEnqueueTransform large1D second column failed"));
 
 #if defined(DEBUGGING)
-						//  For debugging interleave data only, read back the output buffer
-						//
-						OPENCL_V( clEnqueueReadBuffer( *commQueues, clOutputBuffers[0], CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
-							outEvents, NULL ),
-							_T("Reading the result buffer failed") );
+									//  For debugging interleave data only, read back the output buffer
+									//
+									OPENCL_V( clEnqueueReadBuffer( *commQueues, clOutputBuffers[0], CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
+										outEvents, NULL ),
+										_T("Reading the result buffer failed") );
 #endif
+								}
+							}
+
+							clReleaseEvent(colOutEvents);
+						}
 					}
-				}
-				else
-				{
-					// second pass for huge 1D
-					// column with twiddle first, OUTOFPLACE, + transpose
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
-						waitEvents, &colOutEvents, &localIntBuffer, clOutputBuffers, localIntBuffer),
-						_T("clfftEnqueueTransform Huge1D col pass failed"));
+					else
+					{
+						cl_event colOutEvents = NULL;
+
+						// second pass for huge 1D
+						// column with twiddle first, OUTOFPLACE, + transpose
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+							waitEvents, &colOutEvents, &localIntBuffer, clOutputBuffers, localIntBuffer),
+							_T("clfftEnqueueTransform Huge1D col pass failed"));
 #if defined(DEBUGGING)
-					// debug purpose, interleave input <-> interleave output
-					OPENCL_V( clEnqueueReadBuffer( *commQueues, clOutputBuffers[0], CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
-						&colOutEvents, NULL ),
-						_T("Reading the result buffer failed") );
+						// debug purpose, interleave input <-> interleave output
+						OPENCL_V( clEnqueueReadBuffer( *commQueues, clOutputBuffers[0], CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 1,
+							&colOutEvents, NULL ),
+							_T("Reading the result buffer failed") );
 #endif
 
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
-						outEvents, clOutputBuffers, clOutputBuffers, localIntBuffer ),
-						_T("clfftEnqueueTransform large1D second column failed"));
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
+							outEvents, clOutputBuffers, clOutputBuffers, localIntBuffer ),
+							_T("clfftEnqueueTransform large1D second column failed"));
 
+						clReleaseEvent(colOutEvents);
+					}
 				}
+			}
 
-				clReleaseEvent(colOutEvents);
+			if( fftRepo.pStatTimer )
+			{
+				fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ) );
+			}
 
-				if( fftRepo.pStatTimer )
-				{
-					fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ) );
-				}
+			return	CLFFT_SUCCESS;
 
-				return	CLFFT_SUCCESS;
-			}
-			break;
 		}
 		case CLFFT_2D:
 		{
@@ -388,7 +491,8 @@ clfftStatus clfftEnqueueTransform(
 			//size_t buffSizeBytes=sizeof( std::complex< float > )*buffersize;
 			//std::vector< std::complex< float > > output2( buffersize );
 			size_t buffSizeBytes=sizeof( float) * buffersize;
-			std::vector<float> output2(buffersize*2);
+			//std::vector<float> output2(buffersize*2);
+			float *output2 = new float[buffersize*2];
 #endif
 #if defined(DEBUGGING)
 			OPENCL_V( clEnqueueReadBuffer( *commQueues, clInputBuffers[0], CL_TRUE, 0, buffSizeBytes, &output2[ 0 ], 0,
@@ -507,134 +611,132 @@ clfftStatus clfftEnqueueTransform(
 					}
 
 				}
-
-				if( fftRepo.pStatTimer )
-				{
-					fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ) );
-				}
-
-				return CLFFT_SUCCESS;
-			}
-
-			if ( (fftPlan->large2D || fftPlan->length.size()>2) &&
-				(fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
-			{
-				if (fftPlan->placeness==CLFFT_INPLACE)
-				{
-					//deal with row first
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
-						waitEvents, &rowOutEvents, clInputBuffers, NULL, localIntBuffer ),
-						_T("clfftEnqueueTransform for row failed"));
-
-					//deal with column
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
-						outEvents, clInputBuffers, NULL, localIntBuffer ),
-						_T("clfftEnqueueTransform for column failed"));
-				}
-				else
-				{
-					//deal with row first
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
-						waitEvents, &rowOutEvents, clInputBuffers, clOutputBuffers, localIntBuffer ),
-						_T("clfftEnqueueTransform for row failed"));
-
-					//deal with column
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
-						outEvents, clOutputBuffers, NULL, localIntBuffer ),
-						_T("clfftEnqueueTransform for column failed"));
-
-				}
 			}
 			else
 			{
-				if(fftPlan->inputLayout == CLFFT_REAL)
+
+				if ( (fftPlan->large2D || fftPlan->length.size()>2) &&
+					(fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
 				{
 					if (fftPlan->placeness==CLFFT_INPLACE)
 					{
-						// deal with row
-						OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_FORWARD, numQueuesAndEvents, commQueues, numWaitEvents,
+						//deal with row first
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
 							waitEvents, &rowOutEvents, clInputBuffers, NULL, localIntBuffer ),
 							_T("clfftEnqueueTransform for row failed"));
 
-						// deal with column
-						OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+						//deal with column
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
 							outEvents, clInputBuffers, NULL, localIntBuffer ),
 							_T("clfftEnqueueTransform for column failed"));
 					}
 					else
 					{
-						// deal with row
-						OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_FORWARD, numQueuesAndEvents, commQueues, numWaitEvents,
+						//deal with row first
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
 							waitEvents, &rowOutEvents, clInputBuffers, clOutputBuffers, localIntBuffer ),
 							_T("clfftEnqueueTransform for row failed"));
 
-						// deal with column
-						OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+						//deal with column
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
 							outEvents, clOutputBuffers, NULL, localIntBuffer ),
 							_T("clfftEnqueueTransform for column failed"));
+
 					}
 				}
-				else if(fftPlan->outputLayout == CLFFT_REAL)
+				else
 				{
-					cl_mem *out_local, *int_local, *out_y;
-
-					if(fftPlan->length.size() > 2)
+					if(fftPlan->inputLayout == CLFFT_REAL)
 					{
-						out_local = clOutputBuffers;
-						int_local = NULL;
-						out_y = clInputBuffers;
+						if (fftPlan->placeness==CLFFT_INPLACE)
+						{
+							// deal with row
+							OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_FORWARD, numQueuesAndEvents, commQueues, numWaitEvents,
+								waitEvents, &rowOutEvents, clInputBuffers, NULL, localIntBuffer ),
+								_T("clfftEnqueueTransform for row failed"));
+
+							// deal with column
+							OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+								outEvents, clInputBuffers, NULL, localIntBuffer ),
+								_T("clfftEnqueueTransform for column failed"));
+						}
+						else
+						{
+							// deal with row
+							OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_FORWARD, numQueuesAndEvents, commQueues, numWaitEvents,
+								waitEvents, &rowOutEvents, clInputBuffers, clOutputBuffers, localIntBuffer ),
+								_T("clfftEnqueueTransform for row failed"));
+
+							// deal with column
+							OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+								outEvents, clOutputBuffers, NULL, localIntBuffer ),
+								_T("clfftEnqueueTransform for column failed"));
+						}
 					}
-					else
+					else if(fftPlan->outputLayout == CLFFT_REAL)
 					{
-						out_local = (fftPlan->placeness==CLFFT_INPLACE) ? clInputBuffers : clOutputBuffers;
-						int_local = fftPlan->tmpBufSizeC2R ? &(fftPlan->intBufferC2R) : &localIntBuffer;
-						out_y = int_local;
-					}
+						cl_mem *out_local, *int_local, *out_y;
+
+						if(fftPlan->length.size() > 2)
+						{
+							out_local = clOutputBuffers;
+							int_local = NULL;
+							out_y = clInputBuffers;
+						}
+						else
+						{
+							out_local = (fftPlan->placeness==CLFFT_INPLACE) ? clInputBuffers : clOutputBuffers;
+							int_local = fftPlan->tmpBufSizeC2R ? &(fftPlan->intBufferC2R) : &localIntBuffer;
+							out_y = int_local;
+						}
 
 
-					// deal with column
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, numWaitEvents,
-						waitEvents, &rowOutEvents, clInputBuffers, int_local, localIntBuffer ),
-						_T("clfftEnqueueTransform for row failed"));
-
-					// deal with row
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
-						outEvents, out_y, out_local, localIntBuffer ),
-						_T("clfftEnqueueTransform for column failed"));
-
-				}
-				else
-				{
-					//deal with row first
-					OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
-						waitEvents, &rowOutEvents, clInputBuffers, &localIntBuffer, localIntBuffer ),
-						_T("clfftEnqueueTransform for row failed"));
-
+						// deal with column
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, numWaitEvents,
+							waitEvents, &rowOutEvents, clInputBuffers, int_local, localIntBuffer ),
+							_T("clfftEnqueueTransform for row failed"));
 
-					if (fftPlan->placeness==CLFFT_INPLACE)
-					{
-						//deal with column
-						OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
-							outEvents, &localIntBuffer, clInputBuffers, localIntBuffer ),
+						// deal with row
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+							outEvents, out_y, out_local, localIntBuffer ),
 							_T("clfftEnqueueTransform for column failed"));
+
 					}
 					else
 					{
-						//deal with column
-						OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
-							outEvents, &localIntBuffer, clOutputBuffers, localIntBuffer ),
-							_T("clfftEnqueueTransform for column failed"));
+						//deal with row first
+						OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+							waitEvents, &rowOutEvents, clInputBuffers, &localIntBuffer, localIntBuffer ),
+							_T("clfftEnqueueTransform for row failed"));
 
-		#if defined(DEBUGGING)
-						OPENCL_V( clEnqueueReadBuffer( *commQueues, clOutputBuffers[0], CL_TRUE, 0, buffSizeBytes, &output2[ 0 ], 1,
-							outEvents, NULL ),
-							_T("Reading the result buffer failed") );
-		#endif
+
+						if (fftPlan->placeness==CLFFT_INPLACE)
+						{
+							//deal with column
+							OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+								outEvents, &localIntBuffer, clInputBuffers, localIntBuffer ),
+								_T("clfftEnqueueTransform for column failed"));
+						}
+						else
+						{
+							//deal with column
+							OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
+								outEvents, &localIntBuffer, clOutputBuffers, localIntBuffer ),
+								_T("clfftEnqueueTransform for column failed"));
+
+			#if defined(DEBUGGING)
+							OPENCL_V( clEnqueueReadBuffer( *commQueues, clOutputBuffers[0], CL_TRUE, 0, buffSizeBytes, &output2[ 0 ], 1,
+								outEvents, NULL ),
+								_T("Reading the result buffer failed") );
+			#endif
+						}
 					}
 				}
+
+				clReleaseEvent(rowOutEvents);
+
 			}
 
-			clReleaseEvent(rowOutEvents);
 
 			if( fftRepo.pStatTimer )
 			{
@@ -1162,7 +1264,7 @@ clfftStatus clfftEnqueueTransform(
 
 	cl_program	prog;
 	cl_kernel	kern;
-	OPENCL_V( fftRepo.getclProgram( fftPlan->gen, fftParams, prog, fftPlan->context ), _T( "fftRepo.getclProgram failed" ) );
+	OPENCL_V( fftRepo.getclProgram( fftPlan->gen, fftParams, prog, fftPlan->bakeDevice, fftPlan->context ), _T( "fftRepo.getclProgram failed" ) );
 	OPENCL_V( fftRepo.getclKernel( prog, dir, kern ), _T( "fftRepo.getclKernels failed" ) );
 
 
@@ -1215,11 +1317,11 @@ clfftStatus clfftEnqueueTransform(
 	}
 	BUG_CHECK (gWorkSize.size() == lWorkSize.size());
 
-	size_t *lwSize = NULL;
-	if(fftPlan->gen != Copy) lwSize = &lWorkSize[ 0 ];
+	//size_t *lwSize = NULL;
+	//if(fftPlan->gen != Copy) lwSize = &lWorkSize[ 0 ];
 
 	status = clEnqueueNDRangeKernel( *commQueues, kern, static_cast< cl_uint >( gWorkSize.size( ) ),
-		NULL, &gWorkSize[ 0 ], lwSize, numWaitEvents, waitEvents, outEvents );
+		NULL, &gWorkSize[ 0 ], &lWorkSize[ 0 ], numWaitEvents, waitEvents, outEvents );
 	OPENCL_V( status, _T( "clEnqueueNDRangeKernel failed" ) );
 
 	if( fftRepo.pStatTimer )
diff --git a/src/tests/accuracy_test_pow2.cpp b/src/tests/accuracy_test_pow2.cpp
index 56f6bfd..e395e04 100644
--- a/src/tests/accuracy_test_pow2.cpp
+++ b/src/tests/accuracy_test_pow2.cpp
@@ -1291,6 +1291,91 @@ TEST_F(accuracy_test_pow2_double, large_1D_forward_in_place_complex_planar_to_co
 	catch( const std::exception& err ) { handle_exception(err);	}
 }
 
+// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ //
+// ^^^^^^^^^^^^^^^^^^^^^^^ huge 1D ^^^^^^^^^^^^^^^^^^^^^^^ //
+// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ //
+
+// *****************************************************
+// *****************************************************
+
+//#define CLFFT_TEST_HUGE
+#ifdef CLFFT_TEST_HUGE
+// HUGE_TEST_MAKE(test_name, len, bat): defines template function test_name() running a 1D complex_to_complex check of length len and batch bat (forward, in-place, planar in and out, sawtooth data).
+#define HUGE_TEST_MAKE(test_name, len, bat) \
+template< class T, class cl_T, class fftw_T > \
+void test_name() \
+{ \
+	std::vector<size_t> lengths; \
+	lengths.push_back( len ); \
+	size_t batch = bat; \
+	/* strides are passed empty and distances as 0 -- presumably this selects the harness defaults; TODO confirm */ \
+	std::vector<size_t> input_strides; \
+	std::vector<size_t> output_strides; \
+	size_t input_distance = 0; \
+	size_t output_distance = 0; \
+	layout::buffer_layout_t in_layout = layout::complex_planar; \
+	layout::buffer_layout_t out_layout = layout::complex_planar; \
+	placeness::placeness_t placeness = placeness::in_place; \
+	direction::direction_t direction = direction::forward; \
+\
+	data_pattern pattern = sawtooth; \
+	complex_to_complex<T, cl_T, fftw_T>( pattern, direction, lengths, batch, input_strides, output_strides, input_distance, output_distance, in_layout, out_layout, placeness ); \
+}
+// SP_HUGE_TEST(test_name, len, bat): expands HUGE_TEST_MAKE, then a TEST_F in accuracy_test_pow2_single that calls test_name<float, cl_float, fftwf_complex>() and passes any std::exception to handle_exception.
+#define SP_HUGE_TEST(test_name, len, bat) \
+\
+	HUGE_TEST_MAKE(test_name, len, bat) \
+\
+	TEST_F(accuracy_test_pow2_single, test_name) \
+	{ \
+		try { test_name< float, cl_float, fftwf_complex >(); } \
+		catch( const std::exception& err ) { handle_exception(err);	} \
+	}
+// DP_HUGE_TEST(test_name, len, bat): expands HUGE_TEST_MAKE, then a TEST_F in accuracy_test_pow2_double that calls test_name<double, cl_double, fftw_complex>() and passes any std::exception to handle_exception.
+#define DP_HUGE_TEST(test_name, len, bat) \
+\
+	HUGE_TEST_MAKE(test_name, len, bat) \
+\
+	TEST_F(accuracy_test_pow2_double, test_name) \
+	{ \
+		try { test_name< double, cl_double, fftw_complex >(); } \
+		catch( const std::exception& err ) { handle_exception(err);	} \
+	}
+// Single precision, "huge" lengths 2^20 .. 2^26; arguments are (test name, transform length, batch).
+SP_HUGE_TEST( huge_sp_test_1, 1048576,    11 )
+SP_HUGE_TEST( huge_sp_test_2, 1048576*2,  7  )
+SP_HUGE_TEST( huge_sp_test_3, 1048576*4,  3  )
+SP_HUGE_TEST( huge_sp_test_4, 1048576*8,  5  )
+SP_HUGE_TEST( huge_sp_test_5, 1048576*16, 3  )
+SP_HUGE_TEST( huge_sp_test_6, 1048576*32, 2  )
+SP_HUGE_TEST( huge_sp_test_7, 1048576*64, 1  )
+// Double precision, "huge" lengths 2^19 .. 2^25.
+DP_HUGE_TEST( huge_dp_test_1, 524288,    11 )
+DP_HUGE_TEST( huge_dp_test_2, 524288*2,  7  )
+DP_HUGE_TEST( huge_dp_test_3, 524288*4,  3  )
+DP_HUGE_TEST( huge_dp_test_4, 524288*8,  5  )
+DP_HUGE_TEST( huge_dp_test_5, 524288*16, 3  )
+DP_HUGE_TEST( huge_dp_test_6, 524288*32, 2  )
+DP_HUGE_TEST( huge_dp_test_7, 524288*64, 1  )
+// Single precision, "large" lengths 2^13 .. 2^19.
+SP_HUGE_TEST( large_sp_test_1, 8192,    11 )
+SP_HUGE_TEST( large_sp_test_2, 8192*2,  7  )
+SP_HUGE_TEST( large_sp_test_3, 8192*4,  3  )
+SP_HUGE_TEST( large_sp_test_4, 8192*8,  5  )
+SP_HUGE_TEST( large_sp_test_5, 8192*16, 3  )
+SP_HUGE_TEST( large_sp_test_6, 8192*32, 21  )
+SP_HUGE_TEST( large_sp_test_7, 8192*64, 17  )
+// Double precision, "large" lengths 2^12 .. 2^18.
+DP_HUGE_TEST( large_dp_test_1, 4096,    11 )
+DP_HUGE_TEST( large_dp_test_2, 4096*2,  7  )
+DP_HUGE_TEST( large_dp_test_3, 4096*4,  3  )
+DP_HUGE_TEST( large_dp_test_4, 4096*8,  5  )
+DP_HUGE_TEST( large_dp_test_5, 4096*16, 3  )
+DP_HUGE_TEST( large_dp_test_6, 4096*32, 21  )
+DP_HUGE_TEST( large_dp_test_7, 4096*64, 17  )
+
+#endif
+
 // *****************************************************
 // *****************************************************
 template< class T, class cl_T, class fftw_T >

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git