[clfft] 25/107: adding the binary caching feature to store plans on disk

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Jul 30 18:06:29 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit 6ab7f558a5d3eeeac830b69251e27593d3c3ef35
Author: bragadeesh <bragadeesh.natarajan at amd>
Date:   Wed Mar 4 12:14:30 2015 -0600

    adding the binary caching feature to store plans on disk
---
 src/library/CMakeLists.txt               |   6 +
 src/library/action.cpp                   | 762 +++++++++++++++++++++++++++++++
 src/library/action.h                     | 257 +++++++++++
 src/library/fft_binary_lookup.cpp        | 688 ++++++++++++++++++++++++++++
 src/library/fft_binary_lookup.h          | 276 +++++++++++
 src/library/generator.copy.cpp           | 155 ++++---
 src/library/generator.stockham.cpp       | 220 +++++----
 src/library/generator.transpose.gcn.cpp  | 168 ++++---
 src/library/generator.transpose.vliw.cpp | 154 +++++--
 src/library/lifetime.cpp                 |   2 +
 src/library/mainpage.h                   |   8 +-
 src/library/md5sum.c                     | 312 +++++++++++++
 src/library/md5sum.h                     |  50 ++
 src/library/plan.cpp                     | 639 +++-----------------------
 src/library/plan.h                       | 151 +++++-
 src/library/private.h                    |  21 +
 src/library/repo.cpp                     |  51 +--
 src/library/repo.h                       |  73 ++-
 src/library/transform.cpp                | 493 +-------------------
 19 files changed, 3137 insertions(+), 1349 deletions(-)

diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 0c81ae3..ebd38a7 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -24,6 +24,9 @@ set( clFFT.Source	transform.cpp
 								generator.transpose.gcn.cpp 
 								generator.copy.cpp
 								lifetime.cpp 
+								fft_binary_lookup.cpp
+								md5sum.c
+								action.cpp
 								stdafx.cpp )
 
 # Windows only uses dllmain
@@ -32,6 +35,7 @@ if( MSVC )
 endif( )
 
 set( clFFT.Headers	private.h 
+					action.h
 					repo.h 
 					plan.h 
 					lock.h 
@@ -40,6 +44,8 @@ set( clFFT.Headers	private.h
 					generator.stockham.h 
 					generator.transpose.vliw.h 
 					generator.transpose.gcn.h 
+					fft_binary_lookup.h
+					md5sum.h
 					../include/stdafx.h 
 					../include/unicode.compatibility.h 
 					../include/targetver.h 
diff --git a/src/library/action.cpp b/src/library/action.cpp
new file mode 100644
index 0000000..b9b9884
--- /dev/null
+++ b/src/library/action.cpp
@@ -0,0 +1,762 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include "stdafx.h"
+#include <math.h>
+#include "private.h"
+#include "repo.h"
+#include "plan.h"
+#include "generator.stockham.h"
+#include "../include/convenienceFunctions.h"
+
+#include "action.h"
+#include "fft_binary_lookup.h"
+
+#define FFT_CACHE_DEBUG 0
+
+
+
+FFTCopyAction::FFTCopyAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+    : FFTAction(plan, err)
+{
+    if (err != CLFFT_SUCCESS)
+    {
+        // FFTAction() failed, exit constructor
+        return;
+    }
+
+    err = CLFFT_SUCCESS;
+}
+
+FFTTransposeVLIWAction::FFTTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+    : FFTAction(plan, err)
+{
+    if (err != CLFFT_SUCCESS)
+    {
+        // FFTAction() failed, exit constructor
+        return;
+    }
+
+    err = CLFFT_SUCCESS;
+}
+
+FFTTransposeGCNAction::FFTTransposeGCNAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+    : FFTAction(plan, err)
+{
+    if (err != CLFFT_SUCCESS)
+    {
+        // FFTAction() failed, exit constructor
+        return;
+    }
+
+    err = CLFFT_SUCCESS;
+}
+
+
+FFTStockhamAction::FFTStockhamAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+    : FFTAction(plan, err)
+{
+    if (err != CLFFT_SUCCESS)
+    {
+        // FFTAction() failed, exit constructor
+        return;
+    }
+
+    err = CLFFT_SUCCESS;
+}
+
+
+
+FFTAction::FFTAction(FFTPlan * fftPlan, clfftStatus & err)
+    : plan(fftPlan)
+{
+    err = CLFFT_SUCCESS;
+}
+
+clfftStatus FFTAction::selectBufferArguments(FFTPlan * fftPlan,
+                                             cl_mem* clInputBuffers,
+                                             cl_mem* clOutputBuffers,
+                                             std::vector< cl_mem > &inputBuff,
+                                             std::vector< cl_mem > &outputBuff)
+{
+    
+    // 1d with normal length will fall into the below category
+    // add: 2d transpose kernel will fall into here too.
+    inputBuff.reserve( 2 );
+    outputBuff.reserve( 2 );
+
+    //	Decode the relevant properties from the plan paramter to figure out how many input/output buffers we have
+    switch( fftPlan->inputLayout )
+    {
+    case CLFFT_COMPLEX_INTERLEAVED:
+    {
+        switch( fftPlan->outputLayout )
+        {
+        case CLFFT_COMPLEX_INTERLEAVED:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+            }
+
+            break;
+        }
+        case CLFFT_COMPLEX_PLANAR:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                //	Invalid to be an inplace transform, and go from 1 to 2 buffers
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 1 ] );
+            }
+
+            break;
+        }
+        case CLFFT_HERMITIAN_INTERLEAVED:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+            }
+
+            break;
+        }
+        case CLFFT_HERMITIAN_PLANAR:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 1 ] );
+            }
+
+            break;
+        }
+        case CLFFT_REAL:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+            }
+
+            break;
+        }
+        default:
+        {
+            //	Don't recognize output layout
+            return CLFFT_INVALID_ARG_VALUE;
+        }
+        }
+
+        break;
+    }
+    case CLFFT_COMPLEX_PLANAR:
+    {
+        switch( fftPlan->outputLayout )
+        {
+        case CLFFT_COMPLEX_INTERLEAVED:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                inputBuff.push_back( clInputBuffers[ 1 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+            }
+
+            break;
+        }
+        case CLFFT_COMPLEX_PLANAR:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                inputBuff.push_back( clInputBuffers[ 1 ] );
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                inputBuff.push_back( clInputBuffers[ 1 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 1 ] );
+            }
+
+            break;
+        }
+        case CLFFT_HERMITIAN_INTERLEAVED:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                inputBuff.push_back( clInputBuffers[ 1 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+            }
+
+            break;
+        }
+        case CLFFT_HERMITIAN_PLANAR:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                inputBuff.push_back( clInputBuffers[ 1 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 1 ] );
+            }
+
+            break;
+        }
+        case CLFFT_REAL:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                inputBuff.push_back( clInputBuffers[ 1 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+            }
+
+            break;
+        }
+        default:
+        {
+            //	Don't recognize output layout
+            return CLFFT_INVALID_ARG_VALUE;
+        }
+        }
+
+        break;
+    }
+    case CLFFT_HERMITIAN_INTERLEAVED:
+    {
+        switch( fftPlan->outputLayout )
+        {
+        case CLFFT_COMPLEX_INTERLEAVED:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+            }
+
+            break;
+        }
+        case CLFFT_COMPLEX_PLANAR:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 1 ] );
+            }
+
+            break;
+        }
+        case CLFFT_HERMITIAN_INTERLEAVED:
+        {
+            return CLFFT_INVALID_ARG_VALUE;
+        }
+        case CLFFT_HERMITIAN_PLANAR:
+        {
+            return CLFFT_INVALID_ARG_VALUE;
+        }
+        case CLFFT_REAL:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+            }
+
+            break;
+        }
+        default:
+        {
+            //	Don't recognize output layout
+            return CLFFT_INVALID_ARG_VALUE;
+        }
+        }
+
+        break;
+    }
+    case CLFFT_HERMITIAN_PLANAR:
+    {
+        switch( fftPlan->outputLayout )
+        {
+        case CLFFT_COMPLEX_INTERLEAVED:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                inputBuff.push_back( clInputBuffers[ 1 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+            }
+
+            break;
+        }
+        case CLFFT_COMPLEX_PLANAR:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                inputBuff.push_back( clInputBuffers[ 1 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 1 ] );
+            }
+
+            break;
+        }
+        case CLFFT_HERMITIAN_INTERLEAVED:
+        {
+            return CLFFT_INVALID_ARG_VALUE;
+        }
+        case CLFFT_HERMITIAN_PLANAR:
+        {
+            return CLFFT_INVALID_ARG_VALUE;
+        }
+        case CLFFT_REAL:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                inputBuff.push_back( clInputBuffers[ 1 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+            }
+
+            break;
+        }
+        default:
+        {
+            //	Don't recognize output layout
+            return CLFFT_INVALID_ARG_VALUE;
+        }
+        }
+
+        break;
+    }
+    case CLFFT_REAL:
+    {
+        switch( fftPlan->outputLayout )
+        {
+        case CLFFT_COMPLEX_INTERLEAVED:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+            }
+
+            break;
+        }
+        case CLFFT_COMPLEX_PLANAR:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 1 ] );
+            }
+
+            break;
+        }
+        case CLFFT_HERMITIAN_INTERLEAVED:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+            }
+
+            break;
+        }
+        case CLFFT_HERMITIAN_PLANAR:
+        {
+            if( fftPlan->placeness == CLFFT_INPLACE )
+            {
+                return CLFFT_INVALID_ARG_VALUE;
+            }
+            else
+            {
+                inputBuff.push_back( clInputBuffers[ 0 ] );
+
+                outputBuff.push_back( clOutputBuffers[ 0 ] );
+                outputBuff.push_back( clOutputBuffers[ 1 ] );
+            }
+
+            break;
+        }
+        default:
+        {
+            //	Don't recognize output layout
+            return CLFFT_INVALID_ARG_VALUE;
+        }
+        }
+
+        break;
+    }
+    default:
+    {
+        //	Don't recognize output layout
+        return CLFFT_INVALID_ARG_VALUE;
+    }
+    }
+
+    return CLFFT_SUCCESS;
+}
+
+
+clfftStatus FFTAction::enqueue(clfftPlanHandle plHandle,
+                               clfftDirection dir,
+                               cl_uint numQueuesAndEvents,
+                               cl_command_queue* commQueues,
+                               cl_uint numWaitEvents,
+                               const cl_event* waitEvents,
+                               cl_event* outEvents,
+                               cl_mem* clInputBuffers,
+                               cl_mem* clOutputBuffers)
+{
+    FFTRepo & fftRepo = FFTRepo::getInstance();
+
+    std::vector< cl_mem > inputBuff;
+    std::vector< cl_mem > outputBuff;
+
+
+    clfftStatus status = selectBufferArguments(this->plan,
+                                               clInputBuffers, clOutputBuffers,
+                                               inputBuff, outputBuff);
+
+    if (status != CLFFT_SUCCESS)
+    {
+        return status;
+    }
+    
+    //	TODO:  In the case of length == 1, FFT is a trivial NOP, but we still need to apply the forward and backwards tranforms
+    //	TODO:  Are map lookups expensive to call here?  We can cache a pointer to the cl_program/cl_kernel in the plan
+
+    //	Translate the user plan into the structure that we use to map plans to clPrograms
+
+    cl_program	prog;
+    cl_kernel	kern;
+    OPENCL_V( fftRepo.getclProgram( this->getGenerator(), this->getSignatureData(), prog, this->plan->bakeDevice, this->plan->context ), _T( "fftRepo.getclProgram failed" ) );
+    OPENCL_V( fftRepo.getclKernel( prog, dir, kern ), _T( "fftRepo.getclKernels failed" ) );
+
+    cl_uint uarg = 0;
+    if (!this->plan->transflag && !(this->plan->gen == Copy))
+    {
+        //	::clSetKernelArg() is not thread safe, according to the openCL spec for the same cl_kernel object
+        //	TODO:  Need to verify that two different plans (which would get through our lock above) with exactly the same
+        //	parameters would NOT share the same cl_kernel objects
+
+        /* constant buffer */
+        OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&this->plan->const_buffer ), _T( "clSetKernelArg failed" ) );
+    }
+
+    //	Input buffer(s)
+    //	Input may be 1 buffer  (CLFFT_COMPLEX_INTERLEAVED)
+    //	          or 2 buffers (CLFFT_COMPLEX_PLANAR)
+
+    for (size_t i = 0; i < inputBuff.size(); ++i)
+    {
+        OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&inputBuff[i] ), _T( "clSetKernelArg failed" ) );
+    }
+    //	Output buffer(s)
+    //	Output may be 0 buffers (CLFFT_INPLACE)
+    //	           or 1 buffer  (CLFFT_COMPLEX_INTERLEAVED)
+    //	           or 2 buffers (CLFFT_COMPLEX_PLANAR)
+    for (size_t o = 0; o < outputBuff.size(); ++o)
+    {
+        OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&outputBuff[o] ), _T( "clSetKernelArg failed" ) );
+    }
+
+    std::vector< size_t > gWorkSize;
+    std::vector< size_t > lWorkSize;
+    clfftStatus result = this->getWorkSizes (gWorkSize, lWorkSize);
+
+    // TODO:  if getWorkSizes returns CLFFT_INVALID_GLOBAL_WORK_SIZE, that means
+    // that this multidimensional input data array is too large to be transformed
+    // with a single call to clEnqueueNDRangeKernel.  For now, we will just return
+    // the error code back up the call stack.
+    // The *correct* course of action would be to split the work into mutliple
+    // calls to clEnqueueNDRangeKernel.
+    if (CLFFT_INVALID_GLOBAL_WORK_SIZE == result)
+    {
+        OPENCL_V( result, _T("Work size too large for clEnqueNDRangeKernel()"));
+    }
+    else
+    {
+        OPENCL_V( result, _T("FFTAction::getWorkSizes failed"));
+    }
+    BUG_CHECK (gWorkSize.size() == lWorkSize.size());
+
+
+    cl_int call_status = clEnqueueNDRangeKernel( *commQueues, kern, static_cast< cl_uint >( gWorkSize.size( ) ),
+                                            NULL, &gWorkSize[ 0 ],  &lWorkSize[ 0 ], numWaitEvents, waitEvents, outEvents );
+    OPENCL_V( call_status, _T( "clEnqueueNDRangeKernel failed" ) );
+
+    if( fftRepo.pStatTimer )
+    {
+        fftRepo.pStatTimer->AddSample( plHandle, this->plan, kern, numQueuesAndEvents, outEvents, gWorkSize );
+    }
+
+    return CLFFT_SUCCESS;
+}
+
+
+
+//	Read the kernels that this plan uses from file, and store into the plan
+clfftStatus FFTAction::writeKernel( const clfftPlanHandle plHandle, const clfftGenerators gen, const FFTKernelSignatureHeader* data, const cl_context& context, const cl_device_id &device )
+{
+    FFTRepo& fftRepo	= FFTRepo::getInstance( );
+
+    std::string kernelPath = getKernelName(gen, plHandle, true);
+
+    //	Logic to write string contents out to file
+    tofstreamRAII< std::ofstream, std::string > kernelFile( kernelPath.c_str( ) );
+    if( !kernelFile.get( ) )
+    {
+        std::cerr << "Failed to open kernel file for writing: " << kernelPath.c_str( ) << std::endl;
+        return CLFFT_FILE_CREATE_FAILURE;
+    }
+
+    std::string kernel;
+    OPENCL_V( fftRepo.getProgramCode( gen, data, kernel, device, context ), _T( "fftRepo.getProgramCode failed." ) );
+
+    kernelFile.get( ) << kernel << std::endl;
+
+    return	CLFFT_SUCCESS;
+}
+
+
+// **************** TODO TODO TODO ***********************
+// Making compileKernels function take in command queue parameter so we can build for 1 particular device only;
+// this may not be desirable for persistent plans, where we may have to compile for all devices in the context;
+// make changes appropriately before enabling persistent plans and then remove this comment
+
+//	Compile the kernels that this plan uses, and store into the plan
+clfftStatus FFTAction::compileKernels( const cl_command_queue commQueueFFT, const clfftPlanHandle plHandle, FFTPlan* fftPlan )
+{
+    cl_int status = 0;
+    size_t deviceListSize = 0;
+
+    FFTRepo& fftRepo	= FFTRepo::getInstance( );
+
+    // create a cl program executable for the device associated with command queue
+    // Get the device
+    cl_device_id &q_device = fftPlan->bakeDevice;
+
+    cl_program program;
+    if( fftRepo.getclProgram( this->getGenerator(), this->getSignatureData(), program, q_device, fftPlan->context ) == CLFFT_INVALID_PROGRAM )
+    {
+        FFTBinaryLookup lookup (this->getGenerator(), plHandle, fftPlan->context, q_device);
+
+        lookup.variantRaw(this->getSignatureData(), this->getSignatureData()->datasize);
+
+        if (lookup.found())
+        {
+#if FFT_CACHE_DEBUG
+            // debug message in debug mode to ensure that the cache is used
+            fprintf(stderr, "Kernel loaded from cache\n");
+#endif
+
+            program = lookup.getProgram();
+        }
+        else
+        {
+#if FFT_CACHE_DEBUG
+            fprintf(stderr, "Kernel built from source\n");
+#endif
+
+            //	If the user wishes us to write the kernels out to disk, we do so
+            if( fftRepo.setupData.debugFlags & CLFFT_DUMP_PROGRAMS )
+            {
+				OPENCL_V( writeKernel( plHandle, this->getGenerator(), this->getSignatureData(), fftPlan->context, fftPlan->bakeDevice ), _T( "writeKernel failed." ) );
+            }
+
+            std::string programCode;
+            OPENCL_V( fftRepo.getProgramCode( this->getGenerator(), this->getSignatureData(), programCode, q_device, fftPlan->context  ), _T( "fftRepo.getProgramCode failed." ) );
+
+            const char* source = programCode.c_str();
+            program = clCreateProgramWithSource( fftPlan->context, 1, &source, NULL, &status );
+            OPENCL_V( status, _T( "clCreateProgramWithSource failed." ) );
+
+            // create a cl program executable for the device associated with command queue
+
+#if defined(DEBUGGING)
+            status = clBuildProgram( program, 1, &q_device, "-g -cl-opt-disable", NULL, NULL); // good for debugging kernels
+
+// if you have trouble creating smbols that GDB can pick up to set a breakpoint after kernels are loaded into memory
+// this can be used to stop execution to allow you to set a breakpoint in a kernel after kernel symbols are in memory.
+#ifdef DEBUG_BREAK_GDB
+            __debugbreak();
+#endif
+#else
+            status = clBuildProgram( program, 1, &q_device, NULL, NULL, NULL);
+#endif
+            if( status != CL_SUCCESS )
+            {
+                if( status == CL_BUILD_PROGRAM_FAILURE )
+                {
+                    size_t buildLogSize = 0;
+                    OPENCL_V( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
+                              _T( "clGetProgramBuildInfo failed" ) );
+
+                    std::vector< char > buildLog( buildLogSize );
+                    ::memset( &buildLog[ 0 ], 0x0, buildLogSize );
+
+                    OPENCL_V( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
+                              _T( "clGetProgramBuildInfo failed" ) );
+
+                    std::cerr << "\n\t\t\tBUILD LOG\n";
+                    std::cerr << "************************************************\n";
+                    std::cerr << &buildLog[ 0 ] << std::endl;
+                    std::cerr << "************************************************\n";
+                }
+
+                OPENCL_V( status, _T( "clBuildProgram failed" ) );
+            }
+
+            lookup.setProgram(program, source);
+            lookup.populateCache();
+        }
+
+        fftRepo.setclProgram( this->getGenerator(), this->getSignatureData(), program, q_device, fftPlan->context );
+
+
+        // For real transforms we compile either forward or backward kernel
+        bool buildFwdKernel = buildForwardKernel();
+        bool buildBwdKernel = buildBackwardKernel();
+
+        // get a kernel object handle for a kernel with the given name
+        cl_kernel kernel;
+        if( buildFwdKernel )
+        {
+            if( fftRepo.getclKernel( program, CLFFT_FORWARD, kernel ) == CLFFT_INVALID_KERNEL )
+            {
+                std::string entryPoint;
+                OPENCL_V( fftRepo.getProgramEntryPoint( this->getGenerator(), this->getSignatureData(), CLFFT_FORWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
+
+                kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
+                OPENCL_V( status, _T( "clCreateKernel failed" ) );
+
+                fftRepo.setclKernel( program, CLFFT_FORWARD, kernel );
+            }
+        }
+
+        if( buildBwdKernel )
+        {
+            if( fftRepo.getclKernel( program, CLFFT_BACKWARD, kernel ) == CLFFT_INVALID_KERNEL )
+            {
+                std::string entryPoint;
+                OPENCL_V( fftRepo.getProgramEntryPoint( this->getGenerator(), this->getSignatureData(), CLFFT_BACKWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
+
+                kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
+                OPENCL_V( status, _T( "clCreateKernel failed" ) );
+
+                fftRepo.setclKernel( program, CLFFT_BACKWARD, kernel );
+            }
+        }
+    }
+
+    return	CLFFT_SUCCESS;
+}
+
+
diff --git a/src/library/action.h b/src/library/action.h
new file mode 100644
index 0000000..83b3142
--- /dev/null
+++ b/src/library/action.h
@@ -0,0 +1,257 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( AMD_CLFFT_action_H )
+#define AMD_CLFFT_action_H
+
+#include "plan.h"
+
+
+//
+// FFTCopyAction
+//
+// Base class for every Copy action for the FFT.
+// Currently do nothing special. The kernel generation and compilation occurs
+// by the subclass FFTGeneratedCopyAction
+// 
+class FFTCopyAction : public FFTAction
+{
+public:
+    FFTCopyAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+    clfftGenerators getGenerator() { return Copy; }
+};
+
+
+//
+// FFTStockhamAction
+//
+// Base class for every Stockham action for the FFT.
+// Currently do nothing special. The kernel generation and compilation occurs
+// by the subclasses FFTGeneratedStockhamAction or FFTStaticStockhamAction
+// 
+class FFTStockhamAction : public FFTAction
+{
+public:
+    FFTStockhamAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+    clfftGenerators getGenerator() { return Stockham; }
+};
+
+
+//
+// FFTTransposeVLIWAction
+//
+// Base class for every TransposeVLIW action for the FFT.
+// Currently do nothing special. The kernel generation and compilation occurs
+// by the subclass FFTGeneratedTransposeVLIWAction
+// 
+class FFTTransposeVLIWAction : public FFTAction
+{
+public:
+    FFTTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+    clfftGenerators getGenerator() { return Transpose_VLIW; }
+};
+
+
+//
+// FFTTransposeGCNAction
+//
+// Base class for every TransposeGCN action for the FFT.
+// Currently do nothing special. The kernel generation and compilation occurs
+// by the subclass FFTGeneratedTransposeGCNAction
+// 
+class FFTTransposeGCNAction : public FFTAction
+{
+public:
+    FFTTransposeGCNAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+    clfftGenerators getGenerator() { return Transpose_GCN; }
+};
+
+//
+// FFTGeneratedCopyAction
+//
+// Implements a Copy action for the FFT
+// Its signature is represented by FFTKernelGenKeyParams structure
+// 
+// This class implements:
+//  - the generation of the kernel string
+//  - the build of the kernel
+//
+// The structure FFTKernelGenKeyParams is used to characterize and generate
+// the appropriate copy kernel. That structure is used for the signature of
+// this action. It is common to Stockham, copy and transpose methods. For
+// convenience, this structure is used for every FFTGenerated*Action class,
+// but in practice the copy action only use a few information of that
+// structure, so a proper structure should be used instead.
+//
+class FFTGeneratedCopyAction : public FFTCopyAction
+{
+public:
+    FFTGeneratedCopyAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+    typedef FFTKernelSignature<FFTKernelGenKeyParams, FFT_DEFAULT_COPY_ACTION> Signature;
+
+private:
+    Signature signature;
+
+    clfftStatus generateKernel  (FFTRepo& fftRepo, const cl_command_queue commQueueFFT );
+    clfftStatus getWorkSizes    (std::vector<size_t> & globalws, std::vector<size_t> & localws);
+    clfftStatus initParams      ();
+
+    bool buildForwardKernel();
+    bool buildBackwardKernel();
+
+public:
+
+    virtual const Signature * getSignatureData()
+    {
+        return &this->signature;
+    }
+};
+
+
+//
+// FFTGeneratedStockhamAction
+//
+// Represents a Stockham action for the FFT. This class implements the former
+// mechanism of kernel generation and compilation for Stockham method.
+// 
+// This class implements:
+//  - the generation of the kernel string
+//  - the build of the kernel
+//
+// The structure FFTKernelGenKeyParams is used to characterize and generate
+// the appropriate kernel. That structure is used for the signature of this
+// action. For convenience, this structure is used for every
+// FFTGenerated*Action class, but a "Stockham-specific" version of that
+// structure should be used instead.
+//
+class FFTGeneratedStockhamAction : public FFTStockhamAction
+{
+public:
+    FFTGeneratedStockhamAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+    
+    typedef FFTKernelSignature<FFTKernelGenKeyParams, FFT_DEFAULT_STOCKHAM_ACTION> Signature;
+
+private:
+    Signature signature;
+
+    clfftStatus generateKernel  (FFTRepo& fftRepo, const cl_command_queue commQueueFFT );
+    clfftStatus getWorkSizes    (std::vector<size_t> & globalws, std::vector<size_t> & localws);
+    clfftStatus initParams      ();
+
+    bool buildForwardKernel();
+    bool buildBackwardKernel();
+
+public:
+
+    virtual const Signature * getSignatureData()
+    {
+        return &this->signature;
+    }
+};
+
+//
+// FFTGeneratedTransposeVLIWAction
+//
+// Implements a TransposeVLIW action for the FFT
+// Its signature is represented by FFTKernelGenKeyParams structure
+// 
+// This class implements:
+//  - the generation of the kernel string
+//  - the build of the kernel
+// 
+// The structure FFTKernelGenKeyParams is used to characterize and generate
+// the appropriate transpose kernel. That structure is used for the signature of
+// this action. It is common to Stockham, copy and transpose methods. For
+// convenience, this structure is used for every FFTGenerated*Action class,
+// but in practice the transpose action only use a few information of that
+// structure, so a proper structure should be used instead.
+//
+class FFTGeneratedTransposeVLIWAction : public FFTTransposeVLIWAction
+{
+public:
+    FFTGeneratedTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+    typedef FFTKernelSignature<FFTKernelGenKeyParams, FFT_DEFAULT_TRANSPOSE_ACTION> Signature;
+
+private:
+    Signature signature;
+
+    clfftStatus generateKernel  (FFTRepo& fftRepo, const cl_command_queue commQueueFFT );
+    clfftStatus getWorkSizes    (std::vector<size_t> & globalws, std::vector<size_t> & localws);
+    clfftStatus initParams      ();
+
+    bool buildForwardKernel();
+    bool buildBackwardKernel();
+
+public:
+
+    virtual const Signature * getSignatureData()
+    {
+        return &this->signature;
+    }
+};
+
+
+
+// FFTGeneratedTransposeGCNAction
+//
+// Implements a TransposeGCN action for the FFT
+// Its signature is represented by FFTKernelGenKeyParams structure
+// 
+// This class implements:
+//  - the generation of the kernel string
+//  - the build of the kernel
+// 
+// The structure FFTKernelGenKeyParams is used to characterize and generate
+// the appropriate transpose kernel. That structure is used for the signature of
+// this action. It is common to Stockham, copy and transpose methods. For
+// convenience, this structure is used for every FFTGenerated*Action class,
+// but in practice the transpose action only use a few information of that
+// structure, so a proper structure should be used instead.
+//
+class FFTGeneratedTransposeGCNAction : public FFTTransposeGCNAction
+{
+public:
+    FFTGeneratedTransposeGCNAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+    typedef FFTKernelSignature<FFTKernelGenKeyParams, FFT_DEFAULT_TRANSPOSE_ACTION> Signature;
+
+private:
+    Signature signature;
+
+    clfftStatus generateKernel  (FFTRepo& fftRepo, const cl_command_queue commQueueFFT );
+    clfftStatus getWorkSizes    (std::vector<size_t> & globalws, std::vector<size_t> & localws);
+    clfftStatus initParams      ();
+
+    bool buildForwardKernel();
+    bool buildBackwardKernel();
+
+public:
+
+    virtual const Signature * getSignatureData()
+    {
+        return &this->signature;
+    }
+};
+
+#endif // AMD_CLFFT_action_H
diff --git a/src/library/fft_binary_lookup.cpp b/src/library/fft_binary_lookup.cpp
new file mode 100644
index 0000000..96f2872
--- /dev/null
+++ b/src/library/fft_binary_lookup.cpp
@@ -0,0 +1,688 @@
+/* ************************************************************************
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include "fft_binary_lookup.h"
+
+#include <iostream>
+#include <fstream>
+#include <cassert>
+
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#ifdef _WIN32
+#include <windows.h>
+#include <direct.h> // for _mkdir
+#else
+#include <unistd.h>
+#endif
+
+extern "C"
+{
+#include "md5sum.h"
+}
+
+// size for clGetDeviceInfo queries
+#define SIZE 256
+
+#define ENABLE_SOURCE_DUMP 0
+
+
+#define CAPS_DEBUG 0
+
+#include <string.h>
+
+char * sep()
+{
+#ifdef __WIN32
+    return (char*)"\\";
+#else
+    return (char*)"/";
+#endif
+}
+
+static std::string cache_path;
+static bool cache_enabled(false);
+
+void clfftInitBinaryCache()
+{
+    const char * path = getenv("CLFFT_CACHE_PATH");
+    if (path)
+    {
+        cache_path = std::string(path) + sep();
+        cache_enabled = true;
+    }
+    else
+    {
+        cache_path = "";
+    }
+}
+
+FFTBinaryLookup::CacheEntry::CacheEntry(const std::string & filename)
+    : m_filename(filename), m_successful_creation(false)
+{
+
+}
+
+void FFTBinaryLookup::CacheEntry::close()
+{
+#ifdef _WIN32
+    CloseHandle(this->m_handle);
+#else
+    ::close(*(int*)this->m_handle);
+    //delete (int*)this->m_handle;
+#endif
+}
+
+bool FFTBinaryLookup::CacheEntry::successful_creation()
+{
+    return this->m_successful_creation;
+}
+
+bool FFTBinaryLookup::CacheEntry::exclusive_create()
+{
+#ifdef _WIN32
+    std::wstring tmp;
+    tmp.assign(this->m_filename.begin(), this->m_filename.end());
+
+    HANDLE handle = CreateFile(tmp.c_str(), 
+                               GENERIC_WRITE, 
+                               0, // no share with other process
+                               NULL,
+                               CREATE_NEW,
+                               FILE_ATTRIBUTE_NORMAL,
+                               NULL);
+
+    this->m_handle = handle;
+    this->m_successful_creation = (handle != INVALID_HANDLE_VALUE);
+    return this->m_successful_creation;
+#else
+    int * fd = new int;
+    *fd = open (this->m_filename.c_str(),
+                O_CREAT | O_EXCL, 
+                S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+    this->m_handle = fd;
+    this->m_successful_creation = (*fd != -1);
+    return *fd >= 0;
+#endif
+}
+
+FFTBinaryLookup::FFTBinaryLookup(const clfftGenerators gen, const clfftPlanHandle plHandle, cl_context ctxt, cl_device_id device)
+    : m_context(ctxt), m_device(device), m_program(NULL), m_binary(0), m_signature(0), m_cache_enabled(cache_enabled)
+{
+    // initialize the entry name
+    this->m_cache_entry_name = getKernelName(gen, plHandle, false);
+
+    if (this->m_cache_enabled)
+    {
+        // retrieve device informations to compute the path of the cache
+        cl_int err = this->retrieveDeviceAndDriverInfo();
+
+        if (err != CL_SUCCESS)
+        {
+            cache_enabled = false;
+            this->m_cache_enabled = false;
+        }
+    }
+}
+
+FFTBinaryLookup::~FFTBinaryLookup()
+{
+    delete [] this->m_binary;
+    delete [] this->m_signature;
+}
+
+FFTBinaryLookup::Variant::Variant()
+    : m_kind((VariantKind)0), m_size(0), m_data(0)
+{
+}
+
+FFTBinaryLookup::Variant::Variant(VariantKind kind, char * data, size_t size)
+    : m_kind(kind), m_size(size)
+{
+    this->m_data = new char[this->m_size];
+    memcpy(this->m_data, data, size);
+}
+
+FFTBinaryLookup::Variant::~Variant()
+{
+    // delete this->m_data;
+}
+
+void FFTBinaryLookup::variantInt(int num)
+{
+    m_variants.push_back(Variant(INT, (char*)&num, sizeof(num)));
+}
+ 
+void FFTBinaryLookup::variantDouble(double num)
+{
+    m_variants.push_back(Variant(DOUBLE, (char*)&num, sizeof(num)));
+}
+
+void FFTBinaryLookup::variantCompileOptions(const std::string & opts)
+{
+    m_variants.push_back(Variant(STRING, (char*)opts.c_str(), opts.size()));
+}
+
+void FFTBinaryLookup::variantRaw(const void * data, size_t bytes)
+{
+    m_variants.push_back(Variant(DATA, (char*)data, bytes));
+}
+
+enum BinaryRepresentation
+{
+    LSB,
+    MSB,
+    UNKNOWN
+};
+
+enum BinaryRepresentation getStorageMode(char * data)
+{
+    if (data[0] == 'C' && 
+        data[1] == 'L' && 
+        data[2] == 'B' &&
+        data[3] == '\0')
+        return LSB;
+
+    if (data[0] == 'B' && 
+        data[1] == 'L' && 
+        data[2] == 'C' &&
+        data[3] == '\0')
+        return MSB;
+
+    return UNKNOWN;
+}
+
+void FFTBinaryLookup::finalizeVariant()
+{
+    // serialize variants
+    size_t whole_variant_size_in_bytes = 0;
+
+    // store 1 byte for the variant kind
+    whole_variant_size_in_bytes += this->m_variants.size() * sizeof(int); // for the variant kind
+    whole_variant_size_in_bytes += this->m_variants.size() * sizeof(size_t); // for the variant size
+
+    // add every variant sizes
+    for(size_t i=0 ; i<this->m_variants.size() ; ++i)
+    {
+        const Variant & v = this->m_variants[i];
+
+        // compute the whole size of the signature
+        whole_variant_size_in_bytes += v.m_size;
+    }
+
+    this->m_header.signature_size = whole_variant_size_in_bytes;
+
+    this->m_signature = new char[whole_variant_size_in_bytes];
+    char * current_address = this->m_signature;
+    for(size_t i=0 ; i<this->m_variants.size() ; ++i)
+    {
+        Variant v = this->m_variants[i];
+
+        // write the variant kind
+        memcpy(current_address, &v.m_kind, sizeof(int));
+        current_address += sizeof(v.m_kind);
+
+        // write the variant size
+        memcpy(current_address, &v.m_size, sizeof(v.m_size));
+        current_address += sizeof(v.m_size);
+
+        // write the variant itself
+        memcpy(current_address, v.m_data, v.m_size);
+        current_address += v.m_size;
+    }
+
+    // Update the cache entry name if there are variants...
+    if (whole_variant_size_in_bytes != 0)
+    {
+        char md5_sum[33];
+        md5sum(this->m_signature, this->m_header.signature_size, md5_sum);
+        this->m_cache_entry_name = md5_sum;
+    }
+    else
+    {
+        this->m_cache_entry_name += ".db";
+    }
+}
+
+bool FFTBinaryLookup::loadHeader(std::ifstream &file, size_t length)
+{
+    file.read ((char*)&this->m_header, sizeof(Header));
+
+    // FIXME: Consider LSB Vs MSB number representation
+    assert(getStorageMode(this->m_header.magic_key) == LSB);
+
+    if (this->m_header.whole_file_size != (int)length)
+    {
+        // the file has not been correctly initialized (yet)
+        return false;
+    }
+
+    return true;
+}
+
+bool FFTBinaryLookup::loadBinaryAndSignature(std::ifstream &file)
+{
+    {
+        this->m_binary    = new unsigned char [this->m_header.binary_size];
+        const std::istream& res = file.read((char*)this->m_binary, this->m_header.binary_size);
+        if (!res.good())
+            return false;
+    }
+
+    {
+        this->m_signature = new char [this->m_header.signature_size];
+        const std::istream& res = file.read((char*)this->m_signature, this->m_header.signature_size);
+
+        if (!res.good())
+            return false;
+
+        this->m_variants.clear();
+
+        char * current = this->m_signature;
+        for (int i=0 ; i<this->m_header.signature_size ; ++i)
+        {
+            Variant v;
+            v.m_kind = *(VariantKind*) current;
+            i += sizeof(int);
+            current += sizeof(int);
+
+            v.m_size = *(size_t*) current;
+            i += sizeof(size_t);
+            current += sizeof(size_t);
+
+            v.m_data = new char[v.m_size];
+            memcpy(v.m_data, current, v.m_size);
+            i += v.m_size;
+            current += v.m_size;
+
+            this->m_variants.push_back(v);
+        }
+    }
+
+    return true;
+}
+
+bool FFTBinaryLookup::tryLoadCacheFile()
+{
+    // may create empty file or may wait until file is ready
+    const std::string & filename = this->m_path + this->m_cache_entry_name;
+    std::ifstream file (filename.c_str(), std::ios_base::binary);
+
+    if (file.is_open())
+    {
+        file.seekg (0, file.end);
+        size_t length = file.tellg();
+        file.seekg (0, file.beg);
+
+        if (length == 0)
+        {
+            // the file is corrupted, so return false
+            return false;
+        }
+
+        bool st;
+        st = loadHeader(file, length);
+
+        if (! st)
+            return false;
+
+        st = loadBinaryAndSignature(file);
+
+        if (! st)
+            return false;
+
+        file.close();
+        return true;
+    }
+    else
+    {
+        return false;
+    }
+}
+
+bool FFTBinaryLookup::found()
+{
+    // if we could not create the directory, it is useless to 
+    if (! this->m_cache_enabled)
+    {
+        return false; // not found
+    }
+
+    this->finalizeVariant(); // serialize variant and cumpute checksum on it
+    // also compute the tree to search from the cache entry (this->m_cache_entry_name, cache path ??)
+
+    if (tryLoadCacheFile())
+    {
+        cl_int err = buildFromBinary(this->m_binary,
+                                     this->m_header.binary_size);
+
+        // return false if the buildFromBinary failed, true else
+        return err==CL_SUCCESS;
+    }
+
+    return false;
+}
+
+static cl_int getSingleBinaryFromProgram(cl_program program,
+                                         std::vector<unsigned char*> & binary)
+{
+    // 3 - Determine the size of each program binary
+    size_t size;
+    cl_int err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
+                                  sizeof(size_t),
+                                  &size, NULL);
+    if (err != CL_SUCCESS)
+    {
+        std::cerr << "Error querying for program binary sizes" << std::endl;
+        return err;
+    }
+
+    binary.resize(size);
+    binary[0] = new unsigned char[size];
+
+    unsigned char * binary_address[1] = { binary[0] };
+
+    // 4 - Get all of the program binaries
+    err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, 1 * sizeof(unsigned char*),
+                           binary_address, NULL);
+
+
+    if (err != CL_SUCCESS)
+    {
+#if CAPS_DEBUG
+        std::cerr << "Error querying for program binaries" << std::endl;
+#endif
+        return err;
+    }
+
+    return CL_SUCCESS;
+}
+
+cl_int FFTBinaryLookup::writeCacheFile(std::vector<unsigned char*> &data)
+{
+    if (! this->m_cache_enabled)
+    {
+        return 0;
+    }
+
+    // exclusive open to ensure that only one thread will write the file
+    const std::string & filename = this->m_path + this->m_cache_entry_name;
+
+    CacheEntry cache_file(filename);
+    bool created = cache_file.exclusive_create();
+
+    // try to exclusively create the cache file on the disk
+    if (created)
+    {
+        // if it was created by the current thread, this one will write into cache file
+        cache_file.close();
+
+        const std::string & filename = this->m_path + this->m_cache_entry_name;
+        std::ofstream file (filename.c_str(), std::ios_base::binary);
+
+        file.write((char*)&this->m_header, sizeof(m_header));
+        file.write((char*)data[0], this->m_header.binary_size);
+        file.write((char*)this->m_signature, this->m_header.signature_size);
+        file.close();
+
+#if ENABLE_SOURCE_DUMP
+        const std::string & srcFilename = this->m_path + this->m_cache_entry_name + ".cl";
+        std::ofstream srcFile (srcFilename.c_str());
+        srcFile << this->m_source;
+
+        srcFile.close();
+#endif
+
+        return CL_SUCCESS;
+    }
+
+    // other thread do not write the cache file
+    return 1;
+}
+
+cl_int FFTBinaryLookup::populateCache()
+{
+    // FIXME: support MSB
+    this->m_header.magic_key[0] = 'C';
+    this->m_header.magic_key[1] = 'L';
+    this->m_header.magic_key[2] = 'B';
+    this->m_header.magic_key[3] = '\0';
+
+    std::vector<unsigned char*> data;
+    cl_int err = getSingleBinaryFromProgram(this->m_program, data);
+
+    if (err != CL_SUCCESS)
+    {
+        return err;
+    }
+
+    this->m_header.header_size = sizeof(Header);
+    this->m_header.binary_size = data.size();
+    this->m_header.whole_file_size = this->m_header.header_size + this->m_header.binary_size + this->m_header.signature_size;
+
+    writeCacheFile(data); // ignore return code, because it does nothing if
+                          // the file could not be written (i.e the current
+                          // thread did not create the file
+
+    return CL_SUCCESS;
+}
+
+cl_int FFTBinaryLookup::buildFromSource(const char * source)
+{
+    cl_int err;
+    this->m_program = FFTBinaryLookup::buildProgramFromSource(source,
+                                                           this->m_context,
+                                                           this->m_device,
+                                                           err);
+
+    if (err != CL_SUCCESS)
+    {
+        return err;
+    }
+
+    // write to the cache
+    this->populateCache();
+
+    return CL_SUCCESS;
+}
+
+cl_int FFTBinaryLookup::buildFromLoadedBinary(const void * data,
+                                            size_t len)
+{
+    cl_int err;
+    this->m_program = FFTBinaryLookup::buildProgramFromBinary((char*) data,
+                                                           len,
+                                                           this->m_context,
+                                                           this->m_device,
+                                                           err);
+
+    return err;
+}
+
+cl_int FFTBinaryLookup::buildFromBinary(const void * data,
+                                      size_t len)
+{
+    cl_int err = buildFromLoadedBinary(data, len);
+    if (err != CL_SUCCESS)
+        return err;
+
+    // write to the cache
+    this->populateCache();
+
+    return CL_SUCCESS;
+}
+
+cl_program FFTBinaryLookup::buildProgramFromSource(const char * source,
+                                                cl_context context,
+                                                cl_device_id device,
+                                                cl_int & err,
+                                                const char * options)
+{
+    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &err);
+
+    if (err != CL_SUCCESS)
+        return NULL;
+
+    err = clBuildProgram(program,
+                         1, /* FIXME: 1 device */
+                         &device,
+                         options, 
+                         NULL, 
+                         NULL);
+
+    if (err != CL_SUCCESS)
+        return NULL;
+
+    return program;
+}
+
+
+
+cl_program FFTBinaryLookup::buildProgramFromBinary(const char * data,
+                                                size_t data_size,
+                                                cl_context context,
+                                                cl_device_id device,
+                                                cl_int & err,
+                                                const char * options)
+{
+    cl_program program = clCreateProgramWithBinary(context,
+                                                   1, // num_device
+                                                   &device, // device_list
+                                                   &data_size, // lengths
+                                                   (const unsigned char **)&data,
+                                                   NULL,
+                                                   &err);
+    if (err != CL_SUCCESS)
+    {
+        // FIXME: emit an internal message for OPENCL errors
+        return NULL;
+    }
+
+    err = clBuildProgram(program,
+                         1, /* FIXME: 1 device */
+                         &device,
+                         options,
+                         NULL,
+                         NULL);
+
+    if (err != CL_SUCCESS)
+    {
+        return NULL;
+    }
+
+    return program;
+}
+
+cl_program FFTBinaryLookup::getProgram()
+{
+    return this->m_program;
+}
+
+void FFTBinaryLookup::setProgram(cl_program program, const char * source)
+{
+    this->m_program = program;
+    this->m_source = source;
+}
+
+
+static int make_directory(const std::string &path)
+{
+#ifdef _WIN32
+    return _mkdir (path.c_str());
+#else
+    return mkdir (path.c_str(), S_IRWXU);
+#endif
+}
+
+static void do_mkdir(const std::string &path)
+{
+    int st = make_directory (path.c_str());
+
+    if (st != 0)
+    {
+        if ( errno != EEXIST )
+        {
+            std::string tmp = "Cannot not create directory '" + std::string(path) + "': ";
+            throw tmp;
+        }
+    }
+}
+
+cl_int FFTBinaryLookup::retrieveDeviceAndDriverInfo()
+{
+    char m_device_vendor[SIZE];
+    char m_device_name[SIZE];
+    char m_driver_version[SIZE];
+
+    cl_int err = clGetDeviceInfo(this->m_device, CL_DEVICE_VENDOR, sizeof(m_device_vendor),
+                                 &m_device_vendor, NULL);
+    if (err != CL_SUCCESS)
+    {
+        return err;
+    }
+
+    err = clGetDeviceInfo(this->m_device, CL_DEVICE_NAME, sizeof(m_device_name),
+                          &m_device_name, NULL);
+    if (err != CL_SUCCESS)
+    {
+        return err;
+    }
+
+    err = clGetDeviceInfo(this->m_device, CL_DRIVER_VERSION, sizeof(m_driver_version),
+                          &m_driver_version, NULL);
+    if (err != CL_SUCCESS)
+    {
+        return err;
+    }
+
+#if CAPS_DEBUG
+    fprintf(stderr, "device vendor = %s\n", this->m_device_vendor);
+    fprintf(stderr, "device name = %s\n", this->m_device_name);
+    fprintf(stderr, "driver version = %s\n", this->m_driver_version);
+#endif
+
+    try
+    {
+        const std::string & root = (std::string(cache_path) + m_device_vendor + sep());
+        do_mkdir(root.c_str());
+
+        const std::string & root2 = (root + m_device_name + sep());
+        do_mkdir(root2.c_str());
+
+        const std::string & root3 = (root2 + m_driver_version + sep());
+        do_mkdir(root3.c_str());
+
+        const std::string & root4 = (root3 + this->m_cache_entry_name + sep());
+        do_mkdir(root4.c_str());
+
+        this->m_path = root4;
+        
+        return CL_SUCCESS;
+    }
+    catch (std::string & e)
+    {
+        fprintf(stderr, "%s\n", e.c_str());
+        cache_enabled = false;
+        this->m_cache_enabled = false;
+
+        return CL_INVALID_VALUE;
+    }
+}
diff --git a/src/library/fft_binary_lookup.h b/src/library/fft_binary_lookup.h
new file mode 100644
index 0000000..2819751
--- /dev/null
+++ b/src/library/fft_binary_lookup.h
@@ -0,0 +1,276 @@
+/* ************************************************************************
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef __CLFFT_CLFFT_BINARY_LOOKUP__
+#define __CLFFT_CLFFT_BINARY_LOOKUP__
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <string>
+#include <vector>
+
+#include "generator.h"
+#include "plan.h"
+
+//
+// FFTBinaryLookup defines an API to manage the kernel cache on the disk
+//
+// The FFTBinaryLookup object provides methods to:
+//  * check if a cache file exists on the disk or not
+//  * fill-up the signature to characterize the program beeing built on the disk
+//  * build a cl_program from a string kernel or from a binary
+// 
+// A cache entry is a file stored on the disk which contains 3 sections:
+//  * A header section (providing information about file structure)
+//  * The binary contained in the cl_program
+//  * A signature which provides additionnal informations about the kernel
+//    and allows to characterize the kernel in the disk cache
+//
+// The environment variable CLFFT_CACHE_PATH defines the location of the
+// cache on the disk. If the variable CLFFT_CACHE_PATH is not defined, no
+// cache file is written on the disk, but the cl_program can be built and
+// remains on memory
+//
+// Concerning multithreading, the policy is that every thread build the
+// cl_program from the source, but only the first one writes it on the
+// disk. Other threads continue with the cl_program in memory.
+//
+// A typical cache query shall be composed of the following steps:
+//
+//  (1) Create a local instance of FFTBinaryLookup 
+//
+//  (2) Specify the additional characteristics (i.e. variants) of the
+//      requested program. Those information combined with the program
+//      name and the OpenCL context and device shall form a unique
+//      signature for the binary program.
+// 
+//  (3) Perform the effective search by calling the 'found' method
+// 
+//  (4) if the search was successfull then cl_program can be retreived 
+//      by a call to the 'getProgram' method
+// 
+//  (5) if the search was not successfull then a cl_program 
+//      must be created  and populated in the cache by a call 
+//      to the 'setProgram' method.
+// 
+//  (6) Destroy the FFTBinaryLookup local instance.
+// 
+// For instance, that could be 
+//
+//     cl_program program  ;
+//   
+//     The program name is part of the signature and shall be unique 
+//     const char * program_name = "... my unique program name ... " ;
+//   
+//     FFTBinaryLookup bl(context, device, program_name);
+//   
+//     //  Specify additionnal information used to build a
+//     //  signature signature for that cache entry 
+//                  
+//     bl.variantInt( vectorSize );
+//     bl.variantInt( hasBorder );
+//     ... 
+//   
+//     // Perform the query 
+//     if ( bl.found() ) 
+//     {
+//        //   Success! use the cl_program retreived from the cache
+//        program = bl.getProgram();
+//     }
+//     else 
+//     {
+//        //   Failure! we need to build the program ourself
+//        program = build_the_program(context,device,vectorSize,...) ; 
+//        //   Inform the lookup object of the program
+//        bl.setProgram(program);  
+//        //   And populate the cache
+//        bl.populateCache() 
+//     }
+// 
+// Remark: The members buildFromSource, buildFromBinary etc are utility 
+//         functions that can be used to build the cl_program from either 
+//         sources or binary (e.g. SPIR). Their use is optionnal. 
+//
+//
+class FFTBinaryLookup
+ {
+public:
+    // Constructor
+    // \param ctxt the context for which the cl_program should be built
+    // \param device the device for which the cl_program should be built
+    // \param kernel_name the kernel identifier
+    FFTBinaryLookup(const clfftGenerators gen, const clfftPlanHandle plHandle, cl_context ctxt, cl_device_id device);
+    ~FFTBinaryLookup();
+
+    // Methods to fill up the signature of the cache entry
+    void variantInt(int num);
+    void variantDouble(double num);
+    void variantCompileOptions(const std::string & opts);
+    void variantRaw(const void * data, size_t bytes);
+
+    // Indicates whether or not the cache entry was found on the disk
+    // If the cache entry was found and is complete on the disk, its content
+    // is loaded
+    // \return true if a cache entry was found, false else
+    bool found();
+
+    // Build a cl_program from the source code and init attributes
+    // of the current structure
+    // so that the program can be accessed with the getProgram method
+    // Write the file to the cache
+    cl_int buildFromSource(const char * source);
+
+    // Build a cl_program from the source code and init attributes
+    // so that the program can be accessed with the getProgram method
+    // Write the file to the cache
+    cl_int buildFromBinary(const void * data, 
+                           size_t len);
+
+    // Returns the cl_program built from binary or loaded from disk
+    cl_program getProgram();
+
+    // Set the current m_program to the given program
+    void setProgram(cl_program program, const char * source);
+
+    // Build a cl_program from a text
+    static cl_program buildProgramFromSource(const char * filename,
+                                             cl_context context,
+                                             cl_device_id device,
+                                             cl_int & err,
+                                             const char * options = 0);
+
+    // Build a cl_program from binary
+    static cl_program buildProgramFromBinary(const char * data,
+                                             size_t data_size,
+                                             cl_context context,
+                                             cl_device_id device,
+                                             cl_int & err,
+                                             const char * options = 0);
+
+    // Initialize the whole cache file information (magic_key, header and program)
+    // and dump on the disk
+    cl_int populateCache();
+
+private:
+
+    // Serialize variants and compute the checksum to load the file from cache
+    void finalizeVariant();
+
+    // Build a cl_program from the source code and init attributes
+    // so that the program can be accessed with the getProgram method
+    // Do not write the file to the cache
+    cl_int buildFromLoadedBinary(const void * data, 
+                                 size_t len);
+
+    // Try to retrieve the header of the cache file
+    // Returns: ok if the header sections was successfully loaded, false else
+    bool loadHeader(std::ifstream &file, size_t length);
+
+    // Try to retrieve the cl_program and its signature in file
+    // Returns: ok if the binary and signature sections were successfully loaded, false else
+    bool loadBinaryAndSignature(std::ifstream &file);
+
+    // Try to create a file associated to the current program/variant in the cache folder
+    // Returns true if the file was successfully opened and loaded, false else
+    bool tryLoadCacheFile();
+
+    // Dump the file on the disk with the name stored in this->m_cache_entry_name
+    cl_int writeCacheFile(std::vector<unsigned char*> &data);
+
+    // Retrieve device name, device vendor and driver number by calling
+    // clGetDeviceInfo
+    cl_int retrieveDeviceAndDriverInfo();
+
+    // Cache entry name 
+    std::string m_cache_entry_name;
+
+    // Path for the cache entry name
+    std::string m_path;
+
+    // Header structure of a cache entry
+    typedef struct Header_
+    {
+        char magic_key[4]; // = |C|L|F|\0, useful to know that we are loading a clfft cache entry
+        int whole_file_size; // the whole file of the size to know if the current file is complete or not
+        int header_size; // = sizeof(Header)
+        int binary_size; // kernel binary size
+        int signature_size; // variant information
+    } Header;
+
+    Header m_header;
+
+    cl_context   m_context;
+    cl_device_id m_device;
+
+    cl_program   m_program;
+
+    std::string  m_source;
+
+    unsigned char * m_binary;
+    char *          m_signature;
+
+    enum VariantKind
+    {
+        INT,
+        DOUBLE,
+        STRING,
+        DATA
+    };
+
+    struct Variant
+    {
+        Variant();
+        Variant(VariantKind kind, char * data, size_t size);
+
+        ~Variant();
+
+        VariantKind m_kind;
+        size_t      m_size;
+        char *      m_data;
+        
+        static char * serialize(VariantKind kind, char * data, size_t size);
+        static Variant unserialize(char * data);
+    };
+
+    // Cache entry, useful to abstract Windows and linux 
+    // cache entry file descriptor
+    struct CacheEntry
+    {
+        CacheEntry(const std::string & filename);
+        bool exclusive_create();
+        void close();
+        bool successful_creation();
+
+    private:
+        std::string m_filename;
+        bool        m_successful_creation;
+        void *      m_handle;
+    };
+
+    // Variants
+    std::vector<Variant> m_variants;
+
+    // Indicates whether the cache should be used or not
+    bool m_cache_enabled;
+};
+
+#undef SIZE
+
+#endif
diff --git a/src/library/generator.copy.cpp b/src/library/generator.copy.cpp
index 1221882..3315e0d 100644
--- a/src/library/generator.copy.cpp
+++ b/src/library/generator.copy.cpp
@@ -19,6 +19,71 @@
 #include <math.h>
 #include <list>
 #include "generator.stockham.h"
+#include "action.h"
+
+FFTGeneratedCopyAction::FFTGeneratedCopyAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+    : FFTCopyAction(plHandle, plan, queue, err)
+{
+    if (err != CLFFT_SUCCESS)
+    {
+        // FFTCopyAction() failed, exit
+        fprintf(stderr, "FFTCopyAction() failed!\n");
+        return;
+    }
+
+    // Initialize the FFTAction::FFTKernelGenKeyParams member
+    err = this->initParams();
+
+    if (err != CLFFT_SUCCESS)
+    {
+        fprintf(stderr, "FFTGeneratedCopyAction::initParams() failed!\n");
+        return;
+    }
+
+    FFTRepo &fftRepo = FFTRepo::getInstance();
+
+    err = this->generateKernel(fftRepo, queue);
+
+    if (err != CLFFT_SUCCESS)
+    {
+        fprintf(stderr, "FFTGeneratedCopyAction::generateKernel failed\n");
+        return;
+    }
+
+    err = compileKernels( queue, plHandle, plan);
+
+    if (err != CLFFT_SUCCESS)
+    {
+        fprintf(stderr, "FFTGeneratedCopyAction::compileKernels failed\n");
+        return;
+    }
+
+    err = CLFFT_SUCCESS;
+}
+
+bool FFTGeneratedCopyAction::buildForwardKernel()
+{
+    clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+    clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+    bool r2c_transform = (inputLayout == CLFFT_REAL);
+	bool h2c = (inputLayout == CLFFT_HERMITIAN_PLANAR) || (inputLayout == CLFFT_HERMITIAN_INTERLEAVED);
+    bool c2h = (outputLayout == CLFFT_HERMITIAN_PLANAR) || (outputLayout == CLFFT_HERMITIAN_INTERLEAVED);
+
+    return (r2c_transform || c2h) || (!(h2c || c2h));
+}
+
+bool FFTGeneratedCopyAction::buildBackwardKernel()
+{
+    clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+    clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+    bool c2r_transform = (outputLayout == CLFFT_REAL);
+    bool h2c = (inputLayout == CLFFT_HERMITIAN_PLANAR) || (inputLayout == CLFFT_HERMITIAN_INTERLEAVED);
+	bool c2h = (outputLayout == CLFFT_HERMITIAN_PLANAR) || (outputLayout == CLFFT_HERMITIAN_INTERLEAVED);
+
+    return (c2r_transform || h2c) || (!(h2c || c2h));
+}
 
 using namespace StockhamGenerator;
 
@@ -30,7 +95,7 @@ namespace CopyGenerator
     {
         size_t N;
 		size_t Nt;
-		const FFTKernelGenKeyParams params;
+		const FFTGeneratedCopyAction::Signature & params;
 		bool h2c, c2h;
 		bool general;
 
@@ -59,7 +124,7 @@ namespace CopyGenerator
 		}
 
     public:
-        CopyKernel( const FFTKernelGenKeyParams &paramsVal) :
+        CopyKernel( const FFTGeneratedCopyAction::Signature &paramsVal) :
 					params(paramsVal)
 
         {
@@ -338,66 +403,58 @@ namespace CopyGenerator
 };
 
 
-template<>
-clfftStatus FFTPlan::GetKernelGenKeyPvt<Copy> (FFTKernelGenKeyParams & params) const
+clfftStatus FFTGeneratedCopyAction::initParams ()
 {
 
     //    Query the devices in this context for their local memory sizes
     //    How we generate a kernel depends on the *minimum* LDS size for all devices.
     //
     const FFTEnvelope * pEnvelope = NULL;
-    OPENCL_V(const_cast<FFTPlan*>(this)->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
+    OPENCL_V(this->plan->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
     BUG_CHECK (NULL != pEnvelope);
 
-    ::memset( &params, 0, sizeof( params ) );
-    params.fft_precision    = this->precision;
-    params.fft_placeness    = this->placeness;
-    params.fft_inputLayout  = this->inputLayout;
-	params.fft_MaxWorkGroupSize = this->envelope.limit_WorkGroupSize;
+    this->signature.fft_precision    = this->plan->precision;
+    this->signature.fft_placeness    = this->plan->placeness;
+    this->signature.fft_inputLayout  = this->plan->inputLayout;
+	this->signature.fft_MaxWorkGroupSize = this->plan->envelope.limit_WorkGroupSize;
 
-    ARG_CHECK (this->inStride.size() == this->outStride.size())
+    ARG_CHECK (this->plan->inStride.size() == this->plan->outStride.size())
 
-    params.fft_outputLayout = this->outputLayout;
+    this->signature.fft_outputLayout = this->plan->outputLayout;
 
-	params.fft_DataDim = this->length.size() + 1;
+	this->signature.fft_DataDim = this->plan->length.size() + 1;
 	int i = 0;
-	for(i = 0; i < (params.fft_DataDim - 1); i++)
+	for(i = 0; i < (this->signature.fft_DataDim - 1); i++)
 	{
-        params.fft_N[i]         = this->length[i];
-        params.fft_inStride[i]  = this->inStride[i];
-        params.fft_outStride[i] = this->outStride[i];
+        this->signature.fft_N[i]         = this->plan->length[i];
+        this->signature.fft_inStride[i]  = this->plan->inStride[i];
+        this->signature.fft_outStride[i] = this->plan->outStride[i];
 
 	}
-    params.fft_inStride[i]  = this->iDist;
-    params.fft_outStride[i] = this->oDist;
+    this->signature.fft_inStride[i]  = this->plan->iDist;
+    this->signature.fft_outStride[i] = this->plan->oDist;
 
-    params.fft_fwdScale  = this->forwardScale;
-    params.fft_backScale = this->backwardScale;
+    this->signature.fft_fwdScale  = this->plan->forwardScale;
+    this->signature.fft_backScale = this->plan->backwardScale;
 
     return CLFFT_SUCCESS;
 }
 
-template<>
-clfftStatus FFTPlan::GetWorkSizesPvt<Copy> (std::vector<size_t> & globalWS, std::vector<size_t> & localWS) const
+clfftStatus FFTGeneratedCopyAction::getWorkSizes (std::vector<size_t> & globalWS, std::vector<size_t> & localWS)
 {
-    FFTKernelGenKeyParams fftParams;
-	OPENCL_V( this->GetKernelGenKeyPvt<Copy>( fftParams ), _T("GetKernelGenKey() failed!") );
-
 	bool h2c, c2h;
-	h2c = (	(fftParams.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) ||
-			(fftParams.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
-	c2h = (	(fftParams.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) ||
-			(fftParams.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+	h2c = (	(this->signature.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) || (this->signature.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) );
+	c2h = (	(this->signature.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) || (this->signature.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) );
 
 	bool general = !(h2c || c2h);
 
-	size_t count = this->batchsize;
+	size_t count = this->plan->batchsize;
 
-	switch(fftParams.fft_DataDim)
+	switch(this->signature.fft_DataDim)
 	{
 	case 5: assert(false);
-	case 4: count *= fftParams.fft_N[2];
-	case 3: count *= fftParams.fft_N[1];
+	case 4: count *= this->signature.fft_N[2];
+	case 3: count *= this->signature.fft_N[1];
 	case 2:
 			{
 				if(general)
@@ -406,7 +463,7 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Copy> (std::vector<size_t> & globalWS, std:
 				}
 				else
 				{
-					count *= (1 + fftParams.fft_N[0]/2); 
+					count *= (1 + this->signature.fft_N[0]/2); 
 				}
 			}
 			break;
@@ -419,40 +476,30 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Copy> (std::vector<size_t> & globalWS, std:
     return    CLFFT_SUCCESS;
 }
 
-template<>
-clfftStatus FFTPlan::GetMax1DLengthPvt<Copy> (size_t * longest) const
-{
-	return FFTPlan::GetMax1DLengthPvt<Stockham>(longest);
-}
 
 using namespace CopyGenerator;
 
-template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
+clfftStatus FFTGeneratedCopyAction::generateKernel(FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
 {
-  FFTKernelGenKeyParams params;
-  OPENCL_V( this->GetKernelGenKeyPvt<Copy> (params), _T("GetKernelGenKey() failed!") );
 
   bool h2c, c2h;
-  h2c = (	(params.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) ||
-  			(params.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
-  c2h = (	(params.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) ||
-  			(params.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+  h2c = ( (this->signature.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) || (this->signature.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) );
+  c2h = ( (this->signature.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) || (this->signature.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) );
   
   bool general = !(h2c || c2h);
 
   std::string programCode;
-  Precision pr = (params.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
+  Precision pr = (this->signature.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
   switch(pr)
   {
   case P_SINGLE:
     {
-      CopyKernel<P_SINGLE> kernel(params);
+      CopyKernel<P_SINGLE> kernel(this->signature);
       kernel.GenerateKernel(programCode);
     } break;
   case P_DOUBLE:
     {
-      CopyKernel<P_DOUBLE> kernel(params);
+      CopyKernel<P_DOUBLE> kernel(this->signature);
       kernel.GenerateKernel(programCode);
     } break;
   }
@@ -466,15 +513,15 @@ clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo, const cl_command_
     status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
     OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
 
-  OPENCL_V( fftRepo.setProgramCode( Copy, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+  OPENCL_V( fftRepo.setProgramCode( this->getGenerator(), this->getSignatureData(), programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
 
   if(general)
   {
-  OPENCL_V( fftRepo.setProgramEntryPoints( Copy, params, "copy_general", "copy_general", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+  OPENCL_V( fftRepo.setProgramEntryPoints( this->getGenerator(), this->getSignatureData(), "copy_general", "copy_general", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
   }
   else
   {
-  OPENCL_V( fftRepo.setProgramEntryPoints( Copy, params, "copy_c2h", "copy_h2c", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+  OPENCL_V( fftRepo.setProgramEntryPoints( this->getGenerator(), this->getSignatureData(), "copy_c2h", "copy_h2c", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
   }
 
   return CLFFT_SUCCESS;
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 08f173c..bd45288 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -19,6 +19,73 @@
 #include <math.h>
 #include "generator.stockham.h"
 #include <list>
+#include "action.h"
+
+
+FFTGeneratedStockhamAction::FFTGeneratedStockhamAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+    : FFTStockhamAction(plHandle, plan, queue, err)
+{
+    if (err != CLFFT_SUCCESS)
+    {
+        // FFTAction() failed, exit
+        fprintf(stderr, "FFTStockhamAction() failed!\n");
+        return;
+    }
+
+    // Initialize the FFTAction::FFTKernelGenKeyParams member
+    err = this->initParams();
+
+    if (err != CLFFT_SUCCESS)
+    {
+        fprintf(stderr, "FFTGeneratedStockhamAction::initParams() failed!\n");
+        return;
+    }
+
+    FFTRepo &fftRepo = FFTRepo::getInstance();
+
+    err = this->generateKernel(fftRepo, queue);
+
+    if (err != CLFFT_SUCCESS)
+    {
+        fprintf(stderr, "FFTGeneratedStockhamAction::generateKernel failed\n");
+        return;
+    }
+
+    err = compileKernels( queue, plHandle, plan);
+
+    if (err != CLFFT_SUCCESS)
+    {
+        fprintf(stderr, "FFTGeneratedStockhamAction::compileKernels failed\n");
+        return;
+    }
+
+    err = CLFFT_SUCCESS;
+}
+
+bool FFTGeneratedStockhamAction::buildForwardKernel()
+{
+    clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+    clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+    bool r2c_transform = (inputLayout == CLFFT_REAL);
+    bool c2r_transform = (outputLayout == CLFFT_REAL);
+    bool real_transform = (r2c_transform || c2r_transform);
+
+    return (!real_transform) || r2c_transform;
+}
+
+bool FFTGeneratedStockhamAction::buildBackwardKernel()
+{
+    clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+    clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+    bool r2c_transform = (inputLayout == CLFFT_REAL);
+    bool c2r_transform = (outputLayout == CLFFT_REAL);
+    bool real_transform = (r2c_transform || c2r_transform);
+
+    return (!real_transform) || c2r_transform;
+}
+
 
 // FFT Stockham Autosort Method
 //
@@ -3182,64 +3249,64 @@ namespace StockhamGenerator
 
 using namespace StockhamGenerator;
 
-template<>
-clfftStatus FFTPlan::GetKernelGenKeyPvt<Stockham> (FFTKernelGenKeyParams & params) const
+clfftStatus FFTGeneratedStockhamAction::initParams ()
 {
 
     //    Query the devices in this context for their local memory sizes
     //    How we generate a kernel depends on the *minimum* LDS size for all devices.
     //
     const FFTEnvelope * pEnvelope = NULL;
-    OPENCL_V(const_cast<FFTPlan*>(this)->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
+    OPENCL_V(this->plan->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
     BUG_CHECK (NULL != pEnvelope);
 
-    ::memset( &params, 0, sizeof( params ) );
-    params.fft_precision    = this->precision;
-    params.fft_placeness    = this->placeness;
-    params.fft_inputLayout  = this->inputLayout;
-	params.fft_MaxWorkGroupSize = this->envelope.limit_WorkGroupSize;
+    // Remainder: params was properly cleared by its constructor
+    //            clearing it again would destroy datasize and id!!
+    this->signature.fft_precision    = this->plan->precision;
+    this->signature.fft_placeness    = this->plan->placeness;
+    this->signature.fft_inputLayout  = this->plan->inputLayout;
+	this->signature.fft_MaxWorkGroupSize = this->plan->envelope.limit_WorkGroupSize;
 
-    ARG_CHECK(this->length.size()    > 0);
-	ARG_CHECK(this->inStride.size()  > 0);
-    ARG_CHECK(this->outStride.size() > 0);
+    ARG_CHECK(this->plan->length.size()    > 0);
+	ARG_CHECK(this->plan->inStride.size()  > 0);
+    ARG_CHECK(this->plan->outStride.size() > 0);
 
-    ARG_CHECK (this->inStride.size() == this->outStride.size())
+    ARG_CHECK (this->plan->inStride.size() == this->plan->outStride.size())
 
-	bool real_transform = ((this->inputLayout == CLFFT_REAL) || (this->outputLayout == CLFFT_REAL));
+	bool real_transform = ((this->plan->inputLayout == CLFFT_REAL) || (this->plan->outputLayout == CLFFT_REAL));
 
-    if ( (CLFFT_INPLACE == this->placeness) && (!real_transform) ) {
+    if ( (CLFFT_INPLACE == this->plan->placeness) && (!real_transform) ) {
         //    If this is an in-place transform the
         //    input and output layout, dimensions and strides
         //    *MUST* be the same.
         //
-        ARG_CHECK (this->inputLayout == this->outputLayout)
-        params.fft_outputLayout = this->inputLayout;
-        for (size_t u = this->inStride.size(); u-- > 0; ) {
-            ARG_CHECK (this->inStride[u] == this->outStride[u]);
+        ARG_CHECK (this->plan->inputLayout == this->plan->outputLayout)
+        this->signature.fft_outputLayout = this->plan->inputLayout;
+        for (size_t u = this->plan->inStride.size(); u-- > 0; ) {
+            ARG_CHECK (this->plan->inStride[u] == this->plan->outStride[u]);
         }
     } else {
-        params.fft_outputLayout = this->outputLayout;
+        this->signature.fft_outputLayout = this->plan->outputLayout;
     }
 
-	params.fft_DataDim = this->length.size() + 1;
+	this->signature.fft_DataDim = this->plan->length.size() + 1;
 	int i = 0;
-	for(i = 0; i < (params.fft_DataDim - 1); i++)
+	for(i = 0; i < (this->signature.fft_DataDim - 1); i++)
 	{
-        params.fft_N[i]         = this->length[i];
-        params.fft_inStride[i]  = this->inStride[i];
-        params.fft_outStride[i] = this->outStride[i];
+        this->signature.fft_N[i]         = this->plan->length[i];
+        this->signature.fft_inStride[i]  = this->plan->inStride[i];
+        this->signature.fft_outStride[i] = this->plan->outStride[i];
 
 	}
-    params.fft_inStride[i]  = this->iDist;
-    params.fft_outStride[i] = this->oDist;
+    this->signature.fft_inStride[i]  = this->plan->iDist;
+    this->signature.fft_outStride[i] = this->plan->oDist;
 
 
-	params.fft_RCsimple = this->RCsimple;
+	this->signature.fft_RCsimple = this->plan->RCsimple;
 
-	params.blockCompute = this->blockCompute;
-	params.blockComputeType = this->blockComputeType;
+	this->signature.blockCompute = this->plan->blockCompute;
+	this->signature.blockComputeType = this->plan->blockComputeType;
 
-	params.fft_twiddleFront = this->twiddleFront;
+	this->signature.fft_twiddleFront = this->plan->twiddleFront;
 
 	size_t wgs, nt;
 #ifdef PARMETERS_TO_BE_READ
@@ -3249,106 +3316,101 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Stockham> (FFTKernelGenKeyParams & param
 	nt = pr.numTransformsPerWg;
 #else
 	size_t t_wgs, t_nt;
-	Precision pr = (params.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
+	Precision pr = (this->signature.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
 	switch(pr)
 	{
 	case P_SINGLE:
 		{
 			KernelCoreSpecs<P_SINGLE> kcs;
-			kcs.GetWGSAndNT(params.fft_N[0], t_wgs, t_nt);
-			if(params.blockCompute)
+			kcs.GetWGSAndNT(this->signature.fft_N[0], t_wgs, t_nt);
+			if(this->signature.blockCompute)
 			{
-				params.blockSIMD = Kernel<P_SINGLE>::BlockSizes::BlockWorkGroupSize(params.fft_N[0]);
-				params.blockLDS  = Kernel<P_SINGLE>::BlockSizes::BlockLdsSize(params.fft_N[0]);
+				this->signature.blockSIMD = Kernel<P_SINGLE>::BlockSizes::BlockWorkGroupSize(this->signature.fft_N[0]);
+				this->signature.blockLDS  = Kernel<P_SINGLE>::BlockSizes::BlockLdsSize(this->signature.fft_N[0]);
 			}
 		} break;
 	case P_DOUBLE:
 		{
 			KernelCoreSpecs<P_DOUBLE> kcs;
-			kcs.GetWGSAndNT(params.fft_N[0], t_wgs, t_nt);
-			if(params.blockCompute)
+			kcs.GetWGSAndNT(this->signature.fft_N[0], t_wgs, t_nt);
+			if(this->signature.blockCompute)
 			{
-				params.blockSIMD = Kernel<P_DOUBLE>::BlockSizes::BlockWorkGroupSize(params.fft_N[0]);
-				params.blockLDS  = Kernel<P_DOUBLE>::BlockSizes::BlockLdsSize(params.fft_N[0]);
+				this->signature.blockSIMD = Kernel<P_DOUBLE>::BlockSizes::BlockWorkGroupSize(this->signature.fft_N[0]);
+				this->signature.blockLDS  = Kernel<P_DOUBLE>::BlockSizes::BlockLdsSize(this->signature.fft_N[0]);
 			}
 		} break;
 	}
 
-	if((t_wgs != 0) && (t_nt != 0) && (this->envelope.limit_WorkGroupSize >= 256))
+	if((t_wgs != 0) && (t_nt != 0) && (this->plan->envelope.limit_WorkGroupSize >= 256))
 	{
 		wgs = t_wgs;
 		nt = t_nt;
 	}
 	else
-		DetermineSizes(this->envelope.limit_WorkGroupSize, params.fft_N[0], wgs, nt);
+		DetermineSizes(this->plan->envelope.limit_WorkGroupSize, this->signature.fft_N[0], wgs, nt);
 #endif
 
-	assert((nt * params.fft_N[0]) >= wgs);
-	assert((nt * params.fft_N[0])%wgs == 0);
+	assert((nt * this->signature.fft_N[0]) >= wgs);
+	assert((nt * this->signature.fft_N[0])%wgs == 0);
 
-	params.fft_R = (nt * params.fft_N[0])/wgs;
-	params.fft_SIMD = wgs;
+	this->signature.fft_R = (nt * this->signature.fft_N[0])/wgs;
+	this->signature.fft_SIMD = wgs;
 
 
-    if (this->large1D != 0) {
-        ARG_CHECK (params.fft_N[0] != 0)
-        ARG_CHECK ((this->large1D % params.fft_N[0]) == 0)
-        params.fft_3StepTwiddle = true;
-		ARG_CHECK ( this->large1D  == (params.fft_N[1] * params.fft_N[0]) );
+    if (this->plan->large1D != 0) {
+        ARG_CHECK (this->signature.fft_N[0] != 0)
+        ARG_CHECK ((this->plan->large1D % this->signature.fft_N[0]) == 0)
+        this->signature.fft_3StepTwiddle = true;
+		ARG_CHECK ( this->plan->large1D  == (this->signature.fft_N[1] * this->signature.fft_N[0]) );
     }
 
-    params.fft_fwdScale  = this->forwardScale;
-    params.fft_backScale = this->backwardScale;
+    this->signature.fft_fwdScale  = this->plan->forwardScale;
+    this->signature.fft_backScale = this->plan->backwardScale;
 
     return CLFFT_SUCCESS;
 }
 
-template<>
-clfftStatus FFTPlan::GetWorkSizesPvt<Stockham> (std::vector<size_t> & globalWS, std::vector<size_t> & localWS) const
+clfftStatus FFTGeneratedStockhamAction::getWorkSizes (std::vector<size_t> & globalWS, std::vector<size_t> & localWS)
 {
     //    How many complex numbers in the input mutl-dimensional array?
     //
     unsigned long long count = 1;
-    for (unsigned u = 0; u < length.size(); ++u) {
-        count *= std::max<size_t> (1, this->length[ u ]);
+    for (unsigned u = 0; u < this->plan->length.size(); ++u) {
+        count *= std::max<size_t> (1, this->plan->length[ u ]);
     }
-    count *= this->batchsize;
+    count *= this->plan->batchsize;
 
-    FFTKernelGenKeyParams fftParams;
-    //    Translate the user plan into the structure that we use to map plans to clPrograms
-    OPENCL_V( this->GetKernelGenKeyPvt<Stockham>( fftParams ), _T("GetKernelGenKey() failed!") );
 
-	if(fftParams.blockCompute)
+	if(this->signature.blockCompute)
 	{
-		count = DivRoundingUp<unsigned long long> (count, fftParams.blockLDS); 
-		count = count * fftParams.blockSIMD; 
+		count = DivRoundingUp<unsigned long long> (count, this->signature.blockLDS); 
+		count = count * this->signature.blockSIMD; 
 
 		globalWS.push_back( static_cast< size_t >( count ) );
-		localWS.push_back( fftParams.blockSIMD );
+		localWS.push_back( this->signature.blockSIMD );
 
 		return    CLFFT_SUCCESS;
 	}
 
-    count = DivRoundingUp<unsigned long long> (count, fftParams.fft_R);      // count of WorkItems
-    count = DivRoundingUp<unsigned long long> (count, fftParams.fft_SIMD);   // count of WorkGroups
+    count = DivRoundingUp<unsigned long long> (count, this->signature.fft_R);      // count of WorkItems
+    count = DivRoundingUp<unsigned long long> (count, this->signature.fft_SIMD);   // count of WorkGroups
 
 	// for real transforms we only need half the work groups since we do twice the work in 1 work group
-	if( !(fftParams.fft_RCsimple) && ((fftParams.fft_inputLayout == CLFFT_REAL) || (fftParams.fft_outputLayout == CLFFT_REAL)) )
+	if( !(this->signature.fft_RCsimple) && ((this->signature.fft_inputLayout == CLFFT_REAL) || (this->signature.fft_outputLayout == CLFFT_REAL)) )
 		count = DivRoundingUp<unsigned long long> (count, 2);
 
-    count = std::max<unsigned long long> (count, 1) * fftParams.fft_SIMD;
+    count = std::max<unsigned long long> (count, 1) * this->signature.fft_SIMD;
         // .. count of WorkItems, rounded up to next multiple of fft_SIMD.
 
 	// 1 dimension work group size
 	globalWS.push_back( static_cast< size_t >( count ) );
 
-    localWS.push_back( fftParams.fft_SIMD );
+    localWS.push_back( this->signature.fft_SIMD );
 
     return    CLFFT_SUCCESS;
 }
 
-template<>
-clfftStatus FFTPlan::GetMax1DLengthPvt<Stockham> (size_t * longest) const
+clfftStatus FFTPlan::GetMax1DLengthStockham (size_t * longest) const
 {
 	// TODO  The caller has already acquired the lock on *this
 	//	However, we shouldn't depend on it.
@@ -3370,12 +3432,8 @@ clfftStatus FFTPlan::GetMax1DLengthPvt<Stockham> (size_t * longest) const
 	return CLFFT_SUCCESS;
 }
 
-template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
+clfftStatus FFTGeneratedStockhamAction::generateKernel(FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
 {
-    FFTKernelGenKeyParams params;
-    OPENCL_V( this->GetKernelGenKeyPvt<Stockham> (params), _T("GetKernelGenKey() failed!") );
-
     cl_int status = CL_SUCCESS;
     cl_device_id Device = NULL;
     status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
@@ -3386,17 +3444,17 @@ clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_comm
     OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
 
 	std::string programCode;
-	Precision pr = (params.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
+	Precision pr = (this->signature.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
 	switch(pr)
 	{
 	case P_SINGLE:
 		{
-			Kernel<P_SINGLE> kernel(params);
+			Kernel<P_SINGLE> kernel(this->signature);
 			kernel.GenerateKernel(programCode, Device);
 		} break;
 	case P_DOUBLE:
 		{
-			Kernel<P_DOUBLE> kernel(params);
+			Kernel<P_DOUBLE> kernel(this->signature);
 			kernel.GenerateKernel(programCode, Device);
 		} break;
 	}
@@ -3405,8 +3463,8 @@ clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_comm
 	ReadKernelFromFile(programCode);
 #endif
 
-    OPENCL_V( fftRepo.setProgramCode( Stockham, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
-    OPENCL_V( fftRepo.setProgramEntryPoints( Stockham, params, "fft_fwd", "fft_back", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+    OPENCL_V( fftRepo.setProgramCode( this->getGenerator(), this->getSignatureData(), programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+    OPENCL_V( fftRepo.setProgramEntryPoints( this->getGenerator(), this->getSignatureData(), "fft_fwd", "fft_back", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
 
     return CLFFT_SUCCESS;
 }
diff --git a/src/library/generator.transpose.gcn.cpp b/src/library/generator.transpose.gcn.cpp
index 1d4a46d..c1a3952 100644
--- a/src/library/generator.transpose.gcn.cpp
+++ b/src/library/generator.transpose.gcn.cpp
@@ -28,6 +28,74 @@
 #include "generator.transpose.gcn.h"
 #include "generator.stockham.h"
 
+#include "action.h"
+
+FFTGeneratedTransposeGCNAction::FFTGeneratedTransposeGCNAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+    : FFTTransposeGCNAction(plHandle, plan, queue, err)
+{
+    if (err != CLFFT_SUCCESS)
+    {
+        // FFTTransposeGCNAction() failed, exit
+        fprintf(stderr, "FFTTransposeGCNAction() failed!\n");
+        return;
+    }
+
+    // Initialize the FFTAction::FFTKernelGenKeyParams member
+    err = this->initParams();
+
+    if (err != CLFFT_SUCCESS)
+    {
+        fprintf(stderr, "FFTGeneratedTransposeGCNAction::initParams() failed!\n");
+        return;
+    }
+
+    FFTRepo &fftRepo = FFTRepo::getInstance();
+
+    err = this->generateKernel(fftRepo, queue);
+
+    if (err != CLFFT_SUCCESS)
+    {
+        fprintf(stderr, "FFTGeneratedTransposeGCNAction::generateKernel failed\n");
+        return;
+    }
+
+    err = compileKernels( queue, plHandle, plan);
+
+    if (err != CLFFT_SUCCESS)
+    {
+        fprintf(stderr, "FFTGeneratedTransposeGCNAction::compileKernels failed\n");
+        return;
+    }
+
+    err = CLFFT_SUCCESS;
+}
+
+
+bool FFTGeneratedTransposeGCNAction::buildForwardKernel()
+{
+    clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+    clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+    bool r2c_transform = (inputLayout == CLFFT_REAL);
+    bool c2r_transform = (outputLayout == CLFFT_REAL);
+    bool real_transform = (r2c_transform || c2r_transform);
+
+    return (!real_transform) || r2c_transform;
+}
+
+bool FFTGeneratedTransposeGCNAction::buildBackwardKernel()
+{
+    clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+    clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+    bool r2c_transform = (inputLayout == CLFFT_REAL);
+    bool c2r_transform = (outputLayout == CLFFT_REAL);
+    bool real_transform = (r2c_transform || c2r_transform);
+
+    return (!real_transform) || c2r_transform;
+}
+
+
 // A structure that represents a bounding box or tile, with convenient names for the row and column addresses
 // local work sizes
 struct tile
@@ -142,7 +210,7 @@ const std::string pmImagOut( "pmImagOut" );
 const std::string pmComplexIn( "pmComplexIn" );
 const std::string pmComplexOut( "pmComplexOut" );
 
-static clfftStatus genTransposePrototype( const FFTKernelGenKeyParams& params, const tile& lwSize, const std::string& dtPlanar, const std::string& dtComplex, 
+static clfftStatus genTransposePrototype( const FFTGeneratedTransposeGCNAction::Signature & params, const tile& lwSize, const std::string& dtPlanar, const std::string& dtComplex, 
                                          const std::string &funcName, std::stringstream& transKernel, std::string& dtInput, std::string& dtOutput )
 {
 
@@ -231,7 +299,7 @@ static clfftStatus genTransposePrototype( const FFTKernelGenKeyParams& params, c
     return CLFFT_SUCCESS;
 }
 
-static clfftStatus genTransposeKernel( const FFTKernelGenKeyParams& params, std::string& strKernel, const tile& lwSize, const size_t reShapeFactor, 
+static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Signature & params, std::string& strKernel, const tile& lwSize, const size_t reShapeFactor, 
                                             const size_t loopCount, const size_t outRowPadding )
 {
     strKernel.reserve( 4096 );
@@ -497,67 +565,67 @@ static clfftStatus genTransposeKernel( const FFTKernelGenKeyParams& params, std:
     return CLFFT_SUCCESS;
 }
 
-template<>
-clfftStatus FFTPlan::GetKernelGenKeyPvt<Transpose_GCN> (FFTKernelGenKeyParams & params) const
+
+clfftStatus FFTGeneratedTransposeGCNAction::initParams ()
 {
-    ::memset( &params, 0, sizeof( params ) );
-    params.fft_precision    = this->precision;
-    params.fft_placeness    = this->placeness;
-    params.fft_inputLayout  = this->inputLayout;
-    params.fft_outputLayout = this->outputLayout;
-    params.fft_3StepTwiddle = false;
-
-	params.transOutHorizontal = this->transOutHorizontal;	// using the twiddle front flag to specify horizontal write
+
+    this->signature.fft_precision    = this->plan->precision;
+    this->signature.fft_placeness    = this->plan->placeness;
+    this->signature.fft_inputLayout  = this->plan->inputLayout;
+    this->signature.fft_outputLayout = this->plan->outputLayout;
+    this->signature.fft_3StepTwiddle = false;
+
+	this->signature.transOutHorizontal = this->plan->transOutHorizontal;	// using the twiddle front flag to specify horizontal write
 														// we do this so as to reuse flags in FFTKernelGenKeyParams
 														// and to avoid making a new one 
 
-    ARG_CHECK( this->inStride.size( ) == this->outStride.size( ) );
+    ARG_CHECK( this->plan->inStride.size( ) == this->plan->outStride.size( ) );
 
-    if( CLFFT_INPLACE == params.fft_placeness )
+    if( CLFFT_INPLACE == this->signature.fft_placeness )
     {
         //	If this is an in-place transform the
         //	input and output layout, dimensions and strides
         //	*MUST* be the same.
         //
-        ARG_CHECK( params.fft_inputLayout == params.fft_outputLayout )
+        ARG_CHECK( this->signature.fft_inputLayout == this->signature.fft_outputLayout )
 
-        for( size_t u = this->inStride.size(); u-- > 0; )
+        for( size_t u = this->plan->inStride.size(); u-- > 0; )
         {
-            ARG_CHECK( this->inStride[u] == this->outStride[u] );
+            ARG_CHECK( this->plan->inStride[u] == this->plan->outStride[u] );
         }
     }
 
-	params.fft_DataDim = this->length.size() + 1;
+	this->signature.fft_DataDim = this->plan->length.size() + 1;
 	int i = 0;
-	for(i = 0; i < (params.fft_DataDim - 1); i++)
+	for(i = 0; i < (this->signature.fft_DataDim - 1); i++)
 	{
-        params.fft_N[i]         = this->length[i];
-        params.fft_inStride[i]  = this->inStride[i];
-        params.fft_outStride[i] = this->outStride[i];
+        this->signature.fft_N[i]         = this->plan->length[i];
+        this->signature.fft_inStride[i]  = this->plan->inStride[i];
+        this->signature.fft_outStride[i] = this->plan->outStride[i];
 
 	}
-    params.fft_inStride[i]  = this->iDist;
-    params.fft_outStride[i] = this->oDist;
-
-    if (this->large1D != 0) {
-        ARG_CHECK (params.fft_N[0] != 0)
-        ARG_CHECK ((this->large1D % params.fft_N[0]) == 0)
-        params.fft_3StepTwiddle = true;
-		ARG_CHECK ( this->large1D  == (params.fft_N[1] * params.fft_N[0]) );
+    this->signature.fft_inStride[i]  = this->plan->iDist;
+    this->signature.fft_outStride[i] = this->plan->oDist;
+
+    if (this->plan->large1D != 0) {
+        ARG_CHECK (this->signature.fft_N[0] != 0)
+        ARG_CHECK ((this->plan->large1D % this->signature.fft_N[0]) == 0)
+        this->signature.fft_3StepTwiddle = true;
+		ARG_CHECK ( this->plan->large1D  == (this->signature.fft_N[1] * this->signature.fft_N[0]) );
     }
 
     //	Query the devices in this context for their local memory sizes
     //	How we generate a kernel depends on the *minimum* LDS size for all devices.
     //
     const FFTEnvelope * pEnvelope = NULL;
-    OPENCL_V( this->GetEnvelope( &pEnvelope ), _T( "GetEnvelope failed" ) );
+    OPENCL_V( this->plan->GetEnvelope( &pEnvelope ), _T( "GetEnvelope failed" ) );
     BUG_CHECK( NULL != pEnvelope );
 
     // TODO:  Since I am going with a 2D workgroup size now, I need a better check than this 1D use
     // Check:  CL_DEVICE_MAX_WORK_GROUP_SIZE/CL_KERNEL_WORK_GROUP_SIZE
     // CL_DEVICE_MAX_WORK_ITEM_SIZES
-    params.fft_R = 1; // Dont think i'll use
-    params.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
+    this->signature.fft_R = 1; // Dont think i'll use
+    this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
 
     return CLFFT_SUCCESS;
 }
@@ -577,13 +645,10 @@ size_t loopCount = 0;
 
 //	OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
 //	Feed this generator the FFTPlan, and it returns the generated program as a string
-template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Transpose_GCN> ( FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
+clfftStatus FFTGeneratedTransposeGCNAction::generateKernel ( FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
 {
-    FFTKernelGenKeyParams params;
-    OPENCL_V( this->GetKernelGenKeyPvt<Transpose_GCN>( params ), _T( "GetKernelGenKey() failed!" ) );
 
-    switch( params.fft_precision )
+    switch( this->signature.fft_precision )
     {
     case CLFFT_SINGLE:
     case CLFFT_SINGLE_FAST:
@@ -600,7 +665,7 @@ clfftStatus FFTPlan::GenerateKernelPvt<Transpose_GCN> ( FFTRepo& fftRepo, const
     }
 
     std::string programCode;
-    OPENCL_V( genTransposeKernel( params, programCode, lwSize, reShapeFactor, loopCount, outRowPadding ), _T( "GenerateTransposeKernel() failed!" ) );
+    OPENCL_V( genTransposeKernel( this->signature, programCode, lwSize, reShapeFactor, loopCount, outRowPadding ), _T( "GenerateTransposeKernel() failed!" ) );
 
     cl_int status = CL_SUCCESS;
     cl_device_id Device = NULL;
@@ -612,39 +677,38 @@ clfftStatus FFTPlan::GenerateKernelPvt<Transpose_GCN> ( FFTRepo& fftRepo, const
     OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
 
 
-    OPENCL_V( fftRepo.setProgramCode( Transpose_GCN, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+    OPENCL_V( fftRepo.setProgramCode( Transpose_GCN, this->getSignatureData(), programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
 
     // Note:  See genFunctionPrototype( )
-    if( params.fft_3StepTwiddle )
+    if( this->signature.fft_3StepTwiddle )
     {
-        OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_GCN, params, "transpose_gcn_tw_fwd", "transpose_gcn_tw_back", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+        OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_GCN, this->getSignatureData(), "transpose_gcn_tw_fwd", "transpose_gcn_tw_back", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
     }
     else
     {
-        OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_GCN, params, "transpose_gcn", "transpose_gcn", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+        OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_GCN, this->getSignatureData(), "transpose_gcn", "transpose_gcn", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
     }
 
     return CLFFT_SUCCESS;
 }
 
-template<>
-clfftStatus FFTPlan::GetWorkSizesPvt<Transpose_GCN>( std::vector< size_t >& globalWS, std::vector< size_t >& localWS ) const
+
+clfftStatus FFTGeneratedTransposeGCNAction::getWorkSizes( std::vector< size_t >& globalWS, std::vector< size_t >& localWS )
 {
-	FFTKernelGenKeyParams parameters;
-    OPENCL_V( this->GetKernelGenKeyPvt<Transpose_GCN>( parameters ), _T( "GetKernelGenKey() failed!" ) );
+
     // We need to make sure that the global work size is evenly divisible by the local work size
     // Our transpose works in tiles, so divide tiles in each dimension to get count of blocks, rounding up for remainder items
-    size_t numBlocksX = NumBlocksX(parameters.fft_N[ 0 ]);
-    size_t numBlocksY = DivRoundingUp( parameters.fft_N[ 1 ], lwSize.y / reShapeFactor * loopCount );
+    size_t numBlocksX = NumBlocksX(this->signature.fft_N[ 0 ]);
+    size_t numBlocksY = DivRoundingUp( this->signature.fft_N[ 1 ], lwSize.y / reShapeFactor * loopCount );
     size_t numWIX = numBlocksX * lwSize.x;
 
     // Batches of matrices are lined up along the Y axis, 1 after the other
-	size_t numWIY = numBlocksY * lwSize.y * this->batchsize;
+	size_t numWIY = numBlocksY * lwSize.y * this->plan->batchsize;
 	// fft_DataDim has one more dimension than the actual fft data, which is devoted to batch.
 	// dim from 2 to fft_DataDim - 2 are lined up along the Y axis
-	for(int i = 2; i < parameters.fft_DataDim - 1; i++)
+	for(int i = 2; i < this->signature.fft_DataDim - 1; i++)
 	{
-		numWIY *= parameters.fft_N[i];
+		numWIY *= this->signature.fft_N[i];
 	}
 
 
diff --git a/src/library/generator.transpose.vliw.cpp b/src/library/generator.transpose.vliw.cpp
index f715c7d..b075314 100644
--- a/src/library/generator.transpose.vliw.cpp
+++ b/src/library/generator.transpose.vliw.cpp
@@ -23,6 +23,75 @@
 #include "stdafx.h"
 #include <math.h>
 #include "generator.transpose.vliw.h"
+#include "action.h"
+
+
+FFTGeneratedTransposeVLIWAction::FFTGeneratedTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+    : FFTTransposeVLIWAction(plHandle, plan, queue, err)
+{
+    if (err != CLFFT_SUCCESS)
+    {
+        // FFTTransposeVLIWAction() failed, exit
+        fprintf(stderr, "FFTTransposeVLIWAction() failed!\n");
+        return;
+    }
+
+    // Initialize the FFTAction::FFTKernelGenKeyParams member
+    err = this->initParams();
+
+    if (err != CLFFT_SUCCESS)
+    {
+        fprintf(stderr, "FFTGeneratedTransposeVLIWAction::initParams() failed!\n");
+        return;
+    }
+
+    FFTRepo &fftRepo = FFTRepo::getInstance();
+
+    err = this->generateKernel(fftRepo, queue);
+
+    if (err != CLFFT_SUCCESS)
+    {
+        fprintf(stderr, "FFTGeneratedTransposeVLIWAction::generateKernel failed\n");
+        return;
+    }
+
+    err = compileKernels( queue, plHandle, plan);
+
+    if (err != CLFFT_SUCCESS)
+    {
+        fprintf(stderr, "FFTGeneratedTransposeVLIWAction::compileKernels failed\n");
+        return;
+    }
+
+    err = CLFFT_SUCCESS;
+}
+
+
+bool FFTGeneratedTransposeVLIWAction::buildForwardKernel()
+{
+    clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+    clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+    bool r2c_transform = (inputLayout == CLFFT_REAL);
+    bool c2r_transform = (outputLayout == CLFFT_REAL);
+    bool real_transform = (r2c_transform || c2r_transform);
+
+    return (!real_transform) || r2c_transform;
+}
+
+bool FFTGeneratedTransposeVLIWAction::buildBackwardKernel()
+{
+    clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+    clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+    bool r2c_transform = (inputLayout == CLFFT_REAL);
+    bool c2r_transform = (outputLayout == CLFFT_REAL);
+    bool real_transform = (r2c_transform || c2r_transform);
+
+    return (!real_transform) || c2r_transform;
+}
+
+
 
 #define QUOTEMARK(x) #x
 
@@ -75,7 +144,7 @@ typedef enum inputoutputflag_
 	ENDTRANSIO
 } transio;
 
-static clfftStatus GenerateTransposeKernel (FFTKernelGenKeyParams & params,
+static clfftStatus GenerateTransposeKernel (FFTGeneratedTransposeVLIWAction::Signature  & params,
 	std::string & kernel)
 {
 	kernel.reserve (8000);
@@ -736,84 +805,78 @@ static clfftStatus GenerateTransposeKernel (FFTKernelGenKeyParams & params,
 	return CLFFT_SUCCESS;
 }
 
-template<>
-clfftStatus FFTPlan::GetKernelGenKeyPvt<Transpose_VLIW> (FFTKernelGenKeyParams & params) const
+clfftStatus FFTGeneratedTransposeVLIWAction::initParams ()
 {
 
 	//	Query the devices in this context for their local memory sizes
 	//	How we generate a kernel depends on the *minimum* LDS size for all devices.
 	//
 	const FFTEnvelope * pEnvelope = NULL;
-	OPENCL_V(const_cast<FFTPlan*>(this)->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
+	OPENCL_V(this->plan->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
 	BUG_CHECK (NULL != pEnvelope);
 
-	::memset( &params, 0, sizeof( params ) );
-	params.fft_precision    = this->precision;
-	params.fft_placeness    = this->placeness;
-	params.fft_inputLayout  = this->inputLayout;
 
-	ARG_CHECK (this->inStride.size() == this->outStride.size())
+	this->signature.fft_precision    = this->plan->precision;
+	this->signature.fft_placeness    = this->plan->placeness;
+	this->signature.fft_inputLayout  = this->plan->inputLayout;
+
+	ARG_CHECK (this->plan->inStride.size() == this->plan->outStride.size())
 
-	if (CLFFT_INPLACE == this->placeness) {
+	if (CLFFT_INPLACE == this->plan->placeness) {
 		//	If this is an in-place transform the
 		//	input and output layout, dimensions and strides
 		//	*MUST* be the same.
 		//
-		ARG_CHECK (this->inputLayout == this->outputLayout)
-		params.fft_outputLayout = this->inputLayout;
-		for (size_t u = this->inStride.size(); u-- > 0; ) {
-			ARG_CHECK (this->inStride[u] == this->outStride[u]);
+		ARG_CHECK (this->plan->inputLayout == this->plan->outputLayout)
+		this->signature.fft_outputLayout = this->plan->inputLayout;
+		for (size_t u = this->plan->inStride.size(); u-- > 0; ) {
+			ARG_CHECK (this->plan->inStride[u] == this->plan->outStride[u]);
 		}
 	} else {
-		params.fft_outputLayout = this->outputLayout;
+		this->signature.fft_outputLayout = this->plan->outputLayout;
 	}
 
 	//we only support 2D transpose
-	switch (this->inStride.size()) {
+	switch (this->plan->inStride.size()) {
 		//	2-D array is a 3-D data structure
 		//	2-D unit is a speical case of 2-D array.
 	case 2:
-		ARG_CHECK(this->length   .size() > 1);
-		ARG_CHECK(this->outStride.size() > 1);
-		params.fft_DataDim      = 3;
-		params.fft_N[0]         = this->length[0];
-		params.fft_N[1]         = this->length[1];
-		params.fft_inStride[0]  = this->inStride[0];
-		params.fft_inStride[1]  = this->inStride[1];
-		params.fft_inStride[2]  = this->iDist;
-		params.fft_outStride[0] = this->outStride[0];
-		params.fft_outStride[1] = this->outStride[1];
-		params.fft_outStride[2] = this->oDist;
+		ARG_CHECK(this->plan->length   .size() > 1);
+		ARG_CHECK(this->plan->outStride.size() > 1);
+		this->signature.fft_DataDim      = 3;
+		this->signature.fft_N[0]         = this->plan->length[0];
+		this->signature.fft_N[1]         = this->plan->length[1];
+		this->signature.fft_inStride[0]  = this->plan->inStride[0];
+		this->signature.fft_inStride[1]  = this->plan->inStride[1];
+		this->signature.fft_inStride[2]  = this->plan->iDist;
+		this->signature.fft_outStride[0] = this->plan->outStride[0];
+		this->signature.fft_outStride[1] = this->plan->outStride[1];
+		this->signature.fft_outStride[2] = this->plan->oDist;
 		break;
 	default:
 		ARG_CHECK (false);
 	}
 
 	//ToDO: work group size setup
-	params.fft_R = 32; // divide the element into 32x32 blocks
-	params.fft_SIMD = 64; //work group size
+	this->signature.fft_R = 32; // divide the element into 32x32 blocks
+	this->signature.fft_SIMD = 64; //work group size
 
 	return CLFFT_SUCCESS;
 }
 
-template<>
-clfftStatus FFTPlan::GetWorkSizesPvt<Transpose_VLIW> (std::vector<size_t> & globalWS, std::vector<size_t> & localWS) const
+clfftStatus  FFTGeneratedTransposeVLIWAction::getWorkSizes (std::vector<size_t> & globalWS, std::vector<size_t> & localWS)
 {
-	//	How many numbers per workitem in the generated kernel?
-	FFTKernelGenKeyParams fftParams;
-	//	Translate the user plan into the structure that we use to map plans to clPrograms
-	OPENCL_V( this->GetKernelGenKeyPvt<Transpose_VLIW>( fftParams ), _T("GetKernelGenKey() failed!") );
 
 	unsigned long long count, count0, count1;
-	count0 = DivRoundingUp<unsigned long long> (this->length[0], fftParams.fft_R);
-	count1 = DivRoundingUp<unsigned long long> (this->length[1], fftParams.fft_R);
+	count0 = DivRoundingUp<unsigned long long> (this->plan->length[0], this->signature.fft_R);
+	count1 = DivRoundingUp<unsigned long long> (this->plan->length[1], this->signature.fft_R);
 	count  = (count0>count1) ? count0 : count1;
 	count  = (count * (count+1)) /2;
-	count *= fftParams.fft_SIMD;
-	count *= this->batchsize;
+	count *= this->signature.fft_SIMD;
+	count *= this->plan->batchsize;
 
 	globalWS.push_back( static_cast< size_t >( count ) );
-	localWS.push_back( fftParams.fft_SIMD );
+	localWS.push_back( this->signature.fft_SIMD );
 
 	return	CLFFT_SUCCESS;
 }
@@ -821,14 +884,11 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Transpose_VLIW> (std::vector<size_t> & glob
 
 //	OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
 //	Feed this generator the FFTPlan, and it returns the generated program as a string
-template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Transpose_VLIW> ( FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
+clfftStatus FFTGeneratedTransposeVLIWAction::generateKernel ( FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
 {
-	FFTKernelGenKeyParams params;
-	OPENCL_V( this->GetKernelGenKeyPvt<Transpose_VLIW> (params), _T("GetKernelGenKey() failed!") );
 
 	std::string programCode;
-	OPENCL_V( GenerateTransposeKernel( params, programCode ), _T( "GenerateTransposeKernel() failed!" ) );
+	OPENCL_V( GenerateTransposeKernel(  this->signature, programCode ), _T( "GenerateTransposeKernel() failed!" ) );
 
     cl_int status = CL_SUCCESS;
     cl_device_id Device = NULL;
@@ -839,8 +899,8 @@ clfftStatus FFTPlan::GenerateKernelPvt<Transpose_VLIW> ( FFTRepo& fftRepo, const
     status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
     OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
 
-	OPENCL_V( fftRepo.setProgramCode( Transpose_VLIW, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
-	OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_VLIW, params, "fft_trans", "fft_trans", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+	OPENCL_V( fftRepo.setProgramCode( Transpose_VLIW, this->getSignatureData(), programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+	OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_VLIW,  this->getSignatureData(), "fft_trans", "fft_trans", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
 
 	return CLFFT_SUCCESS;
 }
diff --git a/src/library/lifetime.cpp b/src/library/lifetime.cpp
index 7548e9a..f53f670 100644
--- a/src/library/lifetime.cpp
+++ b/src/library/lifetime.cpp
@@ -34,6 +34,8 @@ clfftStatus	clfftSetup( const clfftSetupData* sData )
 	//	First invocation of this function will allocate the FFTRepo singleton; thereafter the object always exists
 	FFTRepo& fftRepo	= FFTRepo::getInstance( );
 
+	clfftInitBinaryCache();
+
 	//	Discover and load the timer module if present
 	fftRepo.timerHandle = LoadSharedLibrary( "lib", "StatTimer", true );
 	if( fftRepo.timerHandle )
diff --git a/src/library/mainpage.h b/src/library/mainpage.h
index 70e014c..5256a65 100644
--- a/src/library/mainpage.h
+++ b/src/library/mainpage.h
@@ -35,11 +35,11 @@ components as a pair contiguous in memory) formats.
 @section InstallFFT Installation of clFFT library
 
 @subsection DownBinaries Downloadable Binaries
-AMD provides clFFT library pre-compiled packages for recent versions of Microsoft Windows operating systems
-and several flavors of Linux.
+clFFT library pre-compiled packages for recent versions of Microsoft Windows operating systems
+and several flavors of Linux are available.
 
-The downloadable binary packages are freely available from AMD at
-http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-math-libraries/
+The downloadable binary packages are freely available at
+https://github.com/clMathLibraries/clFFT/releases
 
 Once the appropriate package for the respective OS has finished downloading,
 uncompress the package using the native tools available on the platform in a
diff --git a/src/library/md5sum.c b/src/library/md5sum.c
new file mode 100644
index 0000000..d5f9417
--- /dev/null
+++ b/src/library/md5sum.c
@@ -0,0 +1,312 @@
+/*
+ * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
+ * MD5 Message-Digest Algorithm (RFC 1321).
+ *
+ * Homepage:
+ * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
+ *
+ * Author:
+ * Alexander Peslyak, better known as Solar Designer <solar at openwall.com>
+ *
+ * This software was written by Alexander Peslyak in 2001.  No copyright is
+ * claimed, and the software is hereby placed in the public domain.
+ * In case this attempt to disclaim copyright and place the software in the
+ * public domain is deemed null and void, then the software is
+ * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
+ * general public under the following terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted.
+ *
+ * There's ABSOLUTELY NO WARRANTY, express or implied.
+ *
+ * (This is a heavily cut-down "BSD license".)
+ *
+ * This differs from Colin Plumb's older public domain implementation in that
+ * no exactly 32-bit integer data type is required (any 32-bit or wider
+ * unsigned integer data type will do), there's no compile-time endianness
+ * configuration, and the function prototypes match OpenSSL's.  No code from
+ * Colin Plumb's implementation has been reused; this comment merely compares
+ * the properties of the two independent implementations.
+ *
+ * The primary goals of this implementation are portability and ease of use.
+ * It is meant to be fast, but not as fast as possible.  Some known
+ * optimizations are not included to reduce source code size and avoid
+ * compile-time configuration.
+ */
+ 
+#ifndef HAVE_OPENSSL
+ 
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+ 
+#include "md5sum.h"
+ 
+/*
+ * The basic MD5 functions.
+ *
+ * F and G are optimized compared to their RFC 1321 definitions for
+ * architectures that lack an AND-NOT instruction, just like in Colin Plumb's
+ * implementation.
+ */
+#define F(x, y, z)			((z) ^ ((x) & ((y) ^ (z))))
+#define G(x, y, z)			((y) ^ ((z) & ((x) ^ (y))))
+#define H(x, y, z)			(((x) ^ (y)) ^ (z))
+#define H2(x, y, z)			((x) ^ ((y) ^ (z)))
+#define I(x, y, z)			((y) ^ ((x) | ~(z)))
+ 
+/*
+ * The MD5 transformation for all four rounds.
+ */
+#define STEP(f, a, b, c, d, x, t, s) \
+	(a) += f((b), (c), (d)) + (x) + (t); \
+	(a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \
+	(a) += (b);
+ 
+/*
+ * SET reads 4 input bytes in little-endian byte order and stores them
+ * in a properly aligned word in host byte order.
+ *
+ * The check for little-endian architectures that tolerate unaligned
+ * memory accesses is just an optimization.  Nothing will break if it
+ * doesn't work.
+ */
+#if defined(__i386__) || defined(__x86_64__) || defined(__vax__)
+#define SET(n) \
+	(*(MD5_u32plus *)&ptr[(n) * 4])
+#define GET(n) \
+	SET(n)
+#else
+#define SET(n) \
+	(ctx->block[(n)] = \
+	(MD5_u32plus)ptr[(n) * 4] | \
+	((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \
+	((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \
+	((MD5_u32plus)ptr[(n) * 4 + 3] << 24))
+#define GET(n) \
+	(ctx->block[(n)])
+#endif
+ 
+/*
+ * This processes one or more 64-byte data blocks, but does NOT update
+ * the bit counters.  There are no alignment requirements.
+ */
+static const void *body(MD5_CTX *ctx, const void *data, unsigned long size)
+{
+	const unsigned char *ptr;
+	MD5_u32plus a, b, c, d;
+	MD5_u32plus saved_a, saved_b, saved_c, saved_d;
+ 
+	ptr = (const unsigned char *)data;
+ 
+	a = ctx->a;
+	b = ctx->b;
+	c = ctx->c;
+	d = ctx->d;
+ 
+	do {
+		saved_a = a;
+		saved_b = b;
+		saved_c = c;
+		saved_d = d;
+ 
+/* Round 1 */
+		STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7)
+		STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12)
+		STEP(F, c, d, a, b, SET(2), 0x242070db, 17)
+		STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22)
+		STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7)
+		STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12)
+		STEP(F, c, d, a, b, SET(6), 0xa8304613, 17)
+		STEP(F, b, c, d, a, SET(7), 0xfd469501, 22)
+		STEP(F, a, b, c, d, SET(8), 0x698098d8, 7)
+		STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12)
+		STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17)
+		STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22)
+		STEP(F, a, b, c, d, SET(12), 0x6b901122, 7)
+		STEP(F, d, a, b, c, SET(13), 0xfd987193, 12)
+		STEP(F, c, d, a, b, SET(14), 0xa679438e, 17)
+		STEP(F, b, c, d, a, SET(15), 0x49b40821, 22)
+ 
+/* Round 2 */
+		STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5)
+		STEP(G, d, a, b, c, GET(6), 0xc040b340, 9)
+		STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14)
+		STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20)
+		STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5)
+		STEP(G, d, a, b, c, GET(10), 0x02441453, 9)
+		STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14)
+		STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20)
+		STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5)
+		STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9)
+		STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14)
+		STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20)
+		STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5)
+		STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9)
+		STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14)
+		STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20)
+ 
+/* Round 3 */
+		STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4)
+		STEP(H2, d, a, b, c, GET(8), 0x8771f681, 11)
+		STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16)
+		STEP(H2, b, c, d, a, GET(14), 0xfde5380c, 23)
+		STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4)
+		STEP(H2, d, a, b, c, GET(4), 0x4bdecfa9, 11)
+		STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16)
+		STEP(H2, b, c, d, a, GET(10), 0xbebfbc70, 23)
+		STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4)
+		STEP(H2, d, a, b, c, GET(0), 0xeaa127fa, 11)
+		STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16)
+		STEP(H2, b, c, d, a, GET(6), 0x04881d05, 23)
+		STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4)
+		STEP(H2, d, a, b, c, GET(12), 0xe6db99e5, 11)
+		STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16)
+		STEP(H2, b, c, d, a, GET(2), 0xc4ac5665, 23)
+ 
+/* Round 4 */
+		STEP(I, a, b, c, d, GET(0), 0xf4292244, 6)
+		STEP(I, d, a, b, c, GET(7), 0x432aff97, 10)
+		STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15)
+		STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21)
+		STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6)
+		STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10)
+		STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15)
+		STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21)
+		STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6)
+		STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10)
+		STEP(I, c, d, a, b, GET(6), 0xa3014314, 15)
+		STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21)
+		STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6)
+		STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10)
+		STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15)
+		STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21)
+ 
+		a += saved_a;
+		b += saved_b;
+		c += saved_c;
+		d += saved_d;
+ 
+		ptr += 64;
+	} while (size -= 64);
+ 
+	ctx->a = a;
+	ctx->b = b;
+	ctx->c = c;
+	ctx->d = d;
+ 
+	return ptr;
+}
+ 
+void MD5_Init(MD5_CTX *ctx)
+{
+	ctx->a = 0x67452301;
+	ctx->b = 0xefcdab89;
+	ctx->c = 0x98badcfe;
+	ctx->d = 0x10325476;
+ 
+	ctx->lo = 0;
+	ctx->hi = 0;
+}
+ 
+void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size)
+{
+	MD5_u32plus saved_lo;
+	unsigned long used, available;
+ 
+	saved_lo = ctx->lo;
+	if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo)
+		ctx->hi++;
+	ctx->hi += size >> 29;
+ 
+	used = saved_lo & 0x3f;
+ 
+	if (used) {
+		available = 64 - used;
+ 
+		if (size < available) {
+			memcpy(&ctx->buffer[used], data, size);
+			return;
+		}
+ 
+		memcpy(&ctx->buffer[used], data, available);
+		data = (const unsigned char *)data + available;
+		size -= available;
+		body(ctx, ctx->buffer, 64);
+	}
+ 
+	if (size >= 64) {
+		data = body(ctx, data, size & ~(unsigned long)0x3f);
+		size &= 0x3f;
+	}
+ 
+	memcpy(ctx->buffer, data, size);
+}
+ 
+void MD5_Final(unsigned char *result, MD5_CTX *ctx)
+{
+	unsigned long used, available;
+ 
+	used = ctx->lo & 0x3f;
+ 
+	ctx->buffer[used++] = 0x80;
+ 
+	available = 64 - used;
+ 
+	if (available < 8) {
+		memset(&ctx->buffer[used], 0, available);
+		body(ctx, ctx->buffer, 64);
+		used = 0;
+		available = 64;
+	}
+ 
+	memset(&ctx->buffer[used], 0, available - 8);
+ 
+	ctx->lo <<= 3;
+	ctx->buffer[56] = ctx->lo;
+	ctx->buffer[57] = ctx->lo >> 8;
+	ctx->buffer[58] = ctx->lo >> 16;
+	ctx->buffer[59] = ctx->lo >> 24;
+	ctx->buffer[60] = ctx->hi;
+	ctx->buffer[61] = ctx->hi >> 8;
+	ctx->buffer[62] = ctx->hi >> 16;
+	ctx->buffer[63] = ctx->hi >> 24;
+ 
+	body(ctx, ctx->buffer, 64);
+ 
+	result[0] = ctx->a;
+	result[1] = ctx->a >> 8;
+	result[2] = ctx->a >> 16;
+	result[3] = ctx->a >> 24;
+	result[4] = ctx->b;
+	result[5] = ctx->b >> 8;
+	result[6] = ctx->b >> 16;
+	result[7] = ctx->b >> 24;
+	result[8] = ctx->c;
+	result[9] = ctx->c >> 8;
+	result[10] = ctx->c >> 16;
+	result[11] = ctx->c >> 24;
+	result[12] = ctx->d;
+	result[13] = ctx->d >> 8;
+	result[14] = ctx->d >> 16;
+	result[15] = ctx->d >> 24;
+ 
+	memset(ctx, 0, sizeof(*ctx));
+}
+ 
+#endif
+
+void md5sum(const void * data, unsigned long size, char * md5string)
+{
+    unsigned char digest[16];
+    int i;
+    MD5_CTX context;
+    MD5_Init(&context);
+    MD5_Update(&context, data, size);
+    MD5_Final(digest, &context);
+
+    for(i = 0; i < 16; ++i)
+        sprintf(&md5string[i*2], "%02x", (unsigned int)digest[i]);
+}
+
diff --git a/src/library/md5sum.h b/src/library/md5sum.h
new file mode 100644
index 0000000..2f3d739
--- /dev/null
+++ b/src/library/md5sum.h
@@ -0,0 +1,50 @@
+/*
+ * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
+ * MD5 Message-Digest Algorithm (RFC 1321).
+ *
+ * Homepage:
+ * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
+ *
+ * Author:
+ * Alexander Peslyak, better known as Solar Designer <solar at openwall.com>
+ *
+ * This software was written by Alexander Peslyak in 2001.  No copyright is
+ * claimed, and the software is hereby placed in the public domain.
+ * In case this attempt to disclaim copyright and place the software in the
+ * public domain is deemed null and void, then the software is
+ * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
+ * general public under the following terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted.
+ *
+ * There's ABSOLUTELY NO WARRANTY, express or implied.
+ *
+ * See md5.c for more information.
+ */
+
+#ifdef HAVE_OPENSSL
+#include <openssl/md5.h>
+#elif !defined(_MD5_H)
+#define _MD5_H
+
+/* Any 32-bit or wider unsigned integer data type will do */
+typedef unsigned int MD5_u32plus;
+
+typedef struct {
+	MD5_u32plus lo, hi;
+	MD5_u32plus a, b, c, d;
+	unsigned char buffer[64];
+	MD5_u32plus block[16];
+} MD5_CTX;
+
+extern void MD5_Init(MD5_CTX *ctx);
+extern void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size);
+extern void MD5_Final(unsigned char *result, MD5_CTX *ctx);
+
+void md5sum (const void * data, unsigned long size, char * md5sum);
+
+
+
+#endif
+
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 60389ad..1cfe054 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -26,6 +26,8 @@
 #include "plan.h"
 #include "generator.stockham.h"
 #include "../include/convenienceFunctions.h"
+#include "action.h"
+#include "fft_binary_lookup.h"
 
 using std::vector;
 
@@ -33,16 +35,6 @@ const std::string beginning_of_binary( "<[
 const std::string end_of_binary( "<[�_I_may_be_a_sorry_case,_but_I_don't_write_jokes_in_base_13_�]>" );
 const std::string end_of_file( "<[�_You're_off_the_edge_of_the_map,_mate._Here_there_be_monsters_�]>" );
 
-//	This operator is used to sort FFTKernelGenKeyParams structs inside of a std::map
-bool operator<( const FFTKernelGenKeyParams& lhs, const FFTKernelGenKeyParams& rhs)
-{
-	int ret = ::memcmp( &lhs, &rhs, sizeof( FFTKernelGenKeyParams ) );
-
-	if( ret < 0 )
-		return true;
-
-	return false;
-}
 
 // Returns CLFFT_SUCCESS if the fp64 is present, CLFFT_DEVICE_NO_DOUBLE if it is not found.  
 clfftStatus checkDevExt( std::string ext, const cl_device_id &device )
@@ -213,128 +205,84 @@ clfftStatus	clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context contex
 			break;
 	}
 
+	fftPlan->plHandle = *plHandle;
+
 	return	CLFFT_SUCCESS;
 }
 
 
-
-// **************** TODO TODO TODO ***********************
-// Making CompileKernels function take in command queue parameter so we can build for 1 particular device only;
-// this may not be desirable for persistent plans, where we may have to compile for all devices in the context;
-// make changes appropriately before enabling persistent plans and then remove this comment
-
-//	Compile the kernels that this plan uses, and store into the plan
-clfftStatus CompileKernels( const cl_command_queue commQueueFFT, const clfftPlanHandle plHandle, const clfftGenerators gen, FFTPlan* fftPlan )
+std::string getKernelName(const clfftGenerators gen, const clfftPlanHandle plHandle, bool withPlHandle)
 {
-	cl_int status = 0;
-	size_t deviceListSize = 0;
-
-	FFTRepo& fftRepo	= FFTRepo::getInstance( );
-
-
-	// create a cl program executable for the device associated with command queue
-	// Get the device
-	cl_device_id &q_device = fftPlan->bakeDevice;
-	//clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &q_device, NULL);
-
-	FFTKernelGenKeyParams fftParams;
-	OPENCL_V( fftPlan->GetKernelGenKey( fftParams ), _T("GetKernelGenKey() failed!") );
+    //	Logic to define a sensible filename
+    const std::string kernelPrefix( "clfft.kernel." );
+    std::string generatorName;
+    std::stringstream kernelPath;
+
+    switch( gen )
+    {
+    case Stockham:			generatorName = "Stockham"; break;
+	case Transpose_GCN:		generatorName = "Transpose"; break;
+    case Transpose_VLIW:	generatorName = "Transpose"; break;
+	case Copy:				generatorName = "Copy"; break;
+    }
 
-	cl_program program;
-	if( fftRepo.getclProgram( gen, fftParams, program, q_device, fftPlan->context ) == CLFFT_INVALID_PROGRAM )
-	{
+    kernelPath << kernelPrefix << generatorName ;
 
-		std::string programCode;
-		OPENCL_V( fftRepo.getProgramCode( gen, fftParams, programCode, q_device, fftPlan->context  ), _T( "fftRepo.getProgramCode failed." ) );
+    if (withPlHandle)
+        kernelPath << plHandle;
 
-		const char* source = programCode.c_str();
-		program = clCreateProgramWithSource( fftPlan->context, 1, &source, NULL, &status );
-		OPENCL_V( status, _T( "clCreateProgramWithSource failed." ) );
+    kernelPath << ".cl";
 
-		// create a cl program executable for the device associated with command queue
+    return kernelPath.str();
+}
 
-#if defined(DEBUGGING)
-		status = clBuildProgram( program, 1, &q_device, "-g -cl-opt-disable", NULL, NULL); // good for debugging kernels
 
-// if you have trouble creating smbols that GDB can pick up to set a breakpoint after kernels are loaded into memory
-// this can be used to stop execution to allow you to set a breakpoint in a kernel after kernel symbols are in memory.
-#ifdef DEBUG_BREAK_GDB
-		__debugbreak();
-#endif
-#else
-		status = clBuildProgram( program, 1, &q_device, NULL, NULL, NULL);
-#endif
-		if( status != CL_SUCCESS )
+clfftStatus selectAction(FFTPlan * fftPlan, FFTAction *& action, cl_command_queue* commQueueFFT)
+{
+    // set the action we are baking a leaf
+    clfftStatus err;
+    switch (fftPlan->gen)
+    {
+    case Stockham:  
 		{
-			if( status == CL_BUILD_PROGRAM_FAILURE )
-			{
-				size_t buildLogSize = 0;
-				OPENCL_V( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
-						_T( "clGetProgramBuildInfo failed" ) );
-
-				vector< char > buildLog( buildLogSize );
-				::memset( &buildLog[ 0 ], 0x0, buildLogSize );
-
-				OPENCL_V( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
-						_T( "clGetProgramBuildInfo failed" ) );
-
-				std::cerr << "\n\t\t\tBUILD LOG\n";
-				std::cerr << "************************************************\n";
-				std::cerr << &buildLog[ 0 ] << std::endl;
-				std::cerr << "************************************************\n";
-			}
-
-			OPENCL_V( status, _T( "clBuildProgram failed" ) );
+			// Instantiate the default stockham generator
+			action = new FFTGeneratedStockhamAction (fftPlan->plHandle, fftPlan, *commQueueFFT, err);
+			OPENCL_V( err, "FFTGeneratedStockhamAction() failed");
 		}
+		break;
 
-		fftRepo.setclProgram( gen, fftParams, program, q_device, fftPlan->context );
-
-		// For real transforms we comppile either forward or backward kernel
-		bool r2c_transform = (fftParams.fft_inputLayout == CLFFT_REAL);
-		bool c2r_transform = (fftParams.fft_outputLayout == CLFFT_REAL);
-		bool h2c = (gen == Copy) && ((fftParams.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) || (fftParams.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED));
-		bool c2h = (gen == Copy) && ((fftParams.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) || (fftParams.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED));
-		bool generalCopy = !(h2c || c2h) && (gen == Copy);
-		bool complexTransform = ( !(r2c_transform || c2r_transform) && (gen != Copy) );
-
-		// get a kernel object handle for a kernel with the given name
-		cl_kernel kernel;
-		if( complexTransform || r2c_transform || c2h || generalCopy)
+    case Transpose_GCN: 
 		{
-			if( fftRepo.getclKernel( program, CLFFT_FORWARD, kernel ) == CLFFT_INVALID_KERNEL )
-			{
-				std::string entryPoint;
-				OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_FORWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
-
-				kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
-				OPENCL_V( status, _T( "clCreateKernel failed" ) );
-
-				fftRepo.setclKernel( program, CLFFT_FORWARD, kernel );
-			}
+			action = new FFTGeneratedTransposeGCNAction(fftPlan->plHandle, fftPlan, *commQueueFFT, err);
+			OPENCL_V( err, "FFTGeneratedTransposeGCNAction() failed");
 		}
+		break;
 
-		if( complexTransform || c2r_transform || h2c || generalCopy)
+    case Transpose_VLIW: 
 		{
-			if( fftRepo.getclKernel( program, CLFFT_BACKWARD, kernel ) == CLFFT_INVALID_KERNEL )
-			{
-				std::string entryPoint;
-				OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_BACKWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
-
-				kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
-				OPENCL_V( status, _T( "clCreateKernel failed" ) );
+			action = new FFTGeneratedTransposeVLIWAction(fftPlan->plHandle, fftPlan, *commQueueFFT, err);
+			OPENCL_V( err, "FFTGeneratedTransposeVLIWAction() failed");
+		}
+		break;
 
-				fftRepo.setclKernel( program, CLFFT_BACKWARD, kernel );
-			}
+    case Copy:
+		{
+			action = new FFTGeneratedCopyAction     (fftPlan->plHandle, fftPlan, *commQueueFFT, err);
+			OPENCL_V( err, "FFTGeneratedCopyAction() failed");
 		}
-	}
+		break;
 
+    default:
+		{
+			assert(false);
+			OPENCL_V( CLFFT_NOTIMPLEMENTED, "selectAction() failed");
+		}
+    }
 
-	return	CLFFT_SUCCESS;
+	return CLFFT_SUCCESS;
 }
 
 
-
-
 inline size_t PrecisionWidth(clfftPrecision pr)
 {
 	switch(pr)
@@ -345,6 +293,8 @@ inline size_t PrecisionWidth(clfftPrecision pr)
 	}
 }
 
+
+
 clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_command_queue* commQueueFFT,
 							void (CL_CALLBACK *pfn_notify)( clfftPlanHandle plHandle, void *user_data ), void* user_data )
 {
@@ -427,8 +377,9 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 
 	if(fftPlan->gen == Copy)
 	{
-		OPENCL_V( fftPlan->GenerateKernel( fftRepo, *commQueueFFT ), _T( "GenerateKernel() failed" ) );
-		OPENCL_V( CompileKernels( *commQueueFFT, plHandle, fftPlan->gen, fftPlan ), _T( "CompileKernels() failed" ) );
+        clfftStatus err;
+        fftPlan->action = new FFTGeneratedCopyAction(plHandle, fftPlan, *commQueueFFT, err);
+        OPENCL_V( err, _T( "FFTGeneratedCopyAction() failed" ) );
 		fftPlan->baked		= true;
 		return	CLFFT_SUCCESS;
 	}
@@ -1559,8 +1510,9 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				//break;
 				if (fftPlan->transflag) //Transpose for 2D
 				{
-					OPENCL_V( fftPlan->GenerateKernel( fftRepo, *commQueueFFT ), _T( "GenerateTransposeProgram() failed" ) );
-					OPENCL_V( CompileKernels( *commQueueFFT, plHandle, fftPlan->gen, fftPlan ), _T( "CompileKernels() failed" ) );
+                    clfftStatus err;
+                    fftPlan->action = new FFTGeneratedTransposeVLIWAction(plHandle, fftPlan, *commQueueFFT, err);
+                    OPENCL_V( err, "FFTGeneratedTransposeVLIWAction failed");
 
 					fftPlan->baked		= true;
 					return	CLFFT_SUCCESS;
@@ -2484,11 +2436,8 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 		}
 	}
 
-	//	For the radices that we have factored, we need to load/compile and build the appropriate OpenCL kernels
-	OPENCL_V( fftPlan->GenerateKernel( fftRepo, *commQueueFFT ), _T( "GenerateKernel() failed" ) );
-
-	//	For the radices that we have factored, we need to load/compile and build the appropriate OpenCL kernels
-	OPENCL_V( CompileKernels( *commQueueFFT, plHandle, fftPlan->gen, fftPlan ), _T( "CompileKernels() failed" ) );
+	
+	clfftStatus err = selectAction(fftPlan, fftPlan->action, commQueueFFT);
 
 	//	Allocate resources
 	OPENCL_V( fftPlan->AllocateBuffers (), _T("AllocateBuffers() failed"));
@@ -2560,428 +2509,6 @@ clfftStatus FFTPlan::ConstructAndEnqueueConstantBuffers( cl_command_queue* commQ
 	return CLFFT_SUCCESS;
 }
 
-//TODO caching kernel binaries for later reload
-#if 0
-typedef std::pair<std::string, clfftPlanHandle> plan_tree_node_t;
-typedef std::vector< std::pair<std::string, clfftPlanHandle> > plan_tree_t;
-
-void make_plan_tree( plan_tree_t & tree, std::string name, clfftPlanHandle handle )
-{
-	tree.push_back( plan_tree_node_t(name, handle) );
-
-	FFTPlan* plan = NULL;
-	FFTRepo& repo = FFTRepo::getInstance();
-	lockRAII* lock = NULL;
-	clfftStatus status = repo.getPlan( handle, plan, lock );
-	if( status != CLFFT_SUCCESS )
-	{
-		throw( "make_plan_tree failure: repo.getPlan" );
-	}
-
-	if( plan->planX )
-	{
-		std::string subplan(name);
-		subplan += "X";
-		make_plan_tree(tree, subplan, plan->planX );
-	}
-
-	if( plan->planY )
-	{
-		std::string subplan(name);
-		subplan += "Y";
-		make_plan_tree(tree, subplan, plan->planY );
-	}
-
-	if( plan->planZ )
-	{
-		std::string subplan(name);
-		subplan += "Z";
-		make_plan_tree(tree, subplan, plan->planZ );
-	}
-}
-
-clfftStatus clfftWritePlanToDisk( clfftPlanHandle plan_handle, const char* filename )
-{
-	plan_tree_t plan_tree;
-	make_plan_tree( plan_tree, "plan", plan_handle );
-
-	std::ofstream planfile;
-	planfile.open(filename, std::ios::binary);
-
-	while( !plan_tree.empty() )
-	{
-		plan_tree_node_t node( *plan_tree.begin() );
-		plan_tree.erase( plan_tree.begin() );
-
-		FFTPlan* plan = NULL;
-		FFTRepo& repo = FFTRepo::getInstance();
-		lockRAII* lock = NULL;
-		OPENCL_V(repo.getPlan( node.second, plan, lock ), _T("getPlan failure"));
-
-		// print the name of the node (plan, planX, planXX, planY, plan XY, etc)
-		planfile << node.first << " ";
-
-		planfile << "dimensions " << plan->dim << " " << plan->length.size();
-		// dimensions must be listed first because clfftReadPlanFromDisk
-		// will need to use dimensions for reading in strides and such
-
-		for( int i = 0; i < plan->length.size(); ++i )
-		{
-			planfile << " " << plan->length[i];
-		}
-
-		planfile << " batch " << plan->batchsize;
-
-		planfile << " instride " << plan->inStride.size();
-		for( int i = 0; i < plan->inStride.size(); ++i )
-		{
-			planfile << " " << plan->inStride[i];
-		}
-
-		planfile << " outstride " << plan->outStride.size();
-		for( int i = 0; i < plan->outStride.size(); ++i )
-		{
-			planfile << " " << plan->outStride[i];
-		}
-
-		planfile << " in-out-distances " << plan->iDist << " " << plan->oDist;
-		planfile << " in-out-layouts " << plan->inputLayout << " " << plan->outputLayout;
-		planfile << " resultlocation " << plan->placeness;
-		planfile << " precision " << plan->precision;
-		planfile << " forwardscale " << float_as_hex<double>(plan->forwardScale);
-		planfile << " backwardscale " << float_as_hex<double>(plan->backwardScale);
-		// we need to stash scales as hex so that we don't have any roundoff error
-		// clfftReadPlanFromDisk will read the hex back in as float
-
-		planfile << " gen " << plan->gen;
-		planfile << " tmpBufSize " << plan->tmpBufSize;
-		planfile << " large1D " << plan->large1D;
-		planfile << " large2D " << plan->large2D;
-
-		if( plan->baked == true )
-		{
-			planfile << " number-of-devices " << plan->number_of_devices;
-
-			if( plan->number_of_devices > 0 )
-			{
-				planfile << " binary-sizes";
-				for( int i = 0; i < plan->number_of_devices; i++ )
-				{
-					planfile << " " << *(plan->binary_sizes.get() + i);
-				}
-
-				planfile << " binaries ";
-				for( int i = 0; i < plan->number_of_devices; i++ )
-				{
-					planfile << beginning_of_binary;
-					planfile.write( plan->binaries[i].get(), plan->binary_sizes[i] );
-					planfile << end_of_binary;
-				}
-			}
-			else
-			{
-				planfile << " ";
-			}
-		}
-	}
-
-	planfile << " " << end_of_file;
-	planfile.close();
-	return CLFFT_SUCCESS;
-}
-
-void FFTPlan::ResetBinarySizes()
-{
-	binary_sizes.reset(new size_t[number_of_devices]);
-}
-
-void FFTPlan::ResetBinaries()
-{
-	binaries.clear();
-	for( int i = 0; i < number_of_devices; i++ )
-	{
-		binaries.push_back( std::unique_ptr<char[]>(new char[binary_sizes[i]] ) );
-	}
-}
-
-std::string pop_next_word( std::string & str )
-{
-	size_t next_space = str.find_first_of(' ');
-
-	std::string next_word( str.substr( 0, next_space ) );
-	str.erase( 0, next_space+1 ); // we need the extra +1 to munch off the space
-
-	return next_word;
-}
-
-int my_string_to_int( std::string str )
-{
-	int i;
-	std::stringstream string_to_int( str );
-	string_to_int >> i;
-	return i;
-}
-
-bool start_of_a_plan( std::string word )
-{
-	if( word.substr(0,4) == "plan" )
-		return true;
-	else
-		return false;
-}
-
-clfftStatus clfftReadPlanFromDisk( clfftPlanHandle plan_handle, const char* filename )
-{
-	plan_tree_t tree;
-
-	FFTPlan* plan = NULL;
-	FFTRepo& repo = FFTRepo::getInstance();
-	lockRAII* lock = NULL;
-	OPENCL_V(repo.getPlan( plan_handle, plan, lock ), _T("getPlan failure"));
-
-	std::ifstream planfile;
-	planfile.open(filename, std::ios::in | std::ios::binary);
-
-	unsigned int dimensions = 0;
-	std::string next_word;
-
-	while( planfile >> next_word )
-	{
-		if( start_of_a_plan( next_word ) )
-		{
-			if( next_word.length() > 4 )
-			// if true, this is not a base plan
-			{
-				clfftDim temp_dimension = CLFFT_1D;
-				size_t temp_lengths[3] = {1,1,1};
-
-				// let's create the plan to represent the child plan
-				clfftPlanHandle child_plan;
-				OPENCL_V(clfftCreateDefaultPlan( &child_plan, plan->context, temp_dimension, temp_lengths ),
-					"clfftReadPlanFromDisk(): error calling clfftCreateDefaultPlan()");
-
-				tree.push_back( plan_tree_node_t( next_word, child_plan ) );
-
-				// we need to update the planX, Y, or Z pointer to point at the child plan
-				char child_plan_name = next_word.rbegin()[0]; // this tells us if this is planX, Y, or Z
-				next_word.erase( next_word.end()-1 ); // this tells us the parent plan
-				std::string parent_plan_name = next_word;
-
-				clfftPlanHandle parent_plan = 0;
-
-				for( int i = 0; i < tree.size(); i++ )
-				{
-					if( tree[i].first == parent_plan_name )
-					{
-						parent_plan = tree[i].second;
-					}
-				}
-
-				plan = NULL;
-				OPENCL_V(repo.getPlan( parent_plan, plan, lock ), _T("getPlan failure"));
-
-				if( child_plan_name == 'X' )
-					plan->planX = child_plan;
-				else if( child_plan_name == 'Y' )
-					plan->planY = child_plan;
-				else if( child_plan_name == 'Z' )
-					plan->planZ = child_plan;
-				else
-					OPENCL_V(CLFFT_INVALID_PLAN, "clfftReadPlanFromDisk(): could not identify child plan" );
-
-				// our child plan is now the active plan
-				plan = NULL;
-				OPENCL_V(repo.getPlan( child_plan, plan, lock ), _T("getPlan failure"));
-				plan_handle = child_plan;
-			}
-			else
-			// if this is a base plan, we don't need to do anything fancy.
-			// just add the node to the tree
-			{
-				tree.push_back( plan_tree_node_t( next_word, plan_handle ) );
-			}
-
-			plan->readFromFile = true;
-		}
-		else if( next_word == "dimensions" )
-		{
-			size_t lengths[3];
-
-			// read number of dimensions
-			planfile >> dimensions;
-
-			// number of length values that follow (subplans have some really strange things going on,
-			// so this might not always match the dimension of the transform)
-			size_t number_of_lengths = 0;
-			planfile >> number_of_lengths;
-
-			OPENCL_V( clfftSetPlanDim(plan_handle, static_cast<clfftDim>(dimensions)), _T("clfftReadPlanFromDisk: clfftSetPlanDim") );
-
-			for( unsigned int i = 0; i < number_of_lengths; ++i )
-			{
-				planfile >> lengths[i]; // read one dimension
-
-				// We have to explicitly set the lengths instead of using clfftSetPlanLength here.
-				// Because the number of values to add might be greater than the number of dimensions in plan->dimension,
-				// we don't want to miss out on any super awesome numbers getting added to plan->length with clfftSetPlanLength
-				if( i >= plan->length.size() ) plan->length.push_back(1);
-				plan->length[i] = lengths[i];
-			}
-		}
-		else if( next_word == "batch" )
-		{
-			unsigned int batch;
-			planfile >> batch;
-
-			OPENCL_V( clfftSetPlanBatchSize(plan_handle, batch), _T("clfftReadPlanFromDisk: clfftSetPlanBatchSize") );
-		}
-		else if( next_word == "instride" )
-		{
-			size_t strides[3];
-
-			// number of stride values that follow (subplans have some really strange things going on,
-			// so this might not always match the dimension of the transform)
-			size_t number_of_strides = 0;
-			planfile >> number_of_strides;
-
-			for( unsigned int i = 0; i < number_of_strides; ++i )
-			{
-				planfile >> strides[i]; // read one dimension
-
-				// We have to explicitly set inStride instead of using clfftSetPlanInStride here.
-				// Because the number of values to add might be greater than the number of dimensions in plan->dimension,
-				// we don't want to miss out on any super awesome numbers getting added to plan->inStride with clfftSetPlanInStride
-				if( i >= plan->inStride.size() ) plan->inStride.push_back(1);
-				plan->inStride[i] = strides[i];
-			}
-		}
-		else if( next_word == "outstride" )
-		{
-			size_t strides[3];
-
-			// number of stride values that follow (subplans have some really strange things going on,
-			// so this might not always match the dimension of the transform)
-			size_t number_of_strides = 0;
-			planfile >> number_of_strides;
-
-			for( unsigned int i = 0; i < number_of_strides; ++i )
-			{
-				planfile >> strides[i]; // read one dimension
-
-				// We have to explicitly set outStride instead of using clfftSetPlanOutStride here.
-				// Because the number of values to add might be greater than the number of dimensions in plan->dimension,
-				// we don't want to miss out on any super awesome numbers getting added to plan->outStride with clfftSetPlanOutStride
-				if( i >= plan->outStride.size() ) plan->outStride.push_back(1);
-				plan->outStride[i] = strides[i];
-			}
-		}
-		else if( next_word == "in-out-distances" )
-		{
-			size_t indistance, outdistance;
-			planfile >> indistance >> outdistance;
-
-			OPENCL_V( clfftSetPlanDistance( plan_handle, indistance, outdistance ), _T("clfftReadPlanFromDisk: clfftSetPlanDistance" ) );
-		}
-		else if( next_word == "in-out-layouts" )
-		{
-			size_t inlayout, outlayout;
-			planfile >> inlayout >> outlayout;
-
-			OPENCL_V( clfftSetLayout( plan_handle, static_cast<clfftLayout>(inlayout), static_cast<clfftLayout>(outlayout) ), _T("clfftReadPlanFromDisk: clfftSetLayout") );
-		}
-		else if( next_word == "resultlocation" )
-		{
-			size_t location;
-			planfile >> location;
-
-			OPENCL_V( clfftSetResultLocation( plan_handle, static_cast<clfftResultLocation>(location) ), _T("clfftReadPlanFromDisk: clfftSetResultLocation") );
-		}
-		else if( next_word == "precision" )
-		{
-			size_t precision;
-			planfile >> precision;
-
-			OPENCL_V( clfftSetPlanPrecision( plan_handle, static_cast<clfftPrecision>(precision) ), _T("clfftReadPlanFromDisk: clfftSetPlanPrecision") );
-		}
-		else if( next_word == "forwardscale" || next_word == "backwardscale" )
-		{
-			size_t scale;
-			planfile >> scale;
-
-			if( next_word == "forwardscale" )
-			{
-				OPENCL_V( clfftSetPlanScale( plan_handle, CLFFT_FORWARD, hex_as_float<float>((unsigned int)scale) ), _T("clfftReadPlanFromDisk: clfftSetPlanScale") );
-			}
-			else
-			{
-				OPENCL_V( clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, hex_as_float<float>((unsigned int)scale) ), _T("clfftReadPlanFromDisk: clfftSetPlanScale") );
-			}
-		}
-		else if( next_word == "gen" )
-		{
-			int gen_read;
-			planfile >> gen_read;
-			plan->gen = static_cast<clfftGenerators>(gen_read);
-		}
-		else if( next_word == "tmpBufSize" )
-		{
-			planfile >> plan->tmpBufSize;
-		}
-		else if( next_word == "large1D" )
-		{
-			planfile >> plan->large1D;
-		}
-		else if( next_word == "large2D" )
-		{
-			planfile >> plan->large2D;
-		}
-		else if( next_word == "number-of-devices" )
-		{
-			planfile >> plan->number_of_devices;
-		}
-		else if( next_word == "binary-sizes" )
-		{
-			plan->ResetBinarySizes();
-			for( int i = 0; i < plan->number_of_devices; i++ )
-			{
-				planfile >> plan->binary_sizes[i];
-			}
-		}
-		else if( next_word == "binaries" )
-		{
-			plan->ResetBinaries();
-
-			size_t number_of_devices = plan->number_of_devices;
-
-			while( static_cast<char>(planfile.peek()) == ' ' )
-				planfile.ignore();
-
-			// consume the beginning of binary message. the binary will begin with the character immediately following
-			std::unique_ptr<char[]> beginning_message( new char[beginning_of_binary.size()] );
-			planfile.read( beginning_message.get(), beginning_of_binary.size() );
-
-			for( int i = 0; i < plan->number_of_devices; i++ )
-			{
-				planfile.read( plan->binaries[i].get(), plan->binary_sizes[i] );
-			}
-
-			std::unique_ptr<char[]> end_message( new char[end_of_binary.size()] );
-			planfile.read( end_message.get(), end_of_binary.size() );
-		}
-		else if( next_word == end_of_file )
-		{
-			// we're at the end of the file
-		}
-		else
-		{
-			std::cout << next_word << std::endl;
-			OPENCL_V( CLFFT_INVALID_ARG_VALUE, _T("clfftReadPlanFromDisk: unrecognized parameter") );
-		}
-	}
-
-	return CLFFT_SUCCESS;
-}
-#endif
 
 clfftStatus	clfftDestroyPlan( clfftPlanHandle* plHandle )
 {
@@ -3179,47 +2706,13 @@ clfftStatus FFTPlan::ReleaseBuffers ()
 	return	result;
 }
 
-clfftStatus  FFTPlan::GetWorkSizes (std::vector<size_t> & globalws, std::vector<size_t> & localws) const
-{
-	switch(gen)
-	{
-    case Stockham:		return GetWorkSizesPvt<Stockham>( globalws, localws );
-    case Transpose_VLIW:		return GetWorkSizesPvt<Transpose_VLIW>( globalws, localws );
-    case Transpose_GCN:		return GetWorkSizesPvt<Transpose_GCN>( globalws, localws );
-    case Copy:			return GetWorkSizesPvt<Copy>( globalws, localws );
-    default:			assert( false ); return CLFFT_NOTIMPLEMENTED;
-	}
-}
-
-clfftStatus  FFTPlan::GetKernelGenKey (FFTKernelGenKeyParams & params) const
-{
-	switch(gen)
-	{
-	case Stockham:		return GetKernelGenKeyPvt<Stockham>(params);
-	case Transpose_VLIW:		return GetKernelGenKeyPvt<Transpose_VLIW>(params);
-    case Transpose_GCN:		return GetKernelGenKeyPvt<Transpose_GCN>( params );
-    case Copy:			return GetKernelGenKeyPvt<Copy>( params );
-	default:			assert(false); return CLFFT_NOTIMPLEMENTED;
-	}
-}
 
-clfftStatus  FFTPlan::GenerateKernel (FFTRepo & fftRepo, const cl_command_queue commQueueFFT) const
-{
-	switch(gen)
-	{
-	case Stockham:		return GenerateKernelPvt<Stockham>(fftRepo, commQueueFFT);
-	case Transpose_VLIW:		return GenerateKernelPvt<Transpose_VLIW>(fftRepo, commQueueFFT);
-    case Transpose_GCN:		return GenerateKernelPvt<Transpose_GCN>( fftRepo, commQueueFFT );
-    case Copy:			return GenerateKernelPvt<Copy>( fftRepo, commQueueFFT );
-	default:			assert(false); return CLFFT_NOTIMPLEMENTED;
-	}
-}
 
 clfftStatus FFTPlan::GetMax1DLength (size_t *longest ) const
 {
 	switch(gen)
 	{
-	case Stockham:		return GetMax1DLengthPvt<Stockham>(longest);
+	case Stockham:		return GetMax1DLengthStockham(longest);
 	//No restriction for Transpose_VLIW kernel
 	case Transpose_VLIW:     *longest = 4096; return CLFFT_SUCCESS;
     case Transpose_GCN:     *longest = 4096; return CLFFT_SUCCESS;
diff --git a/src/library/plan.h b/src/library/plan.h
index 56f5df4..787755c 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -23,6 +23,8 @@
 #include "lock.h"
 #include "generator.h"
 
+std::string getKernelName(const clfftGenerators gen, const clfftPlanHandle plHandle, bool withPlHandle);
+
 namespace ARBITRARY {
 	// TODO:  These arbitrary parameters should be tuned for the type of GPU
 	//	being used.  These values are probably OK for Radeon 58xx and 68xx.
@@ -168,6 +170,130 @@ struct FFTKernelGenKeyParams {
 //	Sorting operator for struct FFTKernelGenKeyParams, such that it can be used in a map
 bool operator<( const FFTKernelGenKeyParams& lhs, const FFTKernelGenKeyParams& rhs);
 
+class	FFTPlan;
+class   FFTRepo;
+
+// Action ID
+enum FFTActionImplID
+{
+    FFT_DEFAULT_STOCKHAM_ACTION,
+    FFT_DEFAULT_TRANSPOSE_ACTION,
+    FFT_DEFAULT_COPY_ACTION,
+    FFT_STATIC_STOCKHAM_ACTION
+};
+
+// 
+// FFTKernelSignatureHeader
+// 
+// This structure is a wrapper for the FFTKernelSignature.  
+// It stores the signature size and the action ID. This ensure that every 
+// FFTKernelSignature (even with an empty DATA) is unique
+// 
+// This class is used as the return type of FFTAction::getSignatureData()
+// 
+struct FFTKernelSignatureHeader
+{
+    int datasize;
+    FFTActionImplID id;
+
+    //clfftLayout           fft_inputLayout;
+    //clfftLayout           fft_outputLayout;
+
+    FFTKernelSignatureHeader(int size_, FFTActionImplID id_)
+    {
+        // Set to 0 the whole signature structure
+        ::memset(this, 0, size_);
+        datasize = size_;
+        id = id_;
+    }
+};
+
+// 
+// FFTKernelSignature
+// 
+// This struct represents the signature of an action. An action signature 
+// stores (by inheritage):
+//  - the action ID
+//  - its signature data size
+//  - a set of bytes caracterizes a FFT action
+// 
+// This template class FFTKernelSignature provides a simple mechanism to
+// build a proper signature (see also in src/library/repo.h) from any POD type.
+//  
+// It is used as a key in the different cache mecanisms (binary cache and
+// dynamic cache managed by FFTRepo)
+// 
+template <typename DATA, FFTActionImplID ID>
+struct FFTKernelSignature : public FFTKernelSignatureHeader, public DATA
+{
+    FFTKernelSignature()
+        : FFTKernelSignatureHeader(sizeof(FFTKernelSignature<DATA, ID>), ID)
+    {
+    }
+};
+
+
+
+// 
+// FFTAction is the base class for all actions used to implement FFT computes
+// 
+// An action basically implements some OpenCL related actions, for instance:
+//  - Generating a kernel source code from kernel characteristics
+//  - Computing the work-group local sizes according to a kernel
+//  - Enqueuing arguments to the kernel call
+// 
+// Kernels generated and compiled by an action are stored in the different
+// cache mechanism (see repo.h for the dynamic cache and fft_binary_lookup.h
+// for disk cache for more information). Each cache entry can be obtained from
+// the action signature which is set of byte characterizing the action itself.
+// 
+// At the moment, FFTAction only implements the enqueue method which is
+// inherited by every action subclasses. But it may change in the time. There
+// are no clear rules and the choices made until now are still subject to
+// change.
+// 
+class FFTAction
+{
+public:
+    FFTAction(FFTPlan * plan, clfftStatus & err);
+
+    virtual clfftStatus enqueue(clfftPlanHandle plHandle,
+                                clfftDirection dir,
+                                cl_uint numQueuesAndEvents,
+                                cl_command_queue* commQueues,
+                                cl_uint numWaitEvents,
+                                const cl_event* waitEvents,
+                                cl_event* outEvents,
+                                cl_mem* clInputBuffers,
+                                cl_mem* clOutputBuffers);
+
+protected:
+
+    virtual clfftGenerators                getGenerator() = 0;
+
+    clfftStatus                            compileKernels  ( const cl_command_queue commQueueFFT, const clfftPlanHandle plHandle, FFTPlan* fftPlan);
+    clfftStatus                            writeKernel     ( const clfftPlanHandle plHandle, const clfftGenerators gen, const FFTKernelSignatureHeader* data, const cl_context& context, const cl_device_id &device);
+
+    virtual clfftStatus                    generateKernel  ( FFTRepo & fftRepo, const cl_command_queue commQueueFFT) = 0;
+    virtual clfftStatus                    getWorkSizes    ( std::vector<size_t> & globalws, std::vector<size_t> & localws) = 0;
+
+    virtual const FFTKernelSignatureHeader * getSignatureData() = 0;
+
+    FFTPlan * plan;
+
+private:
+
+    clfftStatus selectBufferArguments(FFTPlan * plan,
+                                      cl_mem* clInputBuffers,
+                                      cl_mem* clOutputBuffers,
+                                      std::vector< cl_mem > &inputBuff,
+                                      std::vector< cl_mem > &outputBuff);
+
+    virtual bool buildForwardKernel() = 0;
+    virtual bool buildBackwardKernel() = 0;
+};
+
+
 //	The "envelope" is a set of limits imposed by the hardware
 //	This will depend on the GPU(s) in the OpenCL context.
 //	If there are multiple devices, this should be the least
@@ -194,23 +320,11 @@ struct FFTEnvelope {
 	}
 };
 
-class FFTRepo;
 
 //	This class contains objects that are specific to a particular FFT transform, and the data herein is useful
 //	for us to know ahead of transform time such that we can optimize for these settings
 class	FFTPlan
 {
-	template <clfftGenerators G>
-	clfftStatus GetWorkSizesPvt (std::vector<size_t> & globalws, std::vector<size_t> & localws) const;
-
-	template <clfftGenerators G>
-	clfftStatus GetKernelGenKeyPvt (FFTKernelGenKeyParams & params) const;
-
-	template <clfftGenerators G>
-	clfftStatus GenerateKernelPvt (FFTRepo& fftRepo,  const cl_command_queue& commQueueFFT ) const;
-
-	template <clfftGenerators G>
-	clfftStatus GetMax1DLengthPvt (size_t *longest ) const;
 
 public:
 
@@ -301,6 +415,12 @@ public:
 	BlockComputeType blockComputeType;
 
 
+    clfftPlanHandle plHandle;
+
+    // The action
+    FFTAction * action;
+
+
 	FFTPlan ()
 	:	baked (false)
 	,	dim (CLFFT_1D)
@@ -340,6 +460,8 @@ public:
 	,	planCopy(0)
 	,	const_buffer( NULL )
 	,	gen(Stockham)
+    ,   action(0)
+    ,   plHandle(0)
 	{};
 
 
@@ -348,9 +470,6 @@ public:
 	clfftStatus AllocateBuffers ();
 	clfftStatus ReleaseBuffers ();
 
-	clfftStatus GetWorkSizes (std::vector<size_t> & globalws, std::vector<size_t> & localws) const;
-	clfftStatus GetKernelGenKey (FFTKernelGenKeyParams & params) const;
-	clfftStatus GenerateKernel (FFTRepo & fftRepo, const cl_command_queue commQueueFFT) const;
 	clfftStatus GetMax1DLength (size_t *longest ) const;
 
 	void ResetBinarySizes();
@@ -362,6 +481,8 @@ public:
 	clfftStatus GetEnvelope (const FFTEnvelope **) const;
 	clfftStatus SetEnvelope ();
 
+	clfftStatus GetMax1DLengthStockham (size_t *longest ) const;
+
 	~FFTPlan ()
 	{
 		ReleaseBuffers ();
diff --git a/src/library/private.h b/src/library/private.h
index 000ab65..fecb84f 100644
--- a/src/library/private.h
+++ b/src/library/private.h
@@ -90,6 +90,8 @@
 	}
 #endif
 
+void clfftInitBinaryCache();
+
 //	This header file is not visible to clients, and contains internal structures and functions for use
 //	by the FFT library.  Since this header is private to this implementation, there is no need to keep
 //	strict C compliance.
@@ -270,6 +272,8 @@ inline tstring clfftErrorStatusAsString( const cl_int& status )
 
 //	This is used to either wrap an OpenCL function call, or to explicitly check a variable for an OpenCL error condition.
 //	If an error occurs, we issue a return statement to exit the calling function.
+#if defined( _DEBUG )
+
 #define OPENCL_V( fn, msg ) \
 { \
 	clfftStatus vclStatus = static_cast< clfftStatus >( fn ); \
@@ -288,6 +292,23 @@ inline tstring clfftErrorStatusAsString( const cl_int& status )
 	} \
 }
 
+#else
+
+#define OPENCL_V( fn, msg ) \
+{ \
+	clfftStatus vclStatus = static_cast< clfftStatus >( fn ); \
+	switch( vclStatus ) \
+	{ \
+		case	CL_SUCCESS:		/**< No error */ \
+			break; \
+		default: \
+		{ \
+			return	vclStatus; \
+		} \
+	} \
+}
+#endif
+
 static inline bool IsPo2 (size_t u) {
 	return (u != 0) &&  (0 == (u & (u-1)));
 }
diff --git a/src/library/repo.cpp b/src/library/repo.cpp
index 6d44985..9e2ab8d 100644
--- a/src/library/repo.cpp
+++ b/src/library/repo.cpp
@@ -60,6 +60,11 @@ clfftStatus FFTRepo::releaseResources( )
 	//
 	for (fftRepo_iterator iProg = mapFFTs.begin( ); iProg != mapFFTs.end( ); ++iProg )
 	{
+        if (iProg->first.data != NULL)
+        {
+            const_cast<FFTRepoKey*>(&iProg->first)->deleteData();
+        }
+
 		cl_program p = iProg->second.clProgram;
 		iProg->second.clProgram = NULL;
 		if (NULL != p)
@@ -91,14 +96,13 @@ clfftStatus FFTRepo::releaseResources( )
 	return	CLFFT_SUCCESS;
 }
 
-clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const std::string& kernel, const cl_device_id &device, const cl_context& planContext )
+clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelSignatureHeader * data, const std::string& kernel, const cl_device_id &device, const cl_context& planContext )
 {
 	scopedLock sLock( lockRepo, _T( "setProgramCode" ) );
 
-	ClPair clPair = std::make_pair(planContext, device);
-	std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
-	fftRepoKey key = std::make_pair( gen, Params );
+	FFTRepoKey key(gen, data, planContext, device);
 
+    key.privatizeData();
 
 	// Prefix copyright statement at the top of generated kernels
 	std::stringstream ss;
@@ -127,13 +131,11 @@ clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelG
 	return	CLFFT_SUCCESS;
 }
 
-clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, std::string& kernel, const cl_device_id &device, const cl_context& planContext )
+clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelSignatureHeader * data, std::string& kernel, const cl_device_id &device, const cl_context& planContext )
 {
 	scopedLock sLock( lockRepo, _T( "getProgramCode" ) );
 
-	ClPair clPair = std::make_pair(planContext, device);
-	std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
-	fftRepoKey key = std::make_pair( gen, Params );
+	FFTRepoKey key(gen, data, planContext, device);
 
 	fftRepo_iterator pos = mapFFTs.find( key);
 	if( pos == mapFFTs.end( ) )
@@ -143,14 +145,12 @@ clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelG
 	return	CLFFT_SUCCESS;
 }
 
-clfftStatus FFTRepo::setProgramEntryPoints( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam,
+clfftStatus FFTRepo::setProgramEntryPoints( const clfftGenerators gen, const FFTKernelSignatureHeader * data,
 	const char * kernel_fwd, const char * kernel_back, const cl_device_id &device, const cl_context& planContext  )
 {
 	scopedLock sLock( lockRepo, _T( "setProgramEntryPoints" ) );
 
-	ClPair clPair = std::make_pair(planContext, device);
-	std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
-	fftRepoKey key = std::make_pair( gen, Params );
+	FFTRepoKey key(gen, data, planContext, device);
 
 	fftRepoValue& fft  = mapFFTs[ key ];
 	fft.EntryPoint_fwd  = kernel_fwd;
@@ -159,14 +159,12 @@ clfftStatus FFTRepo::setProgramEntryPoints( const clfftGenerators gen, const FFT
 	return	CLFFT_SUCCESS;
 }
 
-clfftStatus FFTRepo::getProgramEntryPoint( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam,
+clfftStatus FFTRepo::getProgramEntryPoint( const clfftGenerators gen, const FFTKernelSignatureHeader * data,
 			clfftDirection dir, std::string& kernel, const cl_device_id &device, const cl_context& planContext )
 {
 	scopedLock sLock( lockRepo, _T( "getProgramEntryPoint" ) );
 
-	ClPair clPair = std::make_pair(planContext, device);
-	std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
-	fftRepoKey key = std::make_pair( gen, Params );
+	FFTRepoKey key(gen, data, planContext, device);
 
 	fftRepo_iterator pos = mapFFTs.find( key );
 	if( pos == mapFFTs.end( ) )
@@ -190,23 +188,18 @@ clfftStatus FFTRepo::getProgramEntryPoint( const clfftGenerators gen, const FFTK
 	return	CLFFT_SUCCESS;
 }
 
-clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& prog, const cl_device_id &device, const cl_context& planContext )
+clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelSignatureHeader * data, const cl_program& prog, const cl_device_id &device, const cl_context& planContext )
 {
 	scopedLock sLock( lockRepo, _T( "setclProgram" ) );
 
-  cl_int status = CL_SUCCESS;
-  cl_context ProgramContext = NULL;
-  status = clGetProgramInfo(prog, CL_PROGRAM_CONTEXT, sizeof(cl_context), &ProgramContext, NULL);
-
-  OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
-
-	ClPair clPair = std::make_pair(planContext, device);
-	std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
-	fftRepoKey key = std::make_pair( gen, Params );
+ 	FFTRepoKey key(gen, data, planContext, device);
 
 	fftRepo_iterator pos = mapFFTs.find( key );
 	if( pos == mapFFTs.end( ) )
+	{
+		key.privatizeData(); // the key owns the data
 		mapFFTs[ key ].clProgram = prog;
+	}
 	else {
 		cl_program p = pos->second.clProgram;
 		assert (NULL == p);
@@ -218,13 +211,11 @@ clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGen
 	return	CLFFT_SUCCESS;
 }
 
-clfftStatus FFTRepo::getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& prog, const cl_device_id &device, const cl_context& planContext  )
+clfftStatus FFTRepo::getclProgram( const clfftGenerators gen, const FFTKernelSignatureHeader * data, cl_program& prog, const cl_device_id &device, const cl_context& planContext  )
 {
 	scopedLock sLock( lockRepo, _T( "getclProgram" ) );
 
-	ClPair clPair = std::make_pair(planContext, device);
-	std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
-	fftRepoKey key = std::make_pair( gen, Params );
+	FFTRepoKey key(gen, data, planContext, device);
 
 	fftRepo_iterator pos = mapFFTs.find( key );
 	if( pos == mapFFTs.end( ) )
diff --git a/src/library/repo.h b/src/library/repo.h
index 9adc349..5cce489 100644
--- a/src/library/repo.h
+++ b/src/library/repo.h
@@ -27,6 +27,8 @@
 
 
 
+
+
 //	This class contains objects that we wish to retain between individual calls into the FFT interface.
 //	These objects will be shared across different individual FFT plans, and we wish to keep only one
 //	copy of these programs, objects and events.  When the client decides that they either want to reset
@@ -34,6 +36,61 @@
 //	up as much as it can.  It is implemented as a Singleton object.
 class	FFTRepo
 {
+
+    struct FFTRepoKey
+    {
+        clfftGenerators gen;
+        const FFTKernelSignatureHeader * data;
+        cl_context context;
+		cl_device_id device;
+
+        FFTRepoKey(clfftGenerators gen_, const FFTKernelSignatureHeader * data_, cl_context context_, cl_device_id device_)
+            : gen(gen_), data(data_), context(context_), device(device_)
+        {
+
+        }
+
+        void privatizeData()
+        {
+            char * tmp = new char[data->datasize];
+            ::memcpy(tmp, data, data->datasize);
+            this->data = (FFTKernelSignatureHeader*) tmp;
+        }
+
+        void deleteData()
+        {
+            if (this->data)
+            {
+                delete this->data;
+            }
+
+            this->data = NULL;
+        }
+
+        bool operator<(const FFTRepoKey & b) const
+        {
+            const FFTRepoKey & a = *this;
+
+            if (a.gen != b.gen)
+            {
+                return a.gen < b.gen;
+            }
+            else if (a.data->datasize != b.data->datasize)
+            {
+                return a.data->datasize < b.data->datasize;
+            }
+            else if (a.context != b.context)
+            {
+                return a.context < b.context;
+            }
+            else
+            {
+                return ::memcmp(a.data, b.data, a.data->datasize) < 0;
+            }
+        }
+    };
+
+
 	//	Structure containing all the data we need to remember for a specific invokation of a kernel
 	//	generator
 	struct fftRepoValue {
@@ -51,9 +108,7 @@ class	FFTRepo
 	//	has created
 	//typedef std::pair< clfftGenerators, FFTKernelGenKeyParams > fftRepoKey;
 
-	typedef std::pair< cl_context, cl_device_id > ClPair;
-	typedef std::pair< clfftGenerators, std::pair<FFTKernelGenKeyParams, ClPair> > fftRepoKey;
-	typedef std::map< fftRepoKey, fftRepoValue > fftRepoType;
+	typedef std::map< FFTRepoKey, fftRepoValue > fftRepoType;
 	typedef fftRepoType::iterator fftRepo_iterator;
 
 
@@ -140,14 +195,14 @@ public:
 
 	clfftStatus releaseResources( );
 
-	clfftStatus setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams&, const std::string& kernel, const cl_device_id &device, const cl_context& planContext );
-	clfftStatus getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams&, std::string& kernel, const cl_device_id &device, const cl_context& planContext );
+	clfftStatus setProgramCode( const clfftGenerators gen, const FFTKernelSignatureHeader * data, const std::string& kernel, const cl_device_id &device, const cl_context& planContext );
+	clfftStatus getProgramCode( const clfftGenerators gen, const FFTKernelSignatureHeader * data, std::string& kernel, const cl_device_id &device, const cl_context& planContext );
 
-	clfftStatus setProgramEntryPoints( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const char * kernel_fwd, const char * kernel_back, const cl_device_id &device, const cl_context& planContext );
-	clfftStatus getProgramEntryPoint( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, clfftDirection dir, std::string& kernel , const cl_device_id &device, const cl_context& planContext );
+	clfftStatus setProgramEntryPoints( const clfftGenerators gen, const FFTKernelSignatureHeader * data, const char * kernel_fwd, const char * kernel_back, const cl_device_id &device, const cl_context& planContext );
+	clfftStatus getProgramEntryPoint( const clfftGenerators gen, const FFTKernelSignatureHeader * data, clfftDirection dir, std::string& kernel , const cl_device_id &device, const cl_context& planContext );
 
-	clfftStatus setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& prog, const cl_device_id &device, const cl_context& planContext );
-	clfftStatus getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& prog, const cl_device_id &device, const cl_context& planContext );
+	clfftStatus setclProgram( const clfftGenerators gen, const FFTKernelSignatureHeader * data, const cl_program& prog, const cl_device_id &device, const cl_context& planContext );
+	clfftStatus getclProgram( const clfftGenerators gen, const FFTKernelSignatureHeader * data, cl_program& prog, const cl_device_id &device, const cl_context& planContext );
 
 	clfftStatus setclKernel ( cl_program prog, clfftDirection dir, const cl_kernel& kernel );
 	clfftStatus getclKernel ( cl_program prog, clfftDirection dir, cl_kernel& kernel );
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index 8ab7efa..8dc5e9f 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -846,488 +846,13 @@ clfftStatus clfftEnqueueTransform(
 		}
 	}
 
-	// 1d with normal length will fall into the below category
-	// add: 2d transpose kernel will fall into here too.
-	vector< cl_mem >	inputBuff;
-	vector< cl_mem >	outputBuff;
-	inputBuff.reserve( 2 );
-	outputBuff.reserve( 2 );
-
-	//	Decode the relevant properties from the plan paramter to figure out how many input/output buffers we have
-	switch( fftPlan->inputLayout )
-	{
-		case CLFFT_COMPLEX_INTERLEAVED:
-		{
-			switch( fftPlan->outputLayout )
-			{
-				case CLFFT_COMPLEX_INTERLEAVED:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-					}
-
-					break;
-				}
-				case CLFFT_COMPLEX_PLANAR:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						//	Invalid to be an inplace transform, and go from 1 to 2 buffers
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 1 ] );
-					}
-
-					break;
-				}
-				case CLFFT_HERMITIAN_INTERLEAVED:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-					}
-
-					break;
-				}
-				case CLFFT_HERMITIAN_PLANAR:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 1 ] );
-					}
-
-					break;
-				}
-				case CLFFT_REAL:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-					}
-
-					break;
-				}
-				default:
-				{
-					//	Don't recognize output layout
-					return CLFFT_INVALID_ARG_VALUE;
-				}
-			}
-
-			break;
-		}
-		case CLFFT_COMPLEX_PLANAR:
-		{
-			switch( fftPlan->outputLayout )
-			{
-				case CLFFT_COMPLEX_INTERLEAVED:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						inputBuff.push_back( clInputBuffers[ 1 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-					}
-
-					break;
-				}
-				case CLFFT_COMPLEX_PLANAR:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						inputBuff.push_back( clInputBuffers[ 1 ] );
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						inputBuff.push_back( clInputBuffers[ 1 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 1 ] );
-					}
-
-					break;
-				}
-				case CLFFT_HERMITIAN_INTERLEAVED:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						inputBuff.push_back( clInputBuffers[ 1 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-					}
-
-					break;
-				}
-				case CLFFT_HERMITIAN_PLANAR:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						inputBuff.push_back( clInputBuffers[ 1 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 1 ] );
-					}
-
-					break;
-				}
-				case CLFFT_REAL:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						inputBuff.push_back( clInputBuffers[ 1 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-					}
-
-					break;
-				}
-				default:
-				{
-					//	Don't recognize output layout
-					return CLFFT_INVALID_ARG_VALUE;
-				}
-			}
-
-			break;
-		}
-		case CLFFT_HERMITIAN_INTERLEAVED:
-		{
-			switch( fftPlan->outputLayout )
-			{
-				case CLFFT_COMPLEX_INTERLEAVED:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-					}
-
-					break;
-				}
-				case CLFFT_COMPLEX_PLANAR:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 1 ] );
-					}
-
-					break;
-				}
-				case CLFFT_HERMITIAN_INTERLEAVED:
-				{
-					return CLFFT_INVALID_ARG_VALUE;
-				}
-				case CLFFT_HERMITIAN_PLANAR:
-				{
-					return CLFFT_INVALID_ARG_VALUE;
-				}
-				case CLFFT_REAL:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-					}
-
-					break;
-				}
-				default:
-				{
-					//	Don't recognize output layout
-					return CLFFT_INVALID_ARG_VALUE;
-				}
-			}
-
-			break;
-		}
-		case CLFFT_HERMITIAN_PLANAR:
-		{
-			switch( fftPlan->outputLayout )
-			{
-				case CLFFT_COMPLEX_INTERLEAVED:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						inputBuff.push_back( clInputBuffers[ 1 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-					}
-
-					break;
-				}
-				case CLFFT_COMPLEX_PLANAR:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						inputBuff.push_back( clInputBuffers[ 1 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 1 ] );
-					}
-
-					break;
-				}
-				case CLFFT_HERMITIAN_INTERLEAVED:
-				{
-					return CLFFT_INVALID_ARG_VALUE;
-				}
-				case CLFFT_HERMITIAN_PLANAR:
-				{
-					return CLFFT_INVALID_ARG_VALUE;
-				}
-				case CLFFT_REAL:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						inputBuff.push_back( clInputBuffers[ 1 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-					}
-
-					break;
-				}
-				default:
-				{
-					//	Don't recognize output layout
-					return CLFFT_INVALID_ARG_VALUE;
-				}
-			}
-
-			break;
-		}
-		case CLFFT_REAL:
-		{
-			switch( fftPlan->outputLayout )
-			{
-				case CLFFT_COMPLEX_INTERLEAVED:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-					}
-
-					break;
-				}
-				case CLFFT_COMPLEX_PLANAR:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 1 ] );
-					}
-
-					break;
-				}
-				case CLFFT_HERMITIAN_INTERLEAVED:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-					}
-
-					break;
-				}
-				case CLFFT_HERMITIAN_PLANAR:
-				{
-					if( fftPlan->placeness == CLFFT_INPLACE )
-					{
-						return CLFFT_INVALID_ARG_VALUE;
-					}
-					else
-					{
-						inputBuff.push_back( clInputBuffers[ 0 ] );
-
-						outputBuff.push_back( clOutputBuffers[ 0 ] );
-						outputBuff.push_back( clOutputBuffers[ 1 ] );
-					}
-
-					break;
-				}
-				default:
-				{
-					//	Don't recognize output layout
-					return CLFFT_INVALID_ARG_VALUE;
-				}
-			}
-
-			break;
-		}
-		default:
-		{
-			//	Don't recognize output layout
-			return CLFFT_INVALID_ARG_VALUE;
-		}
-	}
-
-	//	TODO:  In the case of length == 1, FFT is a trivial NOP, but we still need to apply the forward and backwards tranforms
-	//	TODO:  Are map lookups expensive to call here?  We can cache a pointer to the cl_program/cl_kernel in the plan
-
-	FFTKernelGenKeyParams fftParams;
-	//	Translate the user plan into the structure that we use to map plans to clPrograms
-	OPENCL_V( fftPlan->GetKernelGenKey( fftParams ), _T("GetKernelGenKey() failed!") );
-
-	cl_program	prog;
-	cl_kernel	kern;
-	OPENCL_V( fftRepo.getclProgram( fftPlan->gen, fftParams, prog, fftPlan->bakeDevice, fftPlan->context ), _T( "fftRepo.getclProgram failed" ) );
-	OPENCL_V( fftRepo.getclKernel( prog, dir, kern ), _T( "fftRepo.getclKernels failed" ) );
-
-
-
-	cl_uint uarg = 0;
-	if (!fftPlan->transflag && !(fftPlan->gen == Copy))
-	{
-		//	::clSetKernelArg() is not thread safe, according to the openCL spec for the same cl_kernel object
-		//	TODO:  Need to verify that two different plans (which would get through our lock above) with exactly the same
-		//	parameters would NOT share the same cl_kernel objects
-
-		/* constant buffer */
-		OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&fftPlan->const_buffer ), _T( "clSetKernelArg failed" ) );
-	}
-
-	//	Input buffer(s)
-	//	Input may be 1 buffer  (CLFFT_COMPLEX_INTERLEAVED)
-	//	          or 2 buffers (CLFFT_COMPLEX_PLANAR)
-
-	for (size_t i = 0; i < inputBuff.size(); ++i)
-	{
-		OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&inputBuff[i] ), _T( "clSetKernelArg failed" ) );
-	}
-	//	Output buffer(s)
-	//	Output may be 0 buffers (CLFFT_INPLACE)
-	//	           or 1 buffer  (CLFFT_COMPLEX_INTERLEAVED)
-	//	           or 2 buffers (CLFFT_COMPLEX_PLANAR)
-	for (size_t o = 0; o < outputBuff.size(); ++o)
-	{
-		OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&outputBuff[o] ), _T( "clSetKernelArg failed" ) );
-	}
-
-	vector< size_t > gWorkSize;
-	vector< size_t > lWorkSize;
-	clfftStatus result = fftPlan->GetWorkSizes (gWorkSize, lWorkSize);
-
-	// TODO:  if GetWorkSizes returns CLFFT_INVALID_GLOBAL_WORK_SIZE, that means
-	// that this multidimensional input data array is too large to be transformed
-	// with a single call to clEnqueueNDRangeKernel.  For now, we will just return
-	// the error code back up the call stack.
-	// The *correct* course of action would be to split the work into mutliple
-	// calls to clEnqueueNDRangeKernel.
-	if (CLFFT_INVALID_GLOBAL_WORK_SIZE == result)
-	{
-		OPENCL_V( result, _T("Work size too large for clEnqueNDRangeKernel()"));
-	}
-	else
-	{
-		OPENCL_V( result, _T("FFTPlan::GetWorkSizes failed"));
-	}
-	BUG_CHECK (gWorkSize.size() == lWorkSize.size());
-
-	//size_t *lwSize = NULL;
-	//if(fftPlan->gen != Copy) lwSize = &lWorkSize[ 0 ];
-
-	status = clEnqueueNDRangeKernel( *commQueues, kern, static_cast< cl_uint >( gWorkSize.size( ) ),
-		NULL, &gWorkSize[ 0 ], &lWorkSize[ 0 ], numWaitEvents, waitEvents, outEvents );
-	OPENCL_V( status, _T( "clEnqueueNDRangeKernel failed" ) );
-
-	if( fftRepo.pStatTimer )
-	{
-		fftRepo.pStatTimer->AddSample( plHandle, fftPlan, kern, numQueuesAndEvents, outEvents, gWorkSize );
-	}
-
-	return	CLFFT_SUCCESS;
+	return fftPlan->action->enqueue(plHandle,
+                                        dir,
+                                        numQueuesAndEvents,
+                                        commQueues,
+                                        numWaitEvents,
+                                        waitEvents,
+                                        outEvents,
+                                        clInputBuffers,
+                                        clOutputBuffers);
 }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list