[clfft] 02/128: precallback-initialversion

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:32 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit 3b8afc505d9dae90f4e6900cfd84de926b33a2cb
Author: Pradeep <pradeep.rao at amd.com>
Date:   Tue Jul 21 16:30:28 2015 +0530

    precallback-initialversion
---
 src/CMakeLists.txt                      |  15 +
 src/client-callback/CMakeLists.txt      |  62 +++
 src/client-callback/callback-client.cpp | 744 ++++++++++++++++++++++++++++++++
 src/client-callback/client.h            |  70 +++
 src/client-callback/openCL.misc.cpp     | 536 +++++++++++++++++++++++
 src/client-callback/openCL.misc.h       | 151 +++++++
 src/client-callback/stdafx.cpp          |  25 ++
 src/include/clFFT.h                     |  24 ++
 src/library/accessors.cpp               |  35 ++
 src/library/action.cpp                  |  13 +
 src/library/generator.stockham.cpp      | 220 +++++++++-
 src/library/plan.cpp                    |   8 +
 src/library/plan.h                      |  24 +-
 13 files changed, 1910 insertions(+), 17 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index af571ce..4a87888 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -72,6 +72,7 @@ option( BUILD_TEST "Build the library testing suite (dependency on google test,
 option( BUILD_LOADLIBRARIES "Build the optional dynamic load libraries that the FFT runtime will search for" ON )
 option( BUILD_SHARED_LIBS "Build shared libraries." ON)
 option( BUILD_EXAMPLES "Build examples." ON)
+option( BUILD_CALLBACK_CLIENT "Build a command line clFFT client program that tests callback functionality (dependency on Boost)" ON )
 
 # If BOOST_ROOT is defined as an environment value, use that value and cache it so it's visible in the cmake-gui.
 # Otherwise, create a sensible default that the user can change
@@ -175,6 +176,13 @@ if( BUILD_TEST )
     endif( )
 endif( )
 
+# Enable building of the callback client if requested and all dependencies are found
+if( BUILD_CALLBACK_CLIENT AND Boost_FOUND )
+	set( FFT_CALLBACK_CLIENT ON )
+else( )
+	set( FFT_CALLBACK_CLIENT OFF )
+endif( )
+
 # FFLAGS depend on the compiler, grab the compiler name from the path
 get_filename_component( C_COMPILER_NAME ${CMAKE_C_COMPILER} NAME_WE )
 # message( "C_COMPILER_NAME: " ${C_COMPILER_NAME} )
@@ -283,6 +291,13 @@ else( )
 	message( "GoogleTest unit tests will NOT be built" )
 endif( )
 
+# We only want to build the following if the user options are set
+if( FFT_CALLBACK_CLIENT AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/client-callback" )
+	add_subdirectory( client-callback )
+else( )
+	message( "FFT callback client will NOT be built" )
+endif( )
+
 if( BUILD_EXAMPLES )
     add_subdirectory( examples )
 endif()
diff --git a/src/client-callback/CMakeLists.txt b/src/client-callback/CMakeLists.txt
new file mode 100644
index 0000000..268f2b6
--- /dev/null
+++ b/src/client-callback/CMakeLists.txt
@@ -0,0 +1,62 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+
+#  Source and header files that make up the callback client executable
+set( Client.Source	callback-client.cpp 
+                    openCL.misc.cpp 
+                    stdafx.cpp )
+
+set( Client.Headers client.h 
+                    openCL.misc.h 
+                    ../statTimer/statisticalTimer.extern.h
+                    ../include/unicode.compatibility.h 
+                    ../include/stdafx.h 
+                    ../include/targetver.h 
+                    ../include/clFFT.h )
+
+set( Client.Files ${Client.Source} ${Client.Headers} )
+
+set( DL_LIB "" )
+if( WIN32 )
+	add_definitions( "/D_CONSOLE" )
+elseif( APPLE )
+	set( CMAKE_CXX_FLAGS "-std=c++11 -stdlib=libc++ ${CMAKE_CXX_FLAGS}" )	
+else( )
+	# dlopen()/dlclose() need the platform's dl library (CMAKE_DL_LIBS, empty where libc provides them); clock_gettime() may need librt
+	set( DL_LIB ${CMAKE_DL_LIBS} rt )
+endif( )
+
+# Header search paths for the client: Boost, OpenCL, and the project's own include directories
+include_directories( ${Boost_INCLUDE_DIRS} ${OPENCL_INCLUDE_DIRS} ../../../common ${PROJECT_BINARY_DIR}/include ../include )
+
+add_executable( clFFT-callback ${Client.Files} )
+
+target_link_libraries( clFFT-callback clFFT ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} ${DL_LIB} )
+
+set_target_properties( clFFT-callback PROPERTIES VERSION ${CLFFT_VERSION} )
+set_target_properties( clFFT-callback PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
+if( APPLE )
+    # On Mac, set the install RPATH relative to the executable (@loader_path) so the library directory is found
+    set_target_properties( clFFT-callback PROPERTIES INSTALL_RPATH "@loader_path/../lib${SUFFIX_LIB}")
+endif()
+
+# Install rule (also picked up by CPack): include the executable into the package
+install( TARGETS clFFT-callback
+        RUNTIME DESTINATION bin${SUFFIX_BIN}
+        LIBRARY DESTINATION lib${SUFFIX_LIB}
+        ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+        )
diff --git a/src/client-callback/callback-client.cpp b/src/client-callback/callback-client.cpp
new file mode 100644
index 0000000..fc8de65
--- /dev/null
+++ b/src/client-callback/callback-client.cpp
@@ -0,0 +1,744 @@
+#include "stdafx.h"
+#include <functional>
+#include <cmath>
+
+#include "client.h"
+#include "../library/private.h"
+#include "openCL.misc.h"
+#include "../statTimer/statisticalTimer.extern.h"
+#include "../include/sharedLibrary.h"
+#include "../include/unicode.compatibility.h"
+
+namespace po = boost::program_options;
+
+#define SCALAR 100
+#define PRECALLBACKTYPE 1
+
+#define MULVAL float2 mulval(__global void* in, int offset, __global void* userdata)\n \
+				{ \n \
+				int scalar = *((__global int*)userdata); \n \
+				float2 ret = *((__global float2*)in + offset) * scalar; \n \
+				return ret; \n \
+				}
+
+#define MULVAL_PLANAR float2 mulval(__global void* inRe, __global void* inIm, int offset, __global void* userdata)\n \
+				{ \n \
+				__global USER_DATA *data = (__global USER_DATA *)userdata; \n \
+				int scalar = (int)data->scalar; \n \
+				float2 ret; \n \
+				ret.x = *((__global float*)inRe + offset) * scalar; \n \
+				ret.y = *((__global float*)inIm + offset) * scalar; \n \
+				return ret; \n \
+				}
+
+#define STRUCT_USERDATA typedef struct USER_DATA  \
+					   {  \
+						int scalar;  \
+						int datalength;  \
+						} USER_DATA; 
+STRUCT_USERDATA
+
+//	This is used with the program_options class so that the user can type an integer on the command line
+//	and we store it into an enum variable; if the read fails the stream keeps its fail state and layout is left untouched
+template<class _Elem, class _Traits>
+std::basic_istream<_Elem, _Traits> & operator>> (std::basic_istream<_Elem, _Traits> & stream, clfftLayout & layout)
+{
+	cl_uint tmp = 0;
+	if( stream >> tmp )
+		layout = static_cast< clfftLayout >( tmp );
+	return stream;
+}
+
+template < typename T >
+int transform( size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
+				clfftLayout in_layout, clfftLayout out_layout,
+				clfftResultLocation place, clfftPrecision precision, clfftDirection dir,
+				cl_device_type deviceType, cl_int deviceId, cl_int platformId, bool printInfo,
+				cl_uint command_queue_flags, cl_uint profile_count,
+				std::auto_ptr< clfftSetupData > setupData,
+				bool hasPrecallback)
+{
+	//	Runs a batched complex-to-complex FFT on all-ones input (optionally through the precallback), times it and
+	//	checks the output; returns 0 when it ran to completion and 1 for unsupported layouts.  The FFT dimension is not
+	//	given on the command line; it is decoded from the X, Y, Z lengths, where a length of one (or zero) drops that dimension.
+
+	const size_t max_dimensions = 3;
+	size_t strides[ 4 ];
+	size_t o_strides[ 4 ];
+	size_t fftVectorSize = 0;
+	size_t fftVectorSizePadded = 0;
+	size_t fftBatchSize = 0;
+	size_t outfftVectorSize = 0;
+	size_t outfftVectorSizePadded = 0;
+	size_t outfftBatchSize = 0;
+	size_t size_of_input_buffers_in_bytes = 0;
+	size_t size_of_output_buffers_in_bytes = 0;
+	cl_uint number_of_output_buffers = 0;
+	clfftDim	dim = CLFFT_1D;
+	cl_mem input_cl_mem_buffers [2] = { NULL, NULL };
+	cl_mem output_cl_mem_buffers[2] = { NULL, NULL };
+	std::vector< cl_device_id > device_id;
+	cl_context context;
+	cl_command_queue queue;
+	cl_event outEvent = NULL;
+	clfftPlanHandle plan_handle;
+
+	for (unsigned u = 0; u < max_dimensions; ++u) {
+		if (0 != lengths[u])
+			continue;
+		lengths[u] = 1;
+	}
+
+	if( lengths[ 1 ] > 1 )
+	{
+		dim	= CLFFT_2D;
+	}
+	if( lengths[ 2 ] > 1 )
+	{
+		dim	= CLFFT_3D;
+	}
+
+	strides[ 0 ] = inStrides[0];
+	strides[ 1 ] = inStrides[1];
+	strides[ 2 ] = inStrides[2];
+	strides[ 3 ] = inStrides[3];
+
+	o_strides[ 0 ] = outStrides[0];
+	o_strides[ 1 ] = outStrides[1];
+	o_strides[ 2 ] = outStrides[2];
+	o_strides[ 3 ] = outStrides[3];
+
+	fftVectorSize = lengths[0] * lengths[1] * lengths[2];
+	fftVectorSizePadded = strides[3];
+	fftBatchSize = fftVectorSizePadded * batch_size;
+
+	size_t Nt = 1 + lengths[0]/2;
+
+	if(place == CLFFT_INPLACE)
+	{
+		outfftVectorSize = fftVectorSize;
+		outfftVectorSizePadded = fftVectorSizePadded;
+		outfftBatchSize = fftBatchSize;
+	}
+	else
+	{
+		outfftVectorSize = lengths[0] * lengths[1] * lengths[2];
+		outfftVectorSizePadded = o_strides[3];
+		outfftBatchSize = outfftVectorSizePadded * batch_size;
+	}
+
+	// Real layouts (real-to-complex / complex-to-real) are rejected: callbacks are not implemented for them yet
+	if( (in_layout == CLFFT_REAL) || (out_layout == CLFFT_REAL) )
+	{
+		terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
+		return 1;
+	}
+
+	switch( out_layout )
+	{
+	case CLFFT_COMPLEX_INTERLEAVED:
+		number_of_output_buffers = 1;
+		size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > );
+		break;
+	case CLFFT_COMPLEX_PLANAR:
+		number_of_output_buffers = 2;
+		size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T);
+		break;
+	default:
+		terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
+		return 1;
+	}
+
+		// Set up OpenCL and fill the input buffers according to the input layout
+	switch( in_layout )
+	{
+	case CLFFT_COMPLEX_INTERLEAVED:
+		{
+			//	Size the input buffer; the calls below create our openCL context and set up our devices, and are expected to throw on error
+			size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > );
+
+			device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
+			createOpenCLCommandQueue( context,
+				command_queue_flags, queue,
+				device_id,
+				size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
+				size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
+
+			std::vector< std::complex< T > > input( fftBatchSize );
+
+			// zero the whole (padded) host input
+			for( cl_uint i = 0; i < fftBatchSize; ++i )
+			{
+				input[ i ] = 0;
+			}
+
+			// set every addressed element to one; the output check below expects an impulse at the start of each batch
+			for(size_t b = 0; b < batch_size; b++)
+			{
+				size_t p3 = b * strides[3];
+				for(size_t k = 0; k < lengths[2]; k++)
+				{
+					size_t p2 = p3 + k * strides[2];
+					for(size_t j = 0; j < lengths[1]; j++)
+					{
+						size_t p1 = p2 + j * strides[1];
+						for(size_t i = 0; i < lengths[0]; i++)
+						{
+							size_t p0 = p1 + i * strides[0];
+							input[p0] = 1;
+						}
+					}
+				}
+			}
+
+
+			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ],
+				0, NULL, &outEvent ),
+				"clEnqueueWriteBuffer failed" );
+
+		}
+		break;
+	case CLFFT_COMPLEX_PLANAR:
+		{
+			//	Size one plane; the calls below create our openCL context and set up our devices, and are expected to throw on error
+			size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T );
+
+			device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
+			createOpenCLCommandQueue( context,
+				command_queue_flags, queue,
+				device_id,
+				size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
+				size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);
+
+			std::vector< T > real( fftBatchSize );
+			std::vector< T > imag( fftBatchSize );
+
+			// zero both (padded) host planes
+			for( cl_uint i = 0; i < fftBatchSize; ++i )
+			{
+				real[ i ] = 0;
+				imag[ i ] = 0;
+			}
+
+			// set every addressed real element to one (imaginary stays zero); the output check below expects an impulse per batch
+			for(size_t b = 0; b < batch_size; b++)
+			{
+				size_t p3 = b * strides[3];
+				for(size_t k = 0; k < lengths[2]; k++)
+				{
+					size_t p2 = p3 + k * strides[2];
+					for(size_t j = 0; j < lengths[1]; j++)
+					{
+						size_t p1 = p2 + j * strides[1];
+						for(size_t i = 0; i < lengths[0]; i++)
+						{
+							size_t p0 = p1 + i * strides[0];
+							real[p0] = 1;
+						}
+					}
+				}
+			}
+
+
+			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
+				0, NULL, &outEvent ),
+				"clEnqueueWriteBuffer failed" );
+			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
+				0, NULL, &outEvent ),
+				"clEnqueueWriteBuffer failed" );
+		}
+		break;
+	default:
+		terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
+		return 1;
+	}
+
+		//	Discover and load the timer module if present
+	void* timerLibHandle = LoadSharedLibrary( "lib", "StatTimer", false );
+	if( timerLibHandle == NULL )
+	{
+		terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl;
+	}
+
+
+	//	Look up the timer entry point in the shared module; this runs even when the load above failed
+	//	(NOTE(review): presumably LoadFunctionAddr returns NULL for a NULL handle - confirm in sharedLibrary.h)
+	PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) );
+
+	//	Create and initialize our timer class, if the external timer shared library loaded
+	baseStatTimer* timer = NULL;
+	size_t	clFFTID = 0;
+	if( get_timer )
+	{
+		timer = get_timer( CLFFT_GPU );
+		timer->Reserve( 1, profile_count );
+		timer->setNormalize( true );
+
+		clFFTID	= timer->getUniqueID( "clFFT", 0 );
+	}
+
+	OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
+	OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" );
+
+	//	Default plan creates a plan that expects an inPlace transform with interleaved complex numbers; apply the requested settings
+	OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
+	OPENCL_V_THROW( clfftSetLayout( plan_handle, in_layout, out_layout ), "clfftSetLayout failed" );
+	OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batch_size ), "clfftSetPlanBatchSize failed" );
+	OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
+
+	OPENCL_V_THROW (clfftSetPlanInStride  ( plan_handle, dim, strides ), "clfftSetPlanInStride failed" );
+	OPENCL_V_THROW (clfftSetPlanOutStride ( plan_handle, dim, o_strides ), "clfftSetPlanOutStride failed" );
+	OPENCL_V_THROW (clfftSetPlanDistance  ( plan_handle, strides[ 3 ], o_strides[ 3 ]), "clfftSetPlanDistance failed" );
+
+	// Set backward scale factor to 1.0 for non real FFTs to do correct output checks
+	if(dir == CLFFT_BACKWARD && in_layout != CLFFT_REAL && out_layout != CLFFT_REAL)
+		OPENCL_V_THROW (clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, (cl_float)1.0f ), "clfftSetPlanScale failed" );
+
+	//Register the precallback if requested; currently the test only covers the 1D case
+	//NOTE(review): for 2D/3D no callback is registered, yet the output check below still expects the SCALAR factor when hasPrecallback is set
+	if (hasPrecallback && dim == CLFFT_1D)
+	{
+		int precallbakType = PRECALLBACKTYPE;
+		cl_mem userdata;
+
+		if (in_layout == CLFFT_COMPLEX_INTERLEAVED)
+		{
+			switch (precallbakType)
+			{
+			case 1: //C2C 1D Interleaved without LDS
+				{
+					char* precallbackstr = STRINGIFY(MULVAL);
+					int h_userdata[1] = { SCALAR };
+					userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), (void*)h_userdata, NULL);
+
+					//Register the callback (NOTE(review): the clCreateBuffer status above is not checked and userdata is never released)
+					OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, NULL, 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
+				}
+				break;
+			default:
+				break;
+			}
+		}
+
+		if (in_layout == CLFFT_COMPLEX_PLANAR)
+		{
+			switch (precallbakType)
+			{
+			case 1: //C2C 1D PLANAR without LDS
+				{
+					char* precallbackstr = STRINGIFY(MULVAL_PLANAR);
+					USER_DATA h_userdata[1];
+					h_userdata[0].scalar = SCALAR;
+					userdata = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA), (void*)h_userdata, NULL);
+
+					//Register the callback together with the USER_DATA struct source (NOTE(review): same unchecked status / unreleased buffer as above)
+					OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "mulval", precallbackstr, STRINGIFY(STRUCT_USERDATA), 0, PRECALLBACK, userdata), "clFFTSetPlanCallback failed");
+				}
+				break;
+			default:
+				break;
+			}
+		}
+	}
+
+	OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &queue, NULL, NULL ), "clfftBakePlan failed" );
+
+	//query the size of the temporary buffer reported for the baked plan
+	size_t buffersize=0;
+	OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" );
+
+	//allocate the intermediate buffer, only when a non-zero size was reported
+	cl_mem clMedBuffer=NULL;
+
+	if (buffersize)
+	{
+		cl_int medstatus;
+		clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
+		OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
+	}
+
+	if (( place == CLFFT_INPLACE )
+	&&  ( in_layout != out_layout )) 
+	{
+		switch( in_layout )
+		{
+		case CLFFT_COMPLEX_INTERLEAVED:
+			{
+				if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
+				{
+					throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
+				}
+				break;
+			}
+		case CLFFT_COMPLEX_PLANAR:
+			{
+				if( (out_layout == CLFFT_COMPLEX_INTERLEAVED) || (out_layout == CLFFT_HERMITIAN_INTERLEAVED) )
+				{
+					throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
+				}
+				break;
+			}
+		default:
+			terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
+			return 1;
+		}
+	}
+
+	cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &output_cl_mem_buffers[ 0 ];
+
+	Timer tr;
+	tr.Start();
+
+	//	Loop as many times as the user specifies to average out the timings
+	for( cl_uint i = 0; i < profile_count; ++i )
+	{
+		if( timer ) timer->Start( clFFTID );
+
+		OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent,
+			&input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ),
+			"clfftEnqueueTransform failed" );
+
+		if( timer ) timer->Stop( clFFTID );
+	}
+	OPENCL_V_THROW( clFinish( queue ), "clFinish failed" );
+	if(clMedBuffer) clReleaseMemObject(clMedBuffer);
+
+	double wtime = tr.Sample()/((double)profile_count);
+	size_t totalLen = 1;
+	for(int i=0; i<dim; i++) totalLen *= lengths[i];
+	double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	if(profile_count > 1)
+	{
+		tout << "\nExecution wall time: " << 1000.0*wtime << " ms" << std::endl;
+		tout << "Execution gflops: " << ((double)batch_size * opsconst)/(1000000000.0*wtime) << std::endl;
+	}
+
+	if( timer && (command_queue_flags & CL_QUEUE_PROFILING_ENABLE) )
+	{
+		//	Remove all timings that are outside of 2 stddev (roughly 95% of normally distributed samples are kept); we ignore outliers to get a more consistent result
+		timer->pruneOutliers( 2.0 );
+		timer->Print( );
+		timer->Reset( );
+	}
+
+	/*****************/
+	FreeSharedLibrary( timerLibHandle );
+
+		// Read and check output data
+	// This check is not valid if the FFT is executed multiple times inplace.
+	//
+	if (( place == CLFFT_OUTOFPLACE )
+	||  ( profile_count == 1))
+	{
+		bool checkflag= false;
+		switch( out_layout )
+		{
+		case CLFFT_HERMITIAN_INTERLEAVED:
+		case CLFFT_COMPLEX_INTERLEAVED:
+			{
+				std::vector< std::complex< T > > output( outfftBatchSize );
+
+				if( place == CLFFT_INPLACE )
+				{
+					OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &output[ 0 ],
+						0, NULL, NULL ),
+						"Reading the result buffer failed" );
+				}
+				else
+				{
+					OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &output[ 0 ],
+						0, NULL, NULL ),
+						"Reading the result buffer failed" );
+				}
+
+				//check output data: first element of each batch must equal the vector size (times SCALAR with precallback), all else zero
+				for( cl_uint i = 0; i < outfftBatchSize; ++i )
+				{
+					if (0 == (i % outfftVectorSizePadded))
+					{
+						if (hasPrecallback)
+						{
+							if (output[i].real() != outfftVectorSize * SCALAR)
+							{
+								checkflag = true;
+								break;
+							}
+						}
+						else
+						{
+							if (output[i].real() != outfftVectorSize)
+							{
+								checkflag = true;
+								break;
+							}
+						}
+					}
+					else
+					{
+						if (output[ i ].real() != 0)
+						{
+							checkflag = true;
+							break;
+						}
+					}
+
+					if (output[ i ].imag() != 0)
+					{
+						checkflag = true;
+						break;
+					}
+					//std::cout << i << " real = " << output[i].real() << " img = " << output[ i ].imag() << std::endl;
+				}
+			}
+			break;
+		case CLFFT_HERMITIAN_PLANAR:
+		case CLFFT_COMPLEX_PLANAR:
+			{
+				std::valarray< T > real( outfftBatchSize );
+				std::valarray< T > imag( outfftBatchSize );
+
+				if( place == CLFFT_INPLACE )
+				{
+					OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
+						0, NULL, NULL ),
+						"Reading the result buffer failed" );
+					OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
+						0, NULL, NULL ),
+						"Reading the result buffer failed" );
+				}
+				else
+				{
+					OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
+						0, NULL, NULL ),
+						"Reading the result buffer failed" );
+					OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 1 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &imag[ 0 ],
+						0, NULL, NULL ),
+						"Reading the result buffer failed" );
+				}
+
+				//  Check output data: same expectation as the interleaved case, applied to the separate real/imaginary planes
+				for( cl_uint i = 0; i < outfftBatchSize; ++i )
+				{
+					if (0 == (i % outfftVectorSizePadded))
+					{
+						if (hasPrecallback)
+						{
+							if (real[i] != outfftVectorSize * SCALAR)
+							{
+								checkflag = true;
+								break;
+							}
+						}
+						else
+						{
+							if (real[i] != outfftVectorSize)
+							{
+								checkflag = true;
+								break;
+							}
+						}
+					}
+					else
+					{
+						if (real[i] != 0)
+						{
+							checkflag = true;
+							break;
+						}
+					}
+
+					if (imag[i] != 0)
+					{
+						checkflag = true;
+						break;
+					}
+					//std::cout << i << " real = " << real[i] << " img = " << imag[ i ] << std::endl;
+				}
+			}
+			break;
+		default:
+			terr << _T("Complex-Real callback cases not yet implemented" ) << std::endl;
+			throw std::runtime_error( "Input layout format not yet supported" );
+			break;
+		}
+
+		if (checkflag)
+		{
+			std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
+		}
+		else
+		{
+			std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
+		}
+	}
+
+	OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
+	OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );
+
+	cleanupCL( &context, &queue, countOf( input_cl_mem_buffers ), input_cl_mem_buffers, countOf( output_cl_mem_buffers ), output_cl_mem_buffers, &outEvent );
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	//	Entry point: parses the command line, fills in default strides and runs a single transform<T>( ), returning its status
+	//	OpenCL state
+	cl_device_type		deviceType	= CL_DEVICE_TYPE_ALL;
+	cl_int				deviceId = 0;
+	cl_int				platformId = 0;
+
+	//	FFT state
+	clfftResultLocation	place = CLFFT_INPLACE;
+	clfftLayout	inLayout  = CLFFT_COMPLEX_INTERLEAVED;
+	clfftLayout	outLayout = CLFFT_COMPLEX_INTERLEAVED;
+	clfftPrecision precision = CLFFT_SINGLE;
+	clfftDirection dir = CLFFT_FORWARD;
+	size_t lengths[ 3 ] = {1,1,1};
+	size_t iStrides[ 4 ] = {0,0,0,0};
+	size_t oStrides[ 4 ] = {0,0,0,0};
+	cl_uint profile_count = 0;
+
+	cl_uint command_queue_flags = 0;
+	size_t batchSize = 1;
+
+	//	Precallback is on by default; --noprecall turns it off
+	bool hasPrecallback = true;
+
+	try
+	{
+		//	Initialize flags for FFT library; done inside the try block so a failure (presumably thrown by OPENCL_V_THROW) is reported below
+		std::auto_ptr< clfftSetupData > setupData( new clfftSetupData );
+		OPENCL_V_THROW( clfftInitSetupData( setupData.get( ) ),
+			"clfftInitSetupData failed" );
+
+		// Declare the supported options.
+		po::options_description desc( "clFFT client command line options" );
+		desc.add_options()
+			( "help,h",        "produces this help message" )
+			( "gpu,g",         "Force selection of OpenCL GPU devices only" )
+			( "cpu,c",         "Force selection of OpenCL CPU devices only" )
+			( "all,a",         "Force selection of all OpenCL devices (default)" )
+			( "outPlace,o",    "Out of place FFT transform (default: in place)" )
+			( "double",		   "Double precision transform (default: single)" )
+			( "inv",			"Backward transform (default: forward)" )
+			( "dumpKernels,d", "FFT engine will dump generated OpenCL FFT kernels to disk (default: dump off)" )
+			( "noprecall",		"Disable Precallback (default: precallback on)" )
+			( "lenX,x",        po::value< size_t >( &lengths[ 0 ] )->default_value( 1024 ),   "Specify the length of the 1st dimension of a test array" )
+			( "lenY,y",        po::value< size_t >( &lengths[ 1 ] )->default_value( 1 ),      "Specify the length of the 2nd dimension of a test array" )
+			( "lenZ,z",        po::value< size_t >( &lengths[ 2 ] )->default_value( 1 ),      "Specify the length of the 3rd dimension of a test array" )
+			( "isX",   po::value< size_t >( &iStrides[ 0 ] )->default_value( 1 ),						"Specify the input stride of the 1st dimension of a test array" )
+			( "isY",   po::value< size_t >( &iStrides[ 1 ] )->default_value( 0 ),	"Specify the input stride of the 2nd dimension of a test array" )
+			( "isZ",   po::value< size_t >( &iStrides[ 2 ] )->default_value( 0 ),	"Specify the input stride of the 3rd dimension of a test array" )
+			( "iD", po::value< size_t >( &iStrides[ 3 ] )->default_value( 0 ), "input distance between subsequent sets of data when batch size > 1" )
+			( "osX",   po::value< size_t >( &oStrides[ 0 ] )->default_value( 1 ),						"Specify the output stride of the 1st dimension of a test array" )
+			( "osY",   po::value< size_t >( &oStrides[ 1 ] )->default_value( 0 ),	"Specify the output stride of the 2nd dimension of a test array" )
+			( "osZ",   po::value< size_t >( &oStrides[ 2 ] )->default_value( 0 ),	"Specify the output stride of the 3rd dimension of a test array" )
+			( "oD", po::value< size_t >( &oStrides[ 3 ] )->default_value( 0 ), "output distance between subsequent sets of data when batch size > 1" )
+			( "batchSize,b",   po::value< size_t >( &batchSize )->default_value( 1 ), "If this value is greater than one, arrays will be used " )
+			( "profile,p",     po::value< cl_uint >( &profile_count )->default_value( 1 ), "Time and report the kernel speed of the FFT (default: profiling off)" )
+			( "inLayout",      po::value< clfftLayout >( &inLayout )->default_value( CLFFT_COMPLEX_INTERLEAVED ), "Layout of input data:\n1) interleaved\n2) planar\n3) hermitian interleaved\n4) hermitian planar\n5) real" )
+			( "outLayout",     po::value< clfftLayout >( &outLayout )->default_value( CLFFT_COMPLEX_INTERLEAVED ), "Layout of output data:\n1) interleaved\n2) planar\n3) hermitian interleaved\n4) hermitian planar\n5) real" )
+			;
+
+		po::variables_map vm;
+		po::store( po::parse_command_line( argc, argv, desc ), vm );
+		po::notify( vm );
+
+		if( vm.count( "help" ) )
+		{
+			std::cout << desc << std::endl;
+			return 0;
+		}
+
+		size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0)
+			| ((vm.count( "cpu" ) > 0) ? 2 : 0)
+			| ((vm.count( "all" ) > 0) ? 4 : 0);
+		if ((mutex & (mutex-1)) != 0) {
+			terr << _T("You have selected mutually-exclusive OpenCL device options:") << std::endl;
+			if (vm.count ( "gpu" )  > 0) terr << _T("    gpu,g   Force selection of OpenCL GPU devices only" ) << std::endl;
+			if (vm.count ( "cpu" )  > 0) terr << _T("    cpu,c   Force selection of OpenCL CPU devices only" ) << std::endl;
+			if (vm.count ( "all" )  > 0) terr << _T("    all,a   Force selection of all OpenCL devices (default)" ) << std::endl;
+			return 1;
+		}
+
+		if( vm.count( "gpu" ) )	deviceType	= CL_DEVICE_TYPE_GPU;
+		if( vm.count( "cpu" ) )	deviceType	= CL_DEVICE_TYPE_CPU;
+		if( vm.count( "all" ) )	deviceType	= CL_DEVICE_TYPE_ALL;
+
+		//	These switches are advertised in the help text above, so they have to be applied to the FFT state
+		if( vm.count( "outPlace" ) )	place		= CLFFT_OUTOFPLACE;
+		if( vm.count( "double" ) )		precision	= CLFFT_DOUBLE;
+		if( vm.count( "inv" ) )			dir			= CLFFT_BACKWARD;
+
+		if( vm.count( "dumpKernels" ) )
+		{
+			setupData->debugFlags	|= CLFFT_DUMP_PROGRAMS;
+		}
+
+		if( vm.count( "noprecall" ) )
+		{
+			hasPrecallback = false;
+		}
+
+		//	The sample callbacks (MULVAL, MULVAL_PLANAR) are written for float2 data, so they cannot be combined with double precision
+		if( hasPrecallback && (precision == CLFFT_DOUBLE) )
+		{
+			throw std::runtime_error( "The sample precallback is single precision only; use --noprecall together with --double" );
+		}
+
+		int inL = (int)inLayout;
+		int otL = (int)outLayout;
+
+		// input output layout support matrix
+		int ioLayoutSupport[5][5] =		{
+										{ 1, 1, 0, 0, 1 },
+										{ 1, 1, 0, 0, 1 },
+										{ 0, 0, 0, 0, 1 },
+										{ 0, 0, 0, 0, 1 },
+										{ 1, 1, 1, 1, 0 },
+										};
+
+		if((inL < 1) || (inL > 5)) throw std::runtime_error( "Invalid Input layout format" );
+		if((otL < 1) || (otL > 5)) throw std::runtime_error( "Invalid Output layout format" );
+
+		if(ioLayoutSupport[inL-1][otL-1] == 0) throw std::runtime_error( "Invalid combination of Input/Output layout formats" );
+
+		if( ((inL == 1) || (inL == 2)) && ((otL == 1) || (otL == 2)) ) // Complex-Complex cases
+		{
+			iStrides[1] = iStrides[1] ? iStrides[1] : lengths[0] * iStrides[0];
+			iStrides[2] = iStrides[2] ? iStrides[2] : lengths[1] * iStrides[1];
+			iStrides[3] = iStrides[3] ? iStrides[3] : lengths[2] * iStrides[2];
+			
+			if(place == CLFFT_INPLACE)
+			{
+				oStrides[0] = iStrides[0];
+				oStrides[1] = iStrides[1];
+				oStrides[2] = iStrides[2];
+				oStrides[3] = iStrides[3];
+			}
+			else
+			{
+				oStrides[1] = oStrides[1] ? oStrides[1] : lengths[0] * oStrides[0];
+				oStrides[2] = oStrides[2] ? oStrides[2] : lengths[1] * oStrides[1];
+				oStrides[3] = oStrides[3] ? oStrides[3] : lengths[2] * oStrides[2];
+			}
+		}
+		else
+		{
+			terr << _T("Real-Complex and Complex-Real callback cases not yet implemented" ) << std::endl;
+			return 1;
+		}
+
+		if( precision == CLFFT_SINGLE )
+			return transform<float>( lengths, iStrides, oStrides, batchSize, inLayout, outLayout, place, precision, dir, deviceType, deviceId, platformId, false, command_queue_flags, profile_count, setupData, hasPrecallback );
+		else
+			return transform<double>( lengths, iStrides, oStrides, batchSize, inLayout, outLayout, place, precision, dir, deviceType, deviceId, platformId, false, command_queue_flags, profile_count, setupData, hasPrecallback );
+	}
+	catch( std::exception& e )
+	{
+		terr << _T( "clFFT error condition reported:" ) << std::endl << e.what() << std::endl;
+		return 1;
+	}
+	return 0;
+}
\ No newline at end of file
diff --git a/src/client-callback/client.h b/src/client-callback/client.h
new file mode 100644
index 0000000..a1e100d
--- /dev/null
+++ b/src/client-callback/client.h
@@ -0,0 +1,70 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( CLIENT_H )
+#define CLIENT_H
+
+//	Boost headers that we want to use
+//	#define BOOST_PROGRAM_OPTIONS_DYN_LINK
+#include <boost/program_options.hpp>
+
+#define CALLBCKSTR(...) #__VA_ARGS__	// stringizes the argument list exactly as written (no macro expansion of the arguments)
+#define STRINGIFY(...) 	CALLBCKSTR(__VA_ARGS__)	// extra indirection: arguments are macro-expanded first, then stringized
+
+#ifdef WIN32
+
+struct Timer
+{
+    LARGE_INTEGER start, stop, freq;
+
+public:
+    Timer() { ::QueryPerformanceFrequency( &freq ); }
+    //  Start() stores the counter value that Sample() measures from.
+    void Start() { ::QueryPerformanceCounter( &start ); }
+    double Sample()
+    {   //  Seconds since Start(): counter delta divided by the frequency read in the constructor.
+        ::QueryPerformanceCounter( &stop );
+        const double ticks = (double)( stop.QuadPart - start.QuadPart );
+        return ticks / (double)( freq.QuadPart );
+    }
+};
+
+#else
+
+#include <time.h>
+#include <math.h>
+
+struct Timer
+{
+    struct timespec start, end;
+
+public:
+    Timer() : start(), end() { }    //  zero both stamps so Sample() before Start() reads defined values
+    //  Start() records a CLOCK_MONOTONIC stamp; Sample() returns the seconds elapsed since that stamp.
+    void Start() { clock_gettime(CLOCK_MONOTONIC, &start); }
+    double Sample()
+    {
+        clock_gettime(CLOCK_MONOTONIC, &end);
+        const double secs = (double)(end.tv_sec - start.tv_sec);    //  done in double: 1000000000L * seconds overflows a 32-bit long
+        return secs + 1E-9 * (double)(end.tv_nsec - start.tv_nsec);
+    }
+};
+
+#endif
+
+#endif
diff --git a/src/client-callback/openCL.misc.cpp b/src/client-callback/openCL.misc.cpp
new file mode 100644
index 0000000..cb5db29
--- /dev/null
+++ b/src/client-callback/openCL.misc.cpp
@@ -0,0 +1,536 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// openCL.misc.cpp : Provides functions to set up openCL
+//
+
+#include "stdafx.h"
+#include <stdexcept>
+#include <iomanip>
+#include <sstream>
+#include <cstring>
+#include <vector>
+#include "clFFT.h"
+#include "openCL.misc.h"
+
+
+
+void prettyPrintPlatformInfo( const cl_platform_id& pId )
+{   //  Prints the profile, version, name, vendor and extensions strings of platform pId to std::cout; a failing query throws via OPENCL_V_THROW.
+    size_t platformProfileSize	= 0;    //  each string takes two calls: the first asks for its size, the second fills a buffer of that size
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, 0, NULL, &platformProfileSize ),
+        "Getting CL_PLATFORM_PROFILE Platform Info string size ( ::clGetPlatformInfo() )" );
+
+    std::vector< char > szPlatformProfile( platformProfileSize );
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_PROFILE, platformProfileSize, &szPlatformProfile[ 0 ], NULL),
+        "Getting CL_PLATFORM_PROFILE Platform Info string ( ::clGetPlatformInfo() )" );
+
+    size_t platformVersionSize	= 0;
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, 0, NULL, &platformVersionSize ),
+        "Getting CL_PLATFORM_VERSION Platform Info string size ( ::clGetPlatformInfo() )" );
+
+    std::vector< char > szPlatformVersion( platformVersionSize );
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VERSION, platformVersionSize, &szPlatformVersion[ 0 ], NULL),
+        "Getting CL_PLATFORM_VERSION Platform Info string ( ::clGetPlatformInfo() )" );
+
+    size_t platformNameSize	= 0;
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, 0, NULL, &platformNameSize ),
+        "Getting CL_PLATFORM_NAME Platform Info string size ( ::clGetPlatformInfo() )" );
+
+    std::vector< char > szPlatformName( platformNameSize );
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_NAME, platformNameSize, &szPlatformName[ 0 ], NULL),
+        "Getting CL_PLATFORM_NAME Platform Info string ( ::clGetPlatformInfo() )" );
+
+    size_t vendorStringSize	= 0;
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, 0, NULL, &vendorStringSize ),
+        "Getting CL_PLATFORM_VENDOR Platform Info string size ( ::clGetPlatformInfo() )" );
+
+    std::vector< char > szPlatformVendor( vendorStringSize );
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_VENDOR, vendorStringSize, &szPlatformVendor[ 0 ], NULL),
+        "Getting CL_PLATFORM_VENDOR Platform Info string ( ::clGetPlatformInfo() )" );
+
+    size_t platformExtensionsSize	= 0;
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, 0, NULL, &platformExtensionsSize ),
+        "Getting CL_PLATFORM_EXTENSIONS Platform Info string size ( ::clGetPlatformInfo() )" );
+
+    std::vector< char > szPlatformExtensions( platformExtensionsSize );
+    OPENCL_V_THROW( ::clGetPlatformInfo( pId, CL_PLATFORM_EXTENSIONS, platformExtensionsSize, &szPlatformExtensions[ 0 ], NULL),
+        "Getting CL_PLATFORM_EXTENSIONS Platform Info string ( ::clGetPlatformInfo() )" );
+
+    const int indent = countOf( "    CL_PLATFORM_EXTENSIONS: " );    //  column width taken from the longest label printed below
+    std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_PROFILE: " << &szPlatformProfile[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_VERSION: " << &szPlatformVersion[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_NAME: " << &szPlatformName[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_VENDOR: " << &szPlatformVendor[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_PLATFORM_EXTENSIONS: " << &szPlatformExtensions[ 0 ] << std::endl;
+    std::cout << std::right << std::endl;    //  put the stream back to right alignment and end with an empty line
+}
+
+void prettyPrintDeviceInfo( const cl_device_id& dId )
+{   //  Queries name, versions, type, limits, memory sizes and extensions of device dId and prints them to std::cout; a failing query throws via OPENCL_V_THROW.
+    size_t deviceNameSize	= 0;    //  string properties take two calls: size first, then the contents
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, 0, NULL, &deviceNameSize ),
+        "Getting CL_DEVICE_NAME Platform Info string size ( ::clGetDeviceInfo() )" );
+
+    std::vector< char > szDeviceName( deviceNameSize );
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_NAME, deviceNameSize, &szDeviceName[ 0 ], NULL ),
+        "Getting CL_DEVICE_NAME Platform Info string ( ::clGetDeviceInfo() )" );
+
+    size_t deviceVersionSize	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
+        "Getting CL_DEVICE_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+    std::vector< char > szDeviceVersion( deviceVersionSize );
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
+        "Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+    size_t driverVersionSize	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, 0, NULL, &driverVersionSize ),
+        "Getting CL_DRIVER_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+    std::vector< char > szDriverVersion( driverVersionSize );
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DRIVER_VERSION, driverVersionSize, &szDriverVersion[ 0 ], NULL ),
+        "Getting CL_DRIVER_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+    size_t openCLVersionSize	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &openCLVersionSize ),
+        "Getting CL_DEVICE_OPENCL_C_VERSION Platform Info string size ( ::clGetDeviceInfo() )" );
+
+    std::vector< char > szOpenCLVersion( openCLVersionSize );
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_OPENCL_C_VERSION, openCLVersionSize, &szOpenCLVersion[ 0 ], NULL ),
+        "Getting CL_DEVICE_OPENCL_C_VERSION Platform Info string ( ::clGetDeviceInfo() )" );
+
+    cl_device_type devType = CL_DEVICE_TYPE_DEFAULT;    //  fixed-size properties are read with a single call
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_TYPE, sizeof( cl_device_type ), &devType, NULL ),
+        "Getting CL_DEVICE_TYPE device info ( ::clGetDeviceInfo() )" );
+
+    cl_uint devAddrBits = 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_ADDRESS_BITS, sizeof( cl_uint ), &devAddrBits, NULL ),
+        "Getting CL_DEVICE_ADDRESS_BITS device info ( ::clGetDeviceInfo() )" );
+
+    cl_uint maxClockFreq = 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof( cl_uint ), &maxClockFreq, NULL ),
+        "Getting CL_DEVICE_MAX_CLOCK_FREQUENCY device info ( ::clGetDeviceInfo() )" );
+
+    cl_bool devAvailable = CL_FALSE;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_AVAILABLE, sizeof( cl_bool ), &devAvailable, NULL ),
+        "Getting CL_DEVICE_AVAILABLE device info ( ::clGetDeviceInfo() )" );
+
+    cl_bool devCompAvailable = CL_FALSE;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_COMPILER_AVAILABLE, sizeof( cl_bool ), &devCompAvailable, NULL ),
+        "Getting CL_DEVICE_COMPILER_AVAILABLE device info ( ::clGetDeviceInfo() )" );
+
+    size_t devMaxWorkGroup	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( size_t ), &devMaxWorkGroup, NULL ),
+        "Getting CL_DEVICE_MAX_WORK_GROUP_SIZE device info ( ::clGetDeviceInfo() )" );
+
+    cl_uint devMaxWorkItemDim = CL_FALSE;    //  NOTE(review): CL_FALSE appears to serve only as a zero initializer for this count - confirm
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( cl_uint ), &devMaxWorkItemDim, NULL ),
+        "Getting CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS device info ( ::clGetDeviceInfo() )" );
+
+    std::vector< size_t >	devMaxWorkItemSizes( devMaxWorkItemDim );    //  one entry per dimension reported by the query above
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof( size_t )*devMaxWorkItemSizes.size( ), &devMaxWorkItemSizes[0], NULL),
+        "Getting CL_DEVICE_MAX_WORK_ITEM_SIZES device info ( ::clGetDeviceInfo() )" );
+
+    cl_bool deviceHostUnified = 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof( cl_bool ), &deviceHostUnified, NULL ),
+        "Getting CL_DEVICE_HOST_UNIFIED_MEMORY Platform Info string ( ::clGetDeviceInfo() )" );
+
+    cl_ulong devMaxConstantBuffer	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( cl_ulong ), &devMaxConstantBuffer, NULL ),
+        "Getting CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE device info ( ::clGetDeviceInfo() )" );
+
+    cl_ulong devLocalMemSize	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( cl_ulong ), &devLocalMemSize, NULL ),
+        "Getting CL_DEVICE_LOCAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
+
+    cl_ulong deviceGlobalMemSize = 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( cl_ulong ), &deviceGlobalMemSize, NULL ),
+        "Getting CL_DEVICE_GLOBAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
+
+    cl_ulong deviceMaxMemAllocSize = 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( cl_ulong ), &deviceMaxMemAllocSize, NULL ),
+        "Getting CL_DEVICE_MAX_MEM_ALLOC_SIZE device info ( ::clGetDeviceInfo() )" );
+
+    size_t deviceExtSize	= 0;
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
+        "Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
+
+    std::vector< char > szDeviceExt( deviceExtSize );
+    OPENCL_V_THROW( ::clGetDeviceInfo( dId, CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
+        "Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
+
+    const int indent = countOf( "    CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " );    //  column width taken from the longest label printed below
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_NAME: " << &szDeviceName[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_VERSION: " << &szDeviceVersion[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DRIVER_VERSION: " << &szDriverVersion[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_TYPE: "
+        << (CL_DEVICE_TYPE_DEFAULT     & devType ? "default"     : "")
+        << (CL_DEVICE_TYPE_CPU         & devType ? "CPU"         : "")
+        << (CL_DEVICE_TYPE_GPU         & devType ? "GPU"         : "")
+        << (CL_DEVICE_TYPE_ACCELERATOR & devType ? "Accelerator" : "")
+        << std::endl;    //  devType is tested bit by bit, so the names of all set bits are printed back to back
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_CLOCK_FREQUENCY: " << maxClockFreq << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_ADDRESS_BITS: " << devAddrBits << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_AVAILABLE: " << ( devAvailable ? "TRUE": "FALSE") << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_COMPILER_AVAILABLE: " << ( devCompAvailable ? "TRUE": "FALSE") << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_OPENCL_C_VERSION: " << &szOpenCLVersion[ 0 ] << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_WORK_GROUP_SIZE: " << devMaxWorkGroup << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " << devMaxWorkItemDim << std::endl;
+    for( cl_uint wis = 0; wis < devMaxWorkItemSizes.size( ); ++wis )
+    {
+        std::stringstream dimString;    //  built separately so setw pads the whole "Dimension[ n ]" label as one field
+        dimString << "Dimension[ " << wis << " ]  ";
+        std::cout << std::right << std::setw( indent ) << dimString.str( ) << devMaxWorkItemSizes[wis] << std::endl;
+    }
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_HOST_UNIFIED_MEMORY: " << ( deviceHostUnified ? "TRUE": "FALSE") << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: " << devMaxConstantBuffer;
+    std::cout << " ( " << devMaxConstantBuffer / 1024 << " KB )" << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_LOCAL_MEM_SIZE: " << devLocalMemSize;
+    std::cout << " ( " << devLocalMemSize / 1024 << " KB )" << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_GLOBAL_MEM_SIZE: " << deviceGlobalMemSize;
+    std::cout << " ( " << deviceGlobalMemSize / 1048576 << " MB )" << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_MAX_MEM_ALLOC_SIZE: " << deviceMaxMemAllocSize;
+    std::cout << " ( " << deviceMaxMemAllocSize / 1048576 << " MB )" << std::endl;
+    std::cout << std::left << std::setw( indent ) << "    CL_DEVICE_EXTENSIONS: " << &szDeviceExt[ 0 ] << std::endl;
+
+    std::cout << std::right << std::endl;
+}
+
+void prettyPrintCLPlatforms(std::vector< cl_platform_id >& platforms,
+	std::vector< std::vector< cl_device_id > >& devices)
+{	// Prints every platform, each followed by the devices stored at the same index of `devices`.
+	for (unsigned int platformIdx = 0; platformIdx < platforms.size(); ++platformIdx)
+	{
+		std::cout << "OpenCL platform [ " << platformIdx << " ]:" << std::endl;
+		prettyPrintPlatformInfo(platforms[platformIdx]);
+
+		const std::vector< cl_device_id >& platformDevices = devices[platformIdx];
+		for (unsigned int deviceIdx = 0; deviceIdx < platformDevices.size(); ++deviceIdx)
+		{
+			std::cout << "OpenCL platform [ " << platformIdx << " ], device [ " << deviceIdx << " ]:" << std::endl;
+			prettyPrintDeviceInfo(platformDevices[deviceIdx]);
+		}
+	}
+}
+
+//	Verify a failed condition; return true on fail
+inline cl_bool OPENCL_V_FAIL( cl_int res )
+{
+    //  Collapse an OpenCL status code into a boolean:
+    //  CL_SUCCESS is the only value that does not count as a failure.
+    const cl_bool failed = ( res != CL_SUCCESS ) ? CL_TRUE : CL_FALSE;
+    return failed;
+}
+
+std::string prettyPrintclFFTStatus( const cl_int& status )
+{   //  Returns the name of the CLFFT_* enumerator equal to status, or "Error code not defined" when no case matches.
+    switch( status )
+    {
+        case CLFFT_INVALID_GLOBAL_WORK_SIZE:
+            return "CLFFT_INVALID_GLOBAL_WORK_SIZE";
+        case CLFFT_INVALID_MIP_LEVEL:
+            return "CLFFT_INVALID_MIP_LEVEL";
+        case CLFFT_INVALID_BUFFER_SIZE:
+            return "CLFFT_INVALID_BUFFER_SIZE";
+        case CLFFT_INVALID_GL_OBJECT:
+            return "CLFFT_INVALID_GL_OBJECT";
+        case CLFFT_INVALID_OPERATION:
+            return "CLFFT_INVALID_OPERATION";
+        case CLFFT_INVALID_EVENT:
+            return "CLFFT_INVALID_EVENT";
+        case CLFFT_INVALID_EVENT_WAIT_LIST:
+            return "CLFFT_INVALID_EVENT_WAIT_LIST";
+        case CLFFT_INVALID_GLOBAL_OFFSET:
+            return "CLFFT_INVALID_GLOBAL_OFFSET";
+        case CLFFT_INVALID_WORK_ITEM_SIZE:
+            return "CLFFT_INVALID_WORK_ITEM_SIZE";
+        case CLFFT_INVALID_WORK_GROUP_SIZE:
+            return "CLFFT_INVALID_WORK_GROUP_SIZE";
+        case CLFFT_INVALID_WORK_DIMENSION:
+            return "CLFFT_INVALID_WORK_DIMENSION";
+        case CLFFT_INVALID_KERNEL_ARGS:
+            return "CLFFT_INVALID_KERNEL_ARGS";
+        case CLFFT_INVALID_ARG_SIZE:
+            return "CLFFT_INVALID_ARG_SIZE";
+        case CLFFT_INVALID_ARG_VALUE:
+            return "CLFFT_INVALID_ARG_VALUE";
+        case CLFFT_INVALID_ARG_INDEX:
+            return "CLFFT_INVALID_ARG_INDEX";
+        case CLFFT_INVALID_KERNEL:
+            return "CLFFT_INVALID_KERNEL";
+        case CLFFT_INVALID_KERNEL_DEFINITION:
+            return "CLFFT_INVALID_KERNEL_DEFINITION";
+        case CLFFT_INVALID_KERNEL_NAME:
+            return "CLFFT_INVALID_KERNEL_NAME";
+        case CLFFT_INVALID_PROGRAM_EXECUTABLE:
+            return "CLFFT_INVALID_PROGRAM_EXECUTABLE";
+        case CLFFT_INVALID_PROGRAM:
+            return "CLFFT_INVALID_PROGRAM";
+        case CLFFT_INVALID_BUILD_OPTIONS:
+            return "CLFFT_INVALID_BUILD_OPTIONS";
+        case CLFFT_INVALID_BINARY:
+            return "CLFFT_INVALID_BINARY";
+        case CLFFT_INVALID_SAMPLER:
+            return "CLFFT_INVALID_SAMPLER";
+        case CLFFT_INVALID_IMAGE_SIZE:
+            return "CLFFT_INVALID_IMAGE_SIZE";
+        case CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+            return "CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+        case CLFFT_INVALID_MEM_OBJECT:
+            return "CLFFT_INVALID_MEM_OBJECT";
+        case CLFFT_INVALID_HOST_PTR:
+            return "CLFFT_INVALID_HOST_PTR";
+        case CLFFT_INVALID_COMMAND_QUEUE:
+            return "CLFFT_INVALID_COMMAND_QUEUE";
+        case CLFFT_INVALID_QUEUE_PROPERTIES:
+            return "CLFFT_INVALID_QUEUE_PROPERTIES";
+        case CLFFT_INVALID_CONTEXT:
+            return "CLFFT_INVALID_CONTEXT";
+        case CLFFT_INVALID_DEVICE:
+            return "CLFFT_INVALID_DEVICE";
+        case CLFFT_INVALID_PLATFORM:
+            return "CLFFT_INVALID_PLATFORM";
+        case CLFFT_INVALID_DEVICE_TYPE:
+            return "CLFFT_INVALID_DEVICE_TYPE";
+        case CLFFT_INVALID_VALUE:
+            return "CLFFT_INVALID_VALUE";
+        case CLFFT_MAP_FAILURE:
+            return "CLFFT_MAP_FAILURE";
+        case CLFFT_BUILD_PROGRAM_FAILURE:
+            return "CLFFT_BUILD_PROGRAM_FAILURE";
+        case CLFFT_IMAGE_FORMAT_NOT_SUPPORTED:
+            return "CLFFT_IMAGE_FORMAT_NOT_SUPPORTED";
+        case CLFFT_IMAGE_FORMAT_MISMATCH:
+            return "CLFFT_IMAGE_FORMAT_MISMATCH";
+        case CLFFT_MEM_COPY_OVERLAP:
+            return "CLFFT_MEM_COPY_OVERLAP";
+        case CLFFT_PROFILING_INFO_NOT_AVAILABLE:
+            return "CLFFT_PROFILING_INFO_NOT_AVAILABLE";
+        case CLFFT_OUT_OF_HOST_MEMORY:
+            return "CLFFT_OUT_OF_HOST_MEMORY";
+        case CLFFT_OUT_OF_RESOURCES:
+            return "CLFFT_OUT_OF_RESOURCES";
+        case CLFFT_MEM_OBJECT_ALLOCATION_FAILURE:
+            return "CLFFT_MEM_OBJECT_ALLOCATION_FAILURE";
+        case CLFFT_COMPILER_NOT_AVAILABLE:
+            return "CLFFT_COMPILER_NOT_AVAILABLE";
+        case CLFFT_DEVICE_NOT_AVAILABLE:
+            return "CLFFT_DEVICE_NOT_AVAILABLE";
+        case CLFFT_DEVICE_NOT_FOUND:
+            return "CLFFT_DEVICE_NOT_FOUND";
+        case CLFFT_SUCCESS:
+            return "CLFFT_SUCCESS";
+        case CLFFT_NOTIMPLEMENTED:
+            return "CLFFT_NOTIMPLEMENTED";
+        case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
+            return "CLFFT_TRANSPOSED_NOTIMPLEMENTED";
+        case CLFFT_FILE_NOT_FOUND:
+            return "CLFFT_FILE_NOT_FOUND";
+        case CLFFT_FILE_CREATE_FAILURE:
+            return "CLFFT_FILE_CREATE_FAILURE";
+        case CLFFT_VERSION_MISMATCH:
+            return "CLFFT_VERSION_MISMATCH";
+        case CLFFT_INVALID_PLAN:
+            return "CLFFT_INVALID_PLAN";
+        default:
+            return "Error code not defined";
+        break;    //  never reached: the default label returns on the line above
+    }
+}
+
+
+int discoverCLPlatforms( cl_device_type deviceType,
+						 std::vector< cl_platform_id >& platforms,
+						 std::vector< std::vector< cl_device_id > >& devices )
+{	// Fills platforms with every platform id and devices[i] with the deviceType devices of platforms[i] (left empty when there are none); always returns 0.
+	platforms.clear();	// outputs describe this discovery only, even when the caller reuses the vectors
+	devices.clear();
+	/*
+	* Find all OpenCL platforms this system has to offer.
+	*/
+
+	cl_uint numPlatforms = 0;
+	// The first call only asks how many platforms exist; the second call below fetches their ids.
+	OPENCL_V_THROW(::clGetPlatformIDs(0, NULL, &numPlatforms),
+		"Getting number of platforms( ::clGetPlatformsIDs() )");
+
+	if (numPlatforms > 0)
+	{
+		platforms.resize( numPlatforms );
+		devices.resize( numPlatforms );
+		OPENCL_V_THROW(::clGetPlatformIDs(numPlatforms, &platforms[0], NULL),
+			"Getting Platform Id's ( ::clGetPlatformsIDs() )");
+
+		if (NULL == platforms[0])
+		{
+			throw std::runtime_error("No appropriate OpenCL platform could be found");
+		}
+		
+		/*
+		* Now, for each platform get all available devices matching deviceType.
+		*/
+		for (unsigned int i = 0; i < numPlatforms; ++i)
+		{
+			//	Get the device list for deviceType.
+			//
+			cl_uint numDevices = 0;
+			OPENCL_V_WARN(::clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices),
+				"Getting OpenCL devices ( ::clGetDeviceIDs() )");
+			if (0 == numDevices)
+			{
+				// No device of this type on this platform (or the count query failed): devices[i] stays empty.
+				continue;
+			}
+
+			devices[i].resize(numDevices);
+			OPENCL_V_THROW(::clGetDeviceIDs(platforms[i], deviceType, numDevices, &(devices[i])[0], NULL),
+				"Getting OpenCL deviceIDs ( ::clGetDeviceIDs() )");
+		}
+	}
+
+	return 0;
+}
+
+std::vector< cl_device_id > initializeCL( cl_device_type deviceType,
+										  cl_int deviceId,
+										  cl_int platformId,
+										  cl_context& context,
+										  bool printclInfo )
+{
+	cl_int status = 0;
+	cl_platform_id platform = NULL;
+	std::vector< cl_device_id > devices(1);
+	devices[0] = NULL;
+	
+	// Have a look at all the available platforms on this system
+	std::vector< cl_platform_id > platformInfos;
+	std::vector< std::vector< cl_device_id > > deviceInfos;
+	discoverCLPlatforms( deviceType, platformInfos, deviceInfos );
+
+	// Pick the platform/device pair addressed by the zero-based ids; negative or out-of-range ids match nothing.
+	for (unsigned int i = 0; i < platformInfos.size(); ++i)
+	{
+		if (static_cast< cl_int >( i ) == platformId)
+		{
+			platform = platformInfos[i];	// remembered even when the device lookup below finds nothing
+			for (unsigned int n = 0; n < deviceInfos[i].size(); ++n)
+			{
+				if (static_cast< cl_int >( n ) == deviceId)
+				{
+					devices[0] = deviceInfos[i][n];
+
+					if(printclInfo)
+					{
+						prettyPrintPlatformInfo(platform);
+						prettyPrintDeviceInfo(devices[0]);
+					}
+
+					break;
+				}
+			}
+
+			break;
+		}
+	}
+
+	// The platform is checked first: without a platform no device list was searched,
+	// so testing the device first would report a bad platformId as "No devices available".
+	// Do some error checking if we really selected a valid platform and a valid device
+	if (NULL == platform)
+	{
+		throw std::runtime_error("No appropriate OpenCL platform could be found");
+	}
+
+	if (NULL == devices[0])
+	{
+		OPENCL_V_THROW(CLFFT_DEVICE_NOT_AVAILABLE, "No devices available");
+	}	
+		
+	// Create an OpenCL context
+	cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0 };
+	context = clCreateContext(cps,
+		(cl_uint)devices.size(),
+		&devices[0],
+		NULL,
+		NULL,
+		&status);
+	OPENCL_V_THROW(status, "Creating Context ( ::clCreateContext() )");
+
+	return devices;
+}
+
+int cleanupCL( cl_context* context, cl_command_queue* commandQueue,
+    const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[], cl_event* outEvent )
+{   //  Releases, in this order: the event, the input and output buffers, the command queue, the context; handles that are NULL are skipped.
+    if( *outEvent != NULL )
+        OPENCL_V_THROW( clReleaseEvent( *outEvent ), "Error: In clReleaseEvent\n" );
+    //  outEvent, commandQueue and context are dereferenced without a NULL check on the pointers themselves.
+    releaseOpenCLMemBuffer( numBuffersIn, inputBuffer);
+    releaseOpenCLMemBuffer( numBuffersOut, outputBuffer);
+
+    if( *commandQueue != NULL )
+        OPENCL_V_THROW( clReleaseCommandQueue( *commandQueue ), "Error: In clReleaseCommandQueue\n" );
+
+    if( *context != NULL )
+        OPENCL_V_THROW( clReleaseContext( *context ), "Error: In clReleaseContext\n" );
+    //  Reaching this point means every release succeeded; a failing one throws through OPENCL_V_THROW.
+    return 0;
+}
+
+int createOpenCLMemoryBuffer( cl_context& context, const size_t bufferSizeBytes, const cl_uint numBuffers, cl_mem buffer[], cl_mem_flags accessibility) {
+    //  Creates numBuffers buffers of bufferSizeBytes each (no host pointer) and stores them in buffer[];
+    //  OPENCL_V_THROW throws on the first creation that does not report CL_SUCCESS. Returns 0 otherwise.
+    for( cl_uint idx = 0; idx != numBuffers; ++idx )
+    {
+        cl_int createStatus = 0;
+        buffer[ idx ] = ::clCreateBuffer( context, accessibility, bufferSizeBytes, NULL, &createStatus);
+        OPENCL_V_THROW( createStatus, "Creating Buffer ( ::clCreateBuffer() )" );
+    }
+    return 0;
+}
+
+int releaseOpenCLMemBuffer( const cl_uint numBuffers, cl_mem buffer[])
+{   //  Releases every non-NULL entry among the first numBuffers slots of buffer[]; returns 0.
+    for( cl_uint idx = 0; idx != numBuffers; ++idx )
+    {
+        if( NULL == buffer[ idx ] )
+            continue;
+        OPENCL_V_THROW( clReleaseMemObject( buffer[ idx ] ), "Error: In clReleaseMemObject\n" );
+    }
+    return 0;
+}
+
+void createOpenCLCommandQueue( cl_context& context,
+                               cl_uint commandQueueFlags,
+                               cl_command_queue& commandQueue,
+                               std::vector< cl_device_id > devices,
+                               const size_t bufferSizeBytesIn,
+                               const cl_uint numBuffersIn,
+                               cl_mem clMemBufferIn[],
+                               const size_t bufferSizeBytesOut,
+                               const cl_uint numBuffersOut,
+                               cl_mem clMemBufferOut[] )
+{
+    cl_int queueStatus = CL_SUCCESS;
+    commandQueue = ::clCreateCommandQueue( context, devices.front( ), commandQueueFlags, &queueStatus );
+    OPENCL_V_THROW( queueStatus, "Creating Command Queue ( ::clCreateCommandQueue() )" );
+    //  The queue is created on the first device only; input buffers are created before output buffers, both read-write.
+    createOpenCLMemoryBuffer( context, bufferSizeBytesIn,  numBuffersIn,  clMemBufferIn,  CL_MEM_READ_WRITE );
+    createOpenCLMemoryBuffer( context, bufferSizeBytesOut, numBuffersOut, clMemBufferOut, CL_MEM_READ_WRITE );
+}
+
diff --git a/src/client-callback/openCL.misc.h b/src/client-callback/openCL.misc.h
new file mode 100644
index 0000000..07cc416
--- /dev/null
+++ b/src/client-callback/openCL.misc.h
@@ -0,0 +1,151 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( OPENCL_MISC_H )
+#define OPENCL_MISC_H
+#include <memory>
+#include <stdexcept>
+#include "unicode.compatibility.h"
+
+//	Creating a portable definition of countof: number of elements in a true array (not a pointer)
+#if defined( _MSC_VER )
+	#define countOf _countof
+#else
+	#define countOf( arr ) ( sizeof( arr ) / sizeof( arr[ 0 ] ) )
+#endif
+
+/*
+ * \brief OpenCL platform and device discovery
+ *        Creates a list of OpenCL platforms
+ *        and their associated devices 
+ */
+int discoverCLPlatforms( cl_device_type deviceType,
+					     std::vector< cl_platform_id >& platforms,
+						 std::vector< std::vector< cl_device_id > >& devices );
+
+void prettyPrintCLPlatforms(std::vector< cl_platform_id >& platforms,
+	std::vector< std::vector< cl_device_id > >& devices);
+
+/*
+ * \brief OpenCL related initialization
+ *        Discover the platforms and their devices, select the
+ *        device addressed by platformId/deviceId and create a
+ *		  context for it; returns the selected device list
+ */
+std::vector< cl_device_id > initializeCL( cl_device_type deviceType,
+										  cl_int deviceId,
+										  cl_int platformId,
+										  cl_context& context,
+										  bool printclInfo );
+
+/*
+ * \brief OpenCL memory buffer creation
+ */
+int createOpenCLMemoryBuffer(
+		cl_context& context,
+		const size_t bufferSizeBytes,
+		const cl_uint numBuffers,
+		cl_mem buffer[],
+		cl_mem_flags accessibility
+		);
+
+/*
+ * \brief OpenCL command queue creation
+ *        Create Command Queue
+ *        Create OpenCL memory buffer objects
+ */
+void createOpenCLCommandQueue( cl_context& context,
+							   cl_uint commandQueueFlags,
+							   cl_command_queue& commandQueue,
+							   std::vector< cl_device_id > devices,
+							   const size_t bufferSizeBytesIn,
+							   const cl_uint numBuffersIn,
+							   cl_mem clMemBufferIn[],
+							   const size_t bufferSizeBytesOut,
+							   const cl_uint numBuffersOut,
+							   cl_mem clMemBufferOut[] );
+
+/*
+ * \brief release OpenCL memory buffer
+ */
+int releaseOpenCLMemBuffer( const cl_uint numBuffers, cl_mem buffer[] );
+
+std::string prettyPrintclFFTStatus( const cl_int& status );
+
+//	This is used to either wrap an OpenCL function call, or to explicitly check a variable for an OpenCL error condition.
+//	If an error occurs, we throw.
+//	Note: std::runtime_error does not take unicode strings as input, so only strings supported
+inline cl_int OpenCL_V_Throw ( cl_int res, const std::string& msg, size_t lineno )
+{
+	//	Success is handed straight back, so the call can wrap any
+	//	expression that yields an OpenCL status.
+	if( res == CL_SUCCESS )
+		return	res;
+
+	//	Every other status is rendered as
+	//		OPENCL_V_THROWERROR< <status name> > (<line>): <msg>
+	//	printed to std::cout and then thrown as std::runtime_error.
+	std::stringstream report;
+	report << "OPENCL_V_THROWERROR< "
+		   << prettyPrintclFFTStatus( res )
+		   << " > ("
+		   << lineno
+		   << "): "
+		   << msg;
+
+	//	The same text is both printed here and carried by the exception.
+	const std::string errorm( report.str() );
+	std::cout << errorm<< std::endl;
+	throw	std::runtime_error( errorm );
+}
+#define OPENCL_V_THROW(_status,_message) OpenCL_V_Throw (_status, _message, __LINE__)
+
+inline cl_int OpenCL_V_Warn(cl_int res, const std::string& msg, size_t lineno)
+{
+	//	CL_SUCCESS needs no report, and CL_DEVICE_NOT_FOUND happens all the time when
+	//	discovering the OpenCL capabilities of the system, so both stay silent.
+	const bool silent = ( res == CL_SUCCESS ) || ( res == CL_DEVICE_NOT_FOUND );
+	if( !silent )
+	{
+		//	Any other status is only printed, never thrown, in the form
+		//		OPENCL_V_WARN< <status name> > (<line>): <msg>
+		std::stringstream report;
+		report << "OPENCL_V_WARN< "
+			   << prettyPrintclFFTStatus(res)
+			   << " > ("
+			   << lineno
+			   << "): "
+			   << msg;
+
+		const std::string errorm( report.str() );
+		std::cout << errorm << std::endl;
+	}
+
+	//	The status is always handed back unchanged.
+	return	res;
+}
+//	NOTE(review): unlike OPENCL_V_THROW, this macro expands with its own trailing ';', so a use
+//	such as "if( c ) OPENCL_V_WARN( s, m ); else ..." would not compile - confirm before changing.
+#define OPENCL_V_WARN(_status,_message) OpenCL_V_Warn (_status, _message, __LINE__);
+
+/*
+ * \brief Release OpenCL resources (Context, Memory etc.)
+ */
+int cleanupCL( cl_context* context, cl_command_queue* commandQueue, const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[], cl_event* outEvent );
+
+#endif
diff --git a/src/client-callback/stdafx.cpp b/src/client-callback/stdafx.cpp
new file mode 100644
index 0000000..2587b2c
--- /dev/null
+++ b/src/client-callback/stdafx.cpp
@@ -0,0 +1,25 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// stdafx.cpp : source file that includes just the standard includes
+// clFFT.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+
+#include "stdafx.h"
+
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/src/include/clFFT.h b/src/include/clFFT.h
index 95ded9f..5de09b1 100644
--- a/src/include/clFFT.h
+++ b/src/include/clFFT.h
@@ -213,6 +213,14 @@ struct clfftSetupData_
 };
 typedef struct clfftSetupData_ clfftSetupData;
 
+/*! @brief Type of callback function.
+*/
+typedef enum clFFTCallbackType_
+{
+	PRECALLBACK,	/*!< Callback function is invoked once for each point of input, at the beginning of the FFT transform */
+	POSTCALLBACK	/*!< Callback function is invoked once for each point of output, at the end of the FFT transform */
+}clFFTCallbackType;
+
 /*!  @brief An abstract handle to the object that represents the state of the FFT(s) */
 typedef size_t clfftPlanHandle;
 
@@ -541,6 +549,22 @@ extern "C" {
 	 */
 	CLFFTAPI clfftStatus clfftGetTmpBufSize( const clfftPlanHandle plHandle, size_t* buffersize );
 
+	/*! @brief Register the callback parameters
+	 *  @details Client can provide a callback function to do custom processing when reading input data and/or 
+	 *  when writing output data. The callback function is provided as a string.
+	 *  clFFT library incorporates the callback function string into the main FFT kernel. This function is used
+	 *  by client to set the necessary parameters for callback
+	 *  @param[in] plHandle Handle to a plan previously created
+	 *  @param[in] funcName Callback function name
+	 *  @param[in] funcString Callback function in string form
+	 *  @param[in] userStructString Optional - Custom data struct in string form used by Callback function. Pass NULL if callback has no custom data type
+	 *  @param[in] localMemSize Optional - Local memory size if needed by callback. Pass 0 if local memory not needed by callback
+	 *  @param[in] callbackType Type of callback - Pre-Callback or Post-Callback
+	 *  @param[in] userdata cl_mem object passed as parameter to callback function
+	 */
+	CLFFTAPI clfftStatus clFFTSetPlanCallback(clfftPlanHandle plHandle, const char* funcName, const char* funcString, const char* userStructString, int localMemSize, clFFTCallbackType callbackType, void *userdata);
+
+
 	/*! @brief Enqueue an FFT transform operation, and return immediately (non-blocking)
 	 *  @details This transform API is the function that actually computes the FFT transfrom. It is non-blocking as it
 	 *  only enqueues the OpenCL kernels for execution. The synchronization step has to be managed by the user.
diff --git a/src/library/accessors.cpp b/src/library/accessors.cpp
index 4b5bb4f..c47075e 100644
--- a/src/library/accessors.cpp
+++ b/src/library/accessors.cpp
@@ -765,4 +765,39 @@ clfftStatus clfftLocalMemSize( const clfftPlanHandle plHandle, cl_ulong* local_m
 
 	*local_mem_size = plan->envelope.limit_LocalMemSize;
 	return CLFFT_SUCCESS;
+}
+
+clfftStatus clFFTSetPlanCallback(clfftPlanHandle plHandle, const char* funcName, 
+								 const char* funcString, const char* userStructString, 
+								 int localMemSize, clFFTCallbackType callbackType, 
+								 void *userdata)
+{
+	// Registers a pre-callback on the plan. The string pointers are stored as given, not copied.
+	FFTRepo& fftRepo	= FFTRepo::getInstance( );
+	FFTPlan* fftPlan	= NULL;
+	lockRAII* planLock	= NULL;
+
+	OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
+	scopedLock sLock( *planLock, _T( "clFFTSetPlanCallback" ) );
+
+	if (callbackType == PRECALLBACK)
+	{
+		// Pre-callback is only implemented for 1D transforms with complex (interleaved or planar) input
+		if (fftPlan->dim != CLFFT_1D || (fftPlan->inputLayout != CLFFT_COMPLEX_INTERLEAVED && fftPlan->inputLayout != CLFFT_COMPLEX_PLANAR))
+			return CLFFT_NOTIMPLEMENTED;
+
+		// A missing name or source is an error; do not report success without registering anything
+		if (funcName == NULL || funcString == NULL)
+			return CLFFT_INVALID_ARG_VALUE;
+
+		fftPlan->hasPreCallback = true;
+		fftPlan->preCallback.funcname = funcName;
+		fftPlan->preCallback.funcstring = funcString;
+		fftPlan->preCallback.userdatastruct = userStructString;
+		fftPlan->preCallback.localMemSize = (localMemSize > 0) ? localMemSize : 0;	// negative sizes mean "no local memory"
+		fftPlan->precallUserData = userdata;
+	}
+
+	// Any other callback type is accepted but leaves the plan untouched
+	return	CLFFT_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/library/action.cpp b/src/library/action.cpp
index e1506ff..194df1c 100644
--- a/src/library/action.cpp
+++ b/src/library/action.cpp
@@ -582,6 +582,19 @@ clfftStatus FFTAction::enqueue(clfftPlanHandle plHandle,
         OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&outputBuff[o] ), _T( "clSetKernelArg failed" ) );
     }
 
+	//If pre-callback function is set for the plan, pass the appropriate arguments
+	if (this->plan->hasPreCallback)
+	{
+		OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&this->plan->precallUserData ), _T( "clSetKernelArg failed" ) );
+
+		//Pass LDS size argument if set
+		if (this->plan->preCallback.localMemSize > 0)
+		{
+			//TODO: Check for available LDS beyond what FFT already uses
+			OPENCL_V( clSetKernelArg( kern, uarg++, this->plan->preCallback.localMemSize, NULL ), _T( "clSetKernelArg failed" ) );
+		}
+	}
+
     std::vector< size_t > gWorkSize;
     std::vector< size_t > lWorkSize;
     clfftStatus result = this->getWorkSizes (gWorkSize, lWorkSize);
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 93d073d..5fd0cd9 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -612,6 +612,10 @@ namespace StockhamGenerator
 		bool halfLds;						// only half the LDS of a complex length need to be used
 		Pass<PR> *nextPass;
 
+		//callback members
+		bool fft_doPreCallback;
+		clfftCallbackParam fft_preCallback;
+
 		inline void RegBase(size_t regC, std::string &str) const
 		{
 			str += "B";
@@ -875,14 +879,51 @@ namespace StockhamGenerator
 									regIndexSub += SztToStr(v);
 								}
 
+								//get offset 
+								std::string bufOffset;
+								bufOffset += offset; bufOffset += " + ( "; bufOffset += SztToStr(numPrev); bufOffset += " + ";
+								bufOffset += "me*"; bufOffset += SztToStr(numButterfly); bufOffset += " + ";
+								bufOffset += SztToStr(i*regC + v); bufOffset += " + ";
+								bufOffset += SztToStr(r*length/radix); bufOffset += " )*";
+								bufOffset += SztToStr(stride);
+
+								//If precallback is set invoke callback function
+								//Invoke callback only once in Planar data layout (i.e. c == 0)
+								if (fft_doPreCallback && c == 0)
+								{
+									passStr += "\n\t";
+									passStr += "retPrecallback["; passStr += SztToStr(v); passStr += "] = "; passStr += fft_preCallback.funcname; passStr += "("; 
+									if(interleaved)
+									{
+										passStr += buffer; passStr += ", ";
+									}
+									else
+									{
+										passStr += bufferRe; passStr += ", "; passStr += bufferIm; passStr += ", ";
+									}
+									passStr += bufOffset; passStr += ", userdata";
+									if (fft_preCallback.localMemSize > 0)
+									{
+										passStr += ", localmem";
+									}
+									passStr += ");";
+								}
+
 								passStr += "\n\t";
 								passStr += regIndexSub;
-								passStr += " = "; passStr += buffer;
-								passStr += "["; passStr += offset; passStr += " + ( "; passStr += SztToStr(numPrev); passStr += " + ";
-								passStr += "me*"; passStr += SztToStr(numButterfly); passStr += " + ";
-								passStr += SztToStr(i*regC + v); passStr += " + ";
-								passStr += SztToStr(r*length/radix); passStr += " )*";
-								passStr += SztToStr(stride); passStr += "]"; passStr += tail;
+								passStr += " = "; 
+
+								//Use the return value from precallback if set
+								if (fft_doPreCallback)
+								{
+									passStr += "retPrecallback["; passStr += SztToStr(v); passStr += "]"; 
+									passStr += interleaved ? tail : (c == 0) ? ".x;" : ".y;";
+								}
+								else
+								{
+									passStr += buffer;
+									passStr += "["; passStr += bufOffset; passStr += "]"; passStr += tail;
+								}
 							}
 
 							// Since we read real & imag at once, we break the loop
@@ -1484,7 +1525,7 @@ namespace StockhamGenerator
 			r2c(r2cVal), c2r(c2rVal), rcFull(rcFullVal), rcSimple(rcSimpleVal), realSpecial(realSpecialVal),
 			enableGrouping(true),
 			numB1(0), numB2(0), numB4(0),
-			nextPass(NULL)
+			nextPass(NULL), fft_doPreCallback(false)
 		{
 			assert(radix <= length);
 			assert(length%radix == 0);
@@ -1528,6 +1569,12 @@ namespace StockhamGenerator
 		void SetNextPass(Pass<PR> *np) { nextPass = np; }
 		void SetGrouping(bool grp) { enableGrouping = grp; }
 
+		void SetPrecallback(bool hasPrecallback, clfftCallbackParam precallbackParam) 
+		{ 
+			fft_preCallback = precallbackParam;		// member-wise copy; the char pointers inside are shared, not duplicated
+			fft_doPreCallback = hasPrecallback;		// checked by the pass generator before it emits any callback code
+		}
+
 		void GeneratePass(	bool fwd, std::string &passStr, bool fft_3StepTwiddle,
 							bool inInterleaved, bool outInterleaved,
 							bool inReal, bool outReal,
@@ -1688,6 +1735,18 @@ namespace StockhamGenerator
 			{
 				passStr += ", "; passStr += IterRegArgs();
 			}
+
+			//Include callback parameters if callback is set
+			if (fft_doPreCallback)
+			{
+				passStr += ", __global void* userdata";
+
+				if (fft_preCallback.localMemSize > 0)
+				{
+					passStr += ", __local void* localmem";
+				}
+			}
+
 			passStr += ")\n{\n";
 
 			// Register Declarations
@@ -1872,6 +1931,13 @@ namespace StockhamGenerator
 			{
 				if( (!halfLds) || (halfLds && (position == 0)) )
 				{
+					//If precallback is set
+					if (fft_doPreCallback)
+					{
+						passStr += "\n\tfloat2 retPrecallback["; 
+						passStr += (numB4 > 0) ? "4" : (numB2 > 0) ? "2" : "1"; 
+						passStr += "];";
+					}
 					passStr += "\n\tif(rw)\n\t{";
 					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 1, numB1, 0, passStr);
 					SweepRegs(SR_READ, fwd, inInterleaved, inStride, SR_COMP_BOTH, 1.0f, false, bufferInRe, bufferInIm, "inOffset", 2, numB2, numB1, passStr);
@@ -2325,6 +2391,13 @@ namespace StockhamGenerator
 
 					radices.push_back(rad);
 					passes.push_back(Pass<PR>(i, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple, realSpecial));
+					
+					//Pass precallback information to Pass object if it's the first pass. 
+					//This will be used in single kernel transforms
+					if (!r2c2r && i == 0 && params.fft_hasPreCallback)
+					{
+						passes[0].SetPrecallback(params.fft_hasPreCallback, params.fft_preCallback);
+					}
 
 					LS *= rad;
 				}
@@ -2364,6 +2437,13 @@ namespace StockhamGenerator
 					radices.push_back(rad);
 					passes.push_back(Pass<PR>(pid, length, rad, cnPerWI, L, LS, R, linearRegs, halfLds, r2c, c2r, rcFull, rcSimple, realSpecial));
 
+					//Pass precallback information to Pass object if it's the first pass. 
+					//This will be used in single kernel transforms
+					if (!r2c2r && pid == 0 && params.fft_hasPreCallback)
+					{
+						passes[0].SetPrecallback(params.fft_hasPreCallback, params.fft_preCallback);
+					}
+
 					pid++;
 					LS *= rad;
 
@@ -2560,6 +2640,24 @@ namespace StockhamGenerator
 			uradices.sort();
 			uradices.unique();
 
+			//If pre-callback is set for the plan
+			std::string callbackstr;
+			if (params.fft_hasPreCallback)
+			{
+				//If user defined struct defined for callback function add it to opencl source string
+				if (params.fft_preCallback.userdatastruct != NULL)
+				{
+					callbackstr += params.fft_preCallback.userdatastruct;
+					callbackstr += "\n";
+				}
+
+				//Insert callback function code at the beginning 
+				callbackstr += params.fft_preCallback.funcstring;
+				callbackstr += "\n\n";
+
+				str += callbackstr;
+			}
+
 			typename std::vector< Pass<PR> >::const_iterator p;
 			if(length > 1)
 			{
@@ -2675,6 +2773,20 @@ namespace StockhamGenerator
 
         delete [] nameVendor;
 
+		//If plan has pre-callback
+		callbackstr.clear();
+		if (params.fft_hasPreCallback)
+		{
+			if (params.fft_preCallback.localMemSize > 0)
+			{
+				callbackstr += ", __global void* userdata, __local void* localmem";
+			}
+			else
+			{
+				callbackstr += ", __global void* userdata";
+			}
+		}
+
 				// Function attributes
 				if(params.fft_placeness == CLFFT_INPLACE)
 				{
@@ -2697,12 +2809,28 @@ namespace StockhamGenerator
 
 						if(inInterleaved)
 						{
-							str += "__global "; str += r2Type; str += " * restrict gb)\n";
+							str += "__global "; str += r2Type; str += " * restrict gb";
+
+							//If plan has pre-callback
+							if (params.fft_hasPreCallback)
+							{
+								str += callbackstr;
+							}
+							
+							str += ")\n";
 						}
 						else
 						{
 							str += "__global "; str += rType; str += " * restrict gbRe, ";
-							str += "__global "; str += rType; str += " * restrict gbIm)\n";
+							str += "__global "; str += rType; str += " * restrict gbIm";
+
+							//If plan has pre-callback
+							if (params.fft_hasPreCallback)
+							{
+								str += callbackstr;
+							}
+
+							str += ")\n";
 						}
 					}
 				}
@@ -2752,13 +2880,21 @@ namespace StockhamGenerator
 
 						if(outInterleaved)
 						{
-							str += "__global "; str += r2Type; str += " * restrict gbOut)\n";
+							str += "__global "; str += r2Type; str += " * restrict gbOut";
 						}
 						else
 						{
 							str += "__global "; str += rType; str += " * restrict gbOutRe, ";
-							str += "__global "; str += rType; str += " * restrict gbOutIm)\n";
+							str += "__global "; str += rType; str += " * restrict gbOutIm";
 						}
+
+						//If plan has pre-callback
+						if (params.fft_hasPreCallback)
+						{
+							str += callbackstr;
+						}
+
+						str += ")\n";
 					}
 				}
 
@@ -3148,8 +3284,8 @@ namespace StockhamGenerator
 				{
 					if(params.fft_placeness == CLFFT_INPLACE)
 					{
-						if(inInterleaved)	{ inBuf = "lwb, "; outBuf = "lwb"; }
-						else				{ inBuf = "lwbRe, lwbIm, "; outBuf = "lwbRe, lwbIm"; }
+						if(inInterleaved)	{ inBuf = "gb, "; outBuf = "lwb"; }
+						else				{ inBuf = "gbRe, gbIm, "; outBuf = "lwbRe, lwbIm"; }
 					}
 					else
 					{
@@ -3182,15 +3318,40 @@ namespace StockhamGenerator
 					str += "\n\tfor(uint t=0; t<2; t++)\n\t{\n\n";
 				}
 
+				std::string inOffset;
+				if (!r2c2r)
+				{
+					if (params.fft_placeness == CLFFT_INPLACE)
+					{
+						inOffset += "ioOffset";
+					}
+					else
+					{
+						inOffset += "iOffset";
+					}
+				}
+
 				// Call passes
 				if(numPasses == 1)
 				{
 					str += "\t";
 					str += PassName(0, fwd);
 					str += "("; str += rw; str += me;
-					str += "0, 0, ";
+					str += (!r2c2r) ? inOffset : "0";
+					str += ", 0, ";
 					str += inBuf; str += outBuf;
 					str += IterRegs("&");
+
+					//if precallback set 
+					if (!r2c2r && params.fft_hasPreCallback)
+					{
+						str += ", userdata";
+
+						if (params.fft_preCallback.localMemSize > 0)
+						{
+							str += ", localmem";
+						}
+					}
 					str += ");\n";
 				}
 				else
@@ -3234,12 +3395,32 @@ namespace StockhamGenerator
 						str += me;
 						if(p == passes.begin()) // beginning pass
 						{
-							str += blockCompute ? ldsOff : "0";
+							if (blockCompute)
+							{
+								str += ldsOff;
+							}
+							else
+							{
+								str += (!r2c2r) ? inOffset : "0";
+							}
 							str += ", ";
 							str += ldsOff;
 							str += ", ";
 							str += inBuf;
-							str += ldsArgs; str += IterRegs("&"); str += ");\n";
+							str += ldsArgs; str += IterRegs("&"); 
+							
+							//if precallback set, append additional arguments
+							if (!r2c2r && params.fft_hasPreCallback)
+							{
+								str += ", userdata";
+
+								if (params.fft_preCallback.localMemSize > 0)
+								{
+									str += ", localmem";
+								}
+							}
+
+							str += ");\n";
 							if(!halfLds) { str += exTab; str += "\tbarrier(CLK_LOCAL_MEM_FENCE);\n"; }
 						}
 						else if((p+1) == passes.end()) // ending pass
@@ -3360,6 +3541,13 @@ clfftStatus FFTGeneratedStockhamAction::initParams ()
     this->signature.fft_inputLayout  = this->plan->inputLayout;
 	this->signature.fft_MaxWorkGroupSize = this->plan->envelope.limit_WorkGroupSize;
 
+	//Set callback if specified
+	if (this->plan->hasPreCallback)
+	{
+		this->signature.fft_hasPreCallback = true;
+		this->signature.fft_preCallback = this->plan->preCallback;
+	}
+
     ARG_CHECK(this->plan->length.size()    > 0);
 	ARG_CHECK(this->plan->inStride.size()  > 0);
     ARG_CHECK(this->plan->outStride.size() > 0);
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 8a2b4ec..ae06d07 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -1588,6 +1588,14 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						colTPlan->inStride.push_back(fftPlan->inStride[0]);
 						colTPlan->outStride.push_back(1);
 
+						//Set callback data if set on top level plan
+						if (fftPlan->hasPreCallback)
+						{
+							colTPlan->hasPreCallback = true;
+							colTPlan->preCallback = fftPlan->preCallback;
+							colTPlan->precallUserData = fftPlan->precallUserData;
+						}
+
 						// Enabling block column compute
 						if( (colTPlan->inStride[0] == length0) && IsPo2(fftPlan->length[0]) && (fftPlan->length[0] < 524288) )
 						{
diff --git a/src/library/plan.h b/src/library/plan.h
index be6231e..747137c 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -91,6 +91,18 @@ enum BlockComputeType
 #define CLFFT_CB_SIZE 32
 #define CLFFT_MAX_INTERNAL_DIM 16
 
+/*! @brief Data structure to store the callback function string and other metadata passed by client 
+*  @details Client sets the callback function and other required parameters through clFFTSetPlanCallback() 
+*  in order to register the callback function. The library stores the client's pointers in this data structure as given
+*/ 
+typedef struct clfftCallbackParam_
+{
+	int localMemSize;			/*!< optional local memory size if needed by callback; 0 when not needed */
+	const char* funcname;		/*!< callback function name */
+	const char* funcstring;		/*!< callback function source in string form */
+	const char* userdatastruct;	/*!< optional custom data struct in string form; NULL when not used */
+}clfftCallbackParam;
+
 struct FFTKernelGenKeyParams {
 	/*
 	 *	This structure distills a subset of the fftPlan data,
@@ -135,6 +147,8 @@ struct FFTKernelGenKeyParams {
 	size_t					 blockSIMD;
 	size_t					 blockLDS;
 
+	bool fft_hasPreCallback;
+	clfftCallbackParam fft_preCallback;
 
 	// Default constructor
 	FFTKernelGenKeyParams()
@@ -170,6 +184,8 @@ struct FFTKernelGenKeyParams {
 		blockComputeType = BCT_C2C;
 		blockSIMD = 0;
 		blockLDS = 0;
+
+		fft_hasPreCallback = false;
 	}
 };
 
@@ -429,6 +445,10 @@ public:
 	bool blockCompute;
 	BlockComputeType blockComputeType;
 
+	bool hasPreCallback;
+
+	clfftCallbackParam preCallback;
+	void *precallUserData;
 
     clfftPlanHandle plHandle;
 
@@ -479,7 +499,9 @@ public:
 	,	gen(Stockham)
     ,   action(0)
     ,   plHandle(0)
-	{};
+	,   hasPreCallback(false)
+	{
+	};
 
 
 	size_t ElementSize() const;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list