[clfft] 25/107: adding the binary caching feature to store plans on disk
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Jul 30 18:06:29 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 6ab7f558a5d3eeeac830b69251e27593d3c3ef35
Author: bragadeesh <bragadeesh.natarajan at amd>
Date: Wed Mar 4 12:14:30 2015 -0600
adding the binary caching feature to store plans on disk
---
src/library/CMakeLists.txt | 6 +
src/library/action.cpp | 762 +++++++++++++++++++++++++++++++
src/library/action.h | 257 +++++++++++
src/library/fft_binary_lookup.cpp | 688 ++++++++++++++++++++++++++++
src/library/fft_binary_lookup.h | 276 +++++++++++
src/library/generator.copy.cpp | 155 ++++---
src/library/generator.stockham.cpp | 220 +++++----
src/library/generator.transpose.gcn.cpp | 168 ++++---
src/library/generator.transpose.vliw.cpp | 154 +++++--
src/library/lifetime.cpp | 2 +
src/library/mainpage.h | 8 +-
src/library/md5sum.c | 312 +++++++++++++
src/library/md5sum.h | 50 ++
src/library/plan.cpp | 639 +++-----------------------
src/library/plan.h | 151 +++++-
src/library/private.h | 21 +
src/library/repo.cpp | 51 +--
src/library/repo.h | 73 ++-
src/library/transform.cpp | 493 +-------------------
19 files changed, 3137 insertions(+), 1349 deletions(-)
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 0c81ae3..ebd38a7 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -24,6 +24,9 @@ set( clFFT.Source transform.cpp
generator.transpose.gcn.cpp
generator.copy.cpp
lifetime.cpp
+ fft_binary_lookup.cpp
+ md5sum.c
+ action.cpp
stdafx.cpp )
# Windows only uses dllmain
@@ -32,6 +35,7 @@ if( MSVC )
endif( )
set( clFFT.Headers private.h
+ action.h
repo.h
plan.h
lock.h
@@ -40,6 +44,8 @@ set( clFFT.Headers private.h
generator.stockham.h
generator.transpose.vliw.h
generator.transpose.gcn.h
+ fft_binary_lookup.h
+ md5sum.h
../include/stdafx.h
../include/unicode.compatibility.h
../include/targetver.h
diff --git a/src/library/action.cpp b/src/library/action.cpp
new file mode 100644
index 0000000..b9b9884
--- /dev/null
+++ b/src/library/action.cpp
@@ -0,0 +1,762 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include "stdafx.h"
+#include <math.h>
+#include "private.h"
+#include "repo.h"
+#include "plan.h"
+#include "generator.stockham.h"
+#include "../include/convenienceFunctions.h"
+
+#include "action.h"
+#include "fft_binary_lookup.h"
+
+#define FFT_CACHE_DEBUG 0
+
+
+
+FFTCopyAction::FFTCopyAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+ : FFTAction(plan, err)
+{
+ if (err != CLFFT_SUCCESS)
+ {
+ // FFTAction() failed, exit constructor
+ return;
+ }
+
+ err = CLFFT_SUCCESS;
+}
+
+FFTTransposeVLIWAction::FFTTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+ : FFTAction(plan, err)
+{
+ if (err != CLFFT_SUCCESS)
+ {
+ // FFTAction() failed, exit constructor
+ return;
+ }
+
+ err = CLFFT_SUCCESS;
+}
+
+FFTTransposeGCNAction::FFTTransposeGCNAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+ : FFTAction(plan, err)
+{
+ if (err != CLFFT_SUCCESS)
+ {
+ // FFTAction() failed, exit constructor
+ return;
+ }
+
+ err = CLFFT_SUCCESS;
+}
+
+
+FFTStockhamAction::FFTStockhamAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+ : FFTAction(plan, err)
+{
+ if (err != CLFFT_SUCCESS)
+ {
+ // FFTAction() failed, exit constructor
+ return;
+ }
+
+ err = CLFFT_SUCCESS;
+}
+
+
+
+FFTAction::FFTAction(FFTPlan * fftPlan, clfftStatus & err)
+ : plan(fftPlan)
+{
+ err = CLFFT_SUCCESS;
+}
+
+clfftStatus FFTAction::selectBufferArguments(FFTPlan * fftPlan,
+ cl_mem* clInputBuffers,
+ cl_mem* clOutputBuffers,
+ std::vector< cl_mem > &inputBuff,
+ std::vector< cl_mem > &outputBuff)
+{
+
+ // 1d with normal length will fall into the below category
+ // add: 2d transpose kernel will fall into here too.
+ inputBuff.reserve( 2 );
+ outputBuff.reserve( 2 );
+
+ // Decode the relevant properties from the plan paramter to figure out how many input/output buffers we have
+ switch( fftPlan->inputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ {
+ switch( fftPlan->outputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ }
+
+ break;
+ }
+ case CLFFT_COMPLEX_PLANAR:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ // Invalid to be an inplace transform, and go from 1 to 2 buffers
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 1 ] );
+ }
+
+ break;
+ }
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ }
+
+ break;
+ }
+ case CLFFT_HERMITIAN_PLANAR:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 1 ] );
+ }
+
+ break;
+ }
+ case CLFFT_REAL:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ }
+
+ break;
+ }
+ default:
+ {
+ // Don't recognize output layout
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ }
+
+ break;
+ }
+ case CLFFT_COMPLEX_PLANAR:
+ {
+ switch( fftPlan->outputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ inputBuff.push_back( clInputBuffers[ 1 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ }
+
+ break;
+ }
+ case CLFFT_COMPLEX_PLANAR:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ inputBuff.push_back( clInputBuffers[ 1 ] );
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ inputBuff.push_back( clInputBuffers[ 1 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 1 ] );
+ }
+
+ break;
+ }
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ inputBuff.push_back( clInputBuffers[ 1 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ }
+
+ break;
+ }
+ case CLFFT_HERMITIAN_PLANAR:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ inputBuff.push_back( clInputBuffers[ 1 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 1 ] );
+ }
+
+ break;
+ }
+ case CLFFT_REAL:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ inputBuff.push_back( clInputBuffers[ 1 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ }
+
+ break;
+ }
+ default:
+ {
+ // Don't recognize output layout
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ }
+
+ break;
+ }
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ {
+ switch( fftPlan->outputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ }
+
+ break;
+ }
+ case CLFFT_COMPLEX_PLANAR:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 1 ] );
+ }
+
+ break;
+ }
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ case CLFFT_HERMITIAN_PLANAR:
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ case CLFFT_REAL:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ }
+
+ break;
+ }
+ default:
+ {
+ // Don't recognize output layout
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ }
+
+ break;
+ }
+ case CLFFT_HERMITIAN_PLANAR:
+ {
+ switch( fftPlan->outputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ inputBuff.push_back( clInputBuffers[ 1 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ }
+
+ break;
+ }
+ case CLFFT_COMPLEX_PLANAR:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ inputBuff.push_back( clInputBuffers[ 1 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 1 ] );
+ }
+
+ break;
+ }
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ case CLFFT_HERMITIAN_PLANAR:
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ case CLFFT_REAL:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ inputBuff.push_back( clInputBuffers[ 1 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ }
+
+ break;
+ }
+ default:
+ {
+ // Don't recognize output layout
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ }
+
+ break;
+ }
+ case CLFFT_REAL:
+ {
+ switch( fftPlan->outputLayout )
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ }
+
+ break;
+ }
+ case CLFFT_COMPLEX_PLANAR:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 1 ] );
+ }
+
+ break;
+ }
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ }
+
+ break;
+ }
+ case CLFFT_HERMITIAN_PLANAR:
+ {
+ if( fftPlan->placeness == CLFFT_INPLACE )
+ {
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ else
+ {
+ inputBuff.push_back( clInputBuffers[ 0 ] );
+
+ outputBuff.push_back( clOutputBuffers[ 0 ] );
+ outputBuff.push_back( clOutputBuffers[ 1 ] );
+ }
+
+ break;
+ }
+ default:
+ {
+ // Don't recognize output layout
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ }
+
+ break;
+ }
+ default:
+ {
+ // Don't recognize output layout
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ }
+
+ return CLFFT_SUCCESS;
+}
+
+
+clfftStatus FFTAction::enqueue(clfftPlanHandle plHandle,
+ clfftDirection dir,
+ cl_uint numQueuesAndEvents,
+ cl_command_queue* commQueues,
+ cl_uint numWaitEvents,
+ const cl_event* waitEvents,
+ cl_event* outEvents,
+ cl_mem* clInputBuffers,
+ cl_mem* clOutputBuffers)
+{
+ FFTRepo & fftRepo = FFTRepo::getInstance();
+
+ std::vector< cl_mem > inputBuff;
+ std::vector< cl_mem > outputBuff;
+
+
+ clfftStatus status = selectBufferArguments(this->plan,
+ clInputBuffers, clOutputBuffers,
+ inputBuff, outputBuff);
+
+ if (status != CLFFT_SUCCESS)
+ {
+ return status;
+ }
+
+ // TODO: In the case of length == 1, FFT is a trivial NOP, but we still need to apply the forward and backwards tranforms
+ // TODO: Are map lookups expensive to call here? We can cache a pointer to the cl_program/cl_kernel in the plan
+
+ // Translate the user plan into the structure that we use to map plans to clPrograms
+
+ cl_program prog;
+ cl_kernel kern;
+ OPENCL_V( fftRepo.getclProgram( this->getGenerator(), this->getSignatureData(), prog, this->plan->bakeDevice, this->plan->context ), _T( "fftRepo.getclProgram failed" ) );
+ OPENCL_V( fftRepo.getclKernel( prog, dir, kern ), _T( "fftRepo.getclKernels failed" ) );
+
+ cl_uint uarg = 0;
+ if (!this->plan->transflag && !(this->plan->gen == Copy))
+ {
+ // ::clSetKernelArg() is not thread safe, according to the openCL spec for the same cl_kernel object
+ // TODO: Need to verify that two different plans (which would get through our lock above) with exactly the same
+ // parameters would NOT share the same cl_kernel objects
+
+ /* constant buffer */
+ OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&this->plan->const_buffer ), _T( "clSetKernelArg failed" ) );
+ }
+
+ // Input buffer(s)
+ // Input may be 1 buffer (CLFFT_COMPLEX_INTERLEAVED)
+ // or 2 buffers (CLFFT_COMPLEX_PLANAR)
+
+ for (size_t i = 0; i < inputBuff.size(); ++i)
+ {
+ OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&inputBuff[i] ), _T( "clSetKernelArg failed" ) );
+ }
+ // Output buffer(s)
+ // Output may be 0 buffers (CLFFT_INPLACE)
+ // or 1 buffer (CLFFT_COMPLEX_INTERLEAVED)
+ // or 2 buffers (CLFFT_COMPLEX_PLANAR)
+ for (size_t o = 0; o < outputBuff.size(); ++o)
+ {
+ OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&outputBuff[o] ), _T( "clSetKernelArg failed" ) );
+ }
+
+ std::vector< size_t > gWorkSize;
+ std::vector< size_t > lWorkSize;
+ clfftStatus result = this->getWorkSizes (gWorkSize, lWorkSize);
+
+ // TODO: if getWorkSizes returns CLFFT_INVALID_GLOBAL_WORK_SIZE, that means
+ // that this multidimensional input data array is too large to be transformed
+ // with a single call to clEnqueueNDRangeKernel. For now, we will just return
+ // the error code back up the call stack.
+ // The *correct* course of action would be to split the work into mutliple
+ // calls to clEnqueueNDRangeKernel.
+ if (CLFFT_INVALID_GLOBAL_WORK_SIZE == result)
+ {
+ OPENCL_V( result, _T("Work size too large for clEnqueNDRangeKernel()"));
+ }
+ else
+ {
+ OPENCL_V( result, _T("FFTAction::getWorkSizes failed"));
+ }
+ BUG_CHECK (gWorkSize.size() == lWorkSize.size());
+
+
+ cl_int call_status = clEnqueueNDRangeKernel( *commQueues, kern, static_cast< cl_uint >( gWorkSize.size( ) ),
+ NULL, &gWorkSize[ 0 ], &lWorkSize[ 0 ], numWaitEvents, waitEvents, outEvents );
+ OPENCL_V( call_status, _T( "clEnqueueNDRangeKernel failed" ) );
+
+ if( fftRepo.pStatTimer )
+ {
+ fftRepo.pStatTimer->AddSample( plHandle, this->plan, kern, numQueuesAndEvents, outEvents, gWorkSize );
+ }
+
+ return CLFFT_SUCCESS;
+}
+
+
+
+// Read the kernels that this plan uses from file, and store into the plan
+clfftStatus FFTAction::writeKernel( const clfftPlanHandle plHandle, const clfftGenerators gen, const FFTKernelSignatureHeader* data, const cl_context& context, const cl_device_id &device )
+{
+ FFTRepo& fftRepo = FFTRepo::getInstance( );
+
+ std::string kernelPath = getKernelName(gen, plHandle, true);
+
+ // Logic to write string contents out to file
+ tofstreamRAII< std::ofstream, std::string > kernelFile( kernelPath.c_str( ) );
+ if( !kernelFile.get( ) )
+ {
+ std::cerr << "Failed to open kernel file for writing: " << kernelPath.c_str( ) << std::endl;
+ return CLFFT_FILE_CREATE_FAILURE;
+ }
+
+ std::string kernel;
+ OPENCL_V( fftRepo.getProgramCode( gen, data, kernel, device, context ), _T( "fftRepo.getProgramCode failed." ) );
+
+ kernelFile.get( ) << kernel << std::endl;
+
+ return CLFFT_SUCCESS;
+}
+
+
+// **************** TODO TODO TODO ***********************
+// Making compileKernels function take in command queue parameter so we can build for 1 particular device only;
+// this may not be desirable for persistent plans, where we may have to compile for all devices in the context;
+// make changes appropriately before enabling persistent plans and then remove this comment
+
+// Compile the kernels that this plan uses, and store into the plan
+clfftStatus FFTAction::compileKernels( const cl_command_queue commQueueFFT, const clfftPlanHandle plHandle, FFTPlan* fftPlan )
+{
+ cl_int status = 0;
+ size_t deviceListSize = 0;
+
+ FFTRepo& fftRepo = FFTRepo::getInstance( );
+
+ // create a cl program executable for the device associated with command queue
+ // Get the device
+ cl_device_id &q_device = fftPlan->bakeDevice;
+
+ cl_program program;
+ if( fftRepo.getclProgram( this->getGenerator(), this->getSignatureData(), program, q_device, fftPlan->context ) == CLFFT_INVALID_PROGRAM )
+ {
+ FFTBinaryLookup lookup (this->getGenerator(), plHandle, fftPlan->context, q_device);
+
+ lookup.variantRaw(this->getSignatureData(), this->getSignatureData()->datasize);
+
+ if (lookup.found())
+ {
+#if FFT_CACHE_DEBUG
+ // debug message in debug mode to ensure that the cache is used
+ fprintf(stderr, "Kernel loaded from cache\n");
+#endif
+
+ program = lookup.getProgram();
+ }
+ else
+ {
+#if FFT_CACHE_DEBUG
+ fprintf(stderr, "Kernel built from source\n");
+#endif
+
+ // If the user wishes us to write the kernels out to disk, we do so
+ if( fftRepo.setupData.debugFlags & CLFFT_DUMP_PROGRAMS )
+ {
+ OPENCL_V( writeKernel( plHandle, this->getGenerator(), this->getSignatureData(), fftPlan->context, fftPlan->bakeDevice ), _T( "writeKernel failed." ) );
+ }
+
+ std::string programCode;
+ OPENCL_V( fftRepo.getProgramCode( this->getGenerator(), this->getSignatureData(), programCode, q_device, fftPlan->context ), _T( "fftRepo.getProgramCode failed." ) );
+
+ const char* source = programCode.c_str();
+ program = clCreateProgramWithSource( fftPlan->context, 1, &source, NULL, &status );
+ OPENCL_V( status, _T( "clCreateProgramWithSource failed." ) );
+
+ // create a cl program executable for the device associated with command queue
+
+#if defined(DEBUGGING)
+ status = clBuildProgram( program, 1, &q_device, "-g -cl-opt-disable", NULL, NULL); // good for debugging kernels
+
+// if you have trouble creating smbols that GDB can pick up to set a breakpoint after kernels are loaded into memory
+// this can be used to stop execution to allow you to set a breakpoint in a kernel after kernel symbols are in memory.
+#ifdef DEBUG_BREAK_GDB
+ __debugbreak();
+#endif
+#else
+ status = clBuildProgram( program, 1, &q_device, NULL, NULL, NULL);
+#endif
+ if( status != CL_SUCCESS )
+ {
+ if( status == CL_BUILD_PROGRAM_FAILURE )
+ {
+ size_t buildLogSize = 0;
+ OPENCL_V( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
+ _T( "clGetProgramBuildInfo failed" ) );
+
+ std::vector< char > buildLog( buildLogSize );
+ ::memset( &buildLog[ 0 ], 0x0, buildLogSize );
+
+ OPENCL_V( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
+ _T( "clGetProgramBuildInfo failed" ) );
+
+ std::cerr << "\n\t\t\tBUILD LOG\n";
+ std::cerr << "************************************************\n";
+ std::cerr << &buildLog[ 0 ] << std::endl;
+ std::cerr << "************************************************\n";
+ }
+
+ OPENCL_V( status, _T( "clBuildProgram failed" ) );
+ }
+
+ lookup.setProgram(program, source);
+ lookup.populateCache();
+ }
+
+ fftRepo.setclProgram( this->getGenerator(), this->getSignatureData(), program, q_device, fftPlan->context );
+
+
+ // For real transforms we compile either forward or backward kernel
+ bool buildFwdKernel = buildForwardKernel();
+ bool buildBwdKernel = buildBackwardKernel();
+
+ // get a kernel object handle for a kernel with the given name
+ cl_kernel kernel;
+ if( buildFwdKernel )
+ {
+ if( fftRepo.getclKernel( program, CLFFT_FORWARD, kernel ) == CLFFT_INVALID_KERNEL )
+ {
+ std::string entryPoint;
+ OPENCL_V( fftRepo.getProgramEntryPoint( this->getGenerator(), this->getSignatureData(), CLFFT_FORWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
+
+ kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
+ OPENCL_V( status, _T( "clCreateKernel failed" ) );
+
+ fftRepo.setclKernel( program, CLFFT_FORWARD, kernel );
+ }
+ }
+
+ if( buildBwdKernel )
+ {
+ if( fftRepo.getclKernel( program, CLFFT_BACKWARD, kernel ) == CLFFT_INVALID_KERNEL )
+ {
+ std::string entryPoint;
+ OPENCL_V( fftRepo.getProgramEntryPoint( this->getGenerator(), this->getSignatureData(), CLFFT_BACKWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
+
+ kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
+ OPENCL_V( status, _T( "clCreateKernel failed" ) );
+
+ fftRepo.setclKernel( program, CLFFT_BACKWARD, kernel );
+ }
+ }
+ }
+
+ return CLFFT_SUCCESS;
+}
+
+
diff --git a/src/library/action.h b/src/library/action.h
new file mode 100644
index 0000000..83b3142
--- /dev/null
+++ b/src/library/action.h
@@ -0,0 +1,257 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#if !defined( AMD_CLFFT_action_H )
+#define AMD_CLFFT_action_H
+
+#include "plan.h"
+
+
+//
+// FFTCopyAction
+//
+// Base class for every Copy action for the FFT.
+// Currently do nothing special. The kernel generation and compilation occurs
+// by the subclass FFTGeneratedCopyAction
+//
+class FFTCopyAction : public FFTAction
+{
+public:
+ FFTCopyAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+ clfftGenerators getGenerator() { return Copy; }
+};
+
+
+//
+// FFTStockhamAction
+//
+// Base class for every Stockham action for the FFT.
+// Currently do nothing special. The kernel generation and compilation occurs
+// by the subclasses FFTGeneratedStockhamAction or FFTStaticStockhamAction
+//
+class FFTStockhamAction : public FFTAction
+{
+public:
+ FFTStockhamAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+ clfftGenerators getGenerator() { return Stockham; }
+};
+
+
+//
+// FFTTransposeVLIWAction
+//
+// Base class for every TransposeVLIW action for the FFT.
+// Currently do nothing special. The kernel generation and compilation occurs
+// by the subclass FFTGeneratedTransposeVLIWAction
+//
+class FFTTransposeVLIWAction : public FFTAction
+{
+public:
+ FFTTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+ clfftGenerators getGenerator() { return Transpose_VLIW; }
+};
+
+
+//
+// FFTTransposeGCNAction
+//
+// Base class for every TransposeGCN action for the FFT.
+// Currently do nothing special. The kernel generation and compilation occurs
+// by the subclass FFTGeneratedTransposeGCNAction
+//
+class FFTTransposeGCNAction : public FFTAction
+{
+public:
+ FFTTransposeGCNAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+ clfftGenerators getGenerator() { return Transpose_GCN; }
+};
+
+//
+// FFTGeneratedCopyAction
+//
+// Implements a Copy action for the FFT
+// Its signature is represented by FFTKernelGenKeyParams structure
+//
+// This class implements:
+// - the generation of the kernel string
+// - the build of the kernel
+//
+// The structure FFTKernelGenKeyParams is used to characterize and generate
+// the appropriate copy kernel. That structure is used for the signature of
+// this action. It is common to Stockham, copy and transpose methods. For
+// convenience, this structure is used for every FFTGenerated*Action class,
+// but in practice the copy action only use a few information of that
+// structure, so a proper structure should be used instead.
+//
+class FFTGeneratedCopyAction : public FFTCopyAction
+{
+public:
+ FFTGeneratedCopyAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+ typedef FFTKernelSignature<FFTKernelGenKeyParams, FFT_DEFAULT_COPY_ACTION> Signature;
+
+private:
+ Signature signature;
+
+ clfftStatus generateKernel (FFTRepo& fftRepo, const cl_command_queue commQueueFFT );
+ clfftStatus getWorkSizes (std::vector<size_t> & globalws, std::vector<size_t> & localws);
+ clfftStatus initParams ();
+
+ bool buildForwardKernel();
+ bool buildBackwardKernel();
+
+public:
+
+ virtual const Signature * getSignatureData()
+ {
+ return &this->signature;
+ }
+};
+
+
+//
+// FFTGeneratedStockhamAction
+//
+// Represents a Stockham action for the FFT. This class implements the former
+// mechanism of kernel generation and compilation for Stockham method.
+//
+// This class implements:
+// - the generation of the kernel string
+// - the build of the kernel
+//
+// The structure FFTKernelGenKeyParams is used to characterize and generate
+// the appropriate kernel. That structure is used for the signature of this
+// action. For convenience, this structure is used for every
+// FFTGenerated*Action class, but a "Stockham-specific" version of that
+// structure should be used instead.
+//
+class FFTGeneratedStockhamAction : public FFTStockhamAction
+{
+public:
+ FFTGeneratedStockhamAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+ typedef FFTKernelSignature<FFTKernelGenKeyParams, FFT_DEFAULT_STOCKHAM_ACTION> Signature;
+
+private:
+ Signature signature;
+
+ clfftStatus generateKernel (FFTRepo& fftRepo, const cl_command_queue commQueueFFT );
+ clfftStatus getWorkSizes (std::vector<size_t> & globalws, std::vector<size_t> & localws);
+ clfftStatus initParams ();
+
+ bool buildForwardKernel();
+ bool buildBackwardKernel();
+
+public:
+
+ virtual const Signature * getSignatureData()
+ {
+ return &this->signature;
+ }
+};
+
+//
+// FFTGeneratedTransposeVLIWAction
+//
+// Implements a TransposeVLIW action for the FFT
+// Its signature is represented by FFTKernelGenKeyParams structure
+//
+// This class implements:
+// - the generation of the kernel string
+// - the build of the kernel
+//
+// The structure FFTKernelGenKeyParams is used to characterize and generate
+// the appropriate transpose kernel. That structure is used for the signature of
+// this action. It is common to Stockham, copy and transpose methods. For
+// convenience, this structure is used for every FFTGenerated*Action class,
+// but in practice the transpose action only use a few information of that
+// structure, so a proper structure should be used instead.
+//
+class FFTGeneratedTransposeVLIWAction : public FFTTransposeVLIWAction
+{
+public:
+ FFTGeneratedTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+ typedef FFTKernelSignature<FFTKernelGenKeyParams, FFT_DEFAULT_TRANSPOSE_ACTION> Signature;
+
+private:
+ Signature signature;
+
+ clfftStatus generateKernel (FFTRepo& fftRepo, const cl_command_queue commQueueFFT );
+ clfftStatus getWorkSizes (std::vector<size_t> & globalws, std::vector<size_t> & localws);
+ clfftStatus initParams ();
+
+ bool buildForwardKernel();
+ bool buildBackwardKernel();
+
+public:
+
+ virtual const Signature * getSignatureData()
+ {
+ return &this->signature;
+ }
+};
+
+
+
+// FFTGeneratedTransposeGCNAction
+//
+// Implements a TransposeGCN action for the FFT
+// Its signature is represented by FFTKernelGenKeyParams structure
+//
+// This class implements:
+// - the generation of the kernel string
+// - the build of the kernel
+//
+// The structure FFTKernelGenKeyParams is used to characterize and generate
+// the appropriate transpose kernel. That structure is used for the signature of
+// this action. It is common to Stockham, copy and transpose methods. For
+// convenience, this structure is used for every FFTGenerated*Action class,
+// but in practice the transpose action only use a few information of that
+// structure, so a proper structure should be used instead.
+//
+class FFTGeneratedTransposeGCNAction : public FFTTransposeGCNAction
+{
+public:
+ FFTGeneratedTransposeGCNAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
+
+ typedef FFTKernelSignature<FFTKernelGenKeyParams, FFT_DEFAULT_TRANSPOSE_ACTION> Signature;
+
+private:
+ Signature signature;
+
+ clfftStatus generateKernel (FFTRepo& fftRepo, const cl_command_queue commQueueFFT );
+ clfftStatus getWorkSizes (std::vector<size_t> & globalws, std::vector<size_t> & localws);
+ clfftStatus initParams ();
+
+ bool buildForwardKernel();
+ bool buildBackwardKernel();
+
+public:
+
+ virtual const Signature * getSignatureData()
+ {
+ return &this->signature;
+ }
+};
+
+#endif // AMD_CLFFT_action_H
diff --git a/src/library/fft_binary_lookup.cpp b/src/library/fft_binary_lookup.cpp
new file mode 100644
index 0000000..96f2872
--- /dev/null
+++ b/src/library/fft_binary_lookup.cpp
@@ -0,0 +1,688 @@
+/* ************************************************************************
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include "fft_binary_lookup.h"
+
+#include <iostream>
+#include <fstream>
+#include <cassert>
+
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#ifdef _WIN32
+#include <windows.h>
+#include <direct.h> // for _mkdir
+#else
+#include <unistd.h>
+#endif
+
+extern "C"
+{
+#include "md5sum.h"
+}
+
+// size for clGetDeviceInfo queries
+#define SIZE 256
+
+#define ENABLE_SOURCE_DUMP 0
+
+
+#define CAPS_DEBUG 0
+
+#include <string.h>
+
+char * sep()
+{
+#ifdef __WIN32
+ return (char*)"\\";
+#else
+ return (char*)"/";
+#endif
+}
+
+static std::string cache_path;
+static bool cache_enabled(false);
+
+void clfftInitBinaryCache()
+{
+ const char * path = getenv("CLFFT_CACHE_PATH");
+ if (path)
+ {
+ cache_path = std::string(path) + sep();
+ cache_enabled = true;
+ }
+ else
+ {
+ cache_path = "";
+ }
+}
+
+FFTBinaryLookup::CacheEntry::CacheEntry(const std::string & filename)
+ : m_filename(filename), m_successful_creation(false)
+{
+
+}
+
+void FFTBinaryLookup::CacheEntry::close()
+{
+#ifdef _WIN32
+ CloseHandle(this->m_handle);
+#else
+ ::close(*(int*)this->m_handle);
+ //delete (int*)this->m_handle;
+#endif
+}
+
+bool FFTBinaryLookup::CacheEntry::successful_creation()
+{
+ return this->m_successful_creation;
+}
+
+bool FFTBinaryLookup::CacheEntry::exclusive_create()
+{
+#ifdef _WIN32
+ std::wstring tmp;
+ tmp.assign(this->m_filename.begin(), this->m_filename.end());
+
+ HANDLE handle = CreateFile(tmp.c_str(),
+ GENERIC_WRITE,
+ 0, // no share with other process
+ NULL,
+ CREATE_NEW,
+ FILE_ATTRIBUTE_NORMAL,
+ NULL);
+
+ this->m_handle = handle;
+ this->m_successful_creation = (handle != INVALID_HANDLE_VALUE);
+ return this->m_successful_creation;
+#else
+ int * fd = new int;
+ *fd = open (this->m_filename.c_str(),
+ O_CREAT | O_EXCL,
+ S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+ this->m_handle = fd;
+ this->m_successful_creation = (*fd != -1);
+ return *fd >= 0;
+#endif
+}
+
+FFTBinaryLookup::FFTBinaryLookup(const clfftGenerators gen, const clfftPlanHandle plHandle, cl_context ctxt, cl_device_id device)
+ : m_context(ctxt), m_device(device), m_program(NULL), m_binary(0), m_signature(0), m_cache_enabled(cache_enabled)
+{
+ // initialize the entry name
+ this->m_cache_entry_name = getKernelName(gen, plHandle, false);
+
+ if (this->m_cache_enabled)
+ {
+ // retrieve device informations to compute the path of the cache
+ cl_int err = this->retrieveDeviceAndDriverInfo();
+
+ if (err != CL_SUCCESS)
+ {
+ cache_enabled = false;
+ this->m_cache_enabled = false;
+ }
+ }
+}
+
+FFTBinaryLookup::~FFTBinaryLookup()
+{
+ delete [] this->m_binary;
+ delete [] this->m_signature;
+}
+
+FFTBinaryLookup::Variant::Variant()
+ : m_kind((VariantKind)0), m_size(0), m_data(0)
+{
+}
+
+FFTBinaryLookup::Variant::Variant(VariantKind kind, char * data, size_t size)
+ : m_kind(kind), m_size(size)
+{
+ this->m_data = new char[this->m_size];
+ memcpy(this->m_data, data, size);
+}
+
+FFTBinaryLookup::Variant::~Variant()
+{
+ // delete this->m_data;
+}
+
+void FFTBinaryLookup::variantInt(int num)
+{
+ m_variants.push_back(Variant(INT, (char*)&num, sizeof(num)));
+}
+
+void FFTBinaryLookup::variantDouble(double num)
+{
+ m_variants.push_back(Variant(DOUBLE, (char*)&num, sizeof(num)));
+}
+
+void FFTBinaryLookup::variantCompileOptions(const std::string & opts)
+{
+ m_variants.push_back(Variant(STRING, (char*)opts.c_str(), opts.size()));
+}
+
+void FFTBinaryLookup::variantRaw(const void * data, size_t bytes)
+{
+ m_variants.push_back(Variant(DATA, (char*)data, bytes));
+}
+
+enum BinaryRepresentation
+{
+ LSB,
+ MSB,
+ UNKNOWN
+};
+
+enum BinaryRepresentation getStorageMode(char * data)
+{
+ if (data[0] == 'C' &&
+ data[1] == 'L' &&
+ data[2] == 'B' &&
+ data[3] == '\0')
+ return LSB;
+
+ if (data[0] == 'B' &&
+ data[1] == 'L' &&
+ data[2] == 'C' &&
+ data[3] == '\0')
+ return MSB;
+
+ return UNKNOWN;
+}
+
+void FFTBinaryLookup::finalizeVariant()
+{
+ // serialize variants
+ size_t whole_variant_size_in_bytes = 0;
+
+ // store 1 byte for the variant kind
+ whole_variant_size_in_bytes += this->m_variants.size() * sizeof(int); // for the variant kind
+ whole_variant_size_in_bytes += this->m_variants.size() * sizeof(size_t); // for the variant size
+
+ // add every variant sizes
+ for(size_t i=0 ; i<this->m_variants.size() ; ++i)
+ {
+ const Variant & v = this->m_variants[i];
+
+ // compute the whole size of the signature
+ whole_variant_size_in_bytes += v.m_size;
+ }
+
+ this->m_header.signature_size = whole_variant_size_in_bytes;
+
+ this->m_signature = new char[whole_variant_size_in_bytes];
+ char * current_address = this->m_signature;
+ for(size_t i=0 ; i<this->m_variants.size() ; ++i)
+ {
+ Variant v = this->m_variants[i];
+
+ // write the variant kind
+ memcpy(current_address, &v.m_kind, sizeof(int));
+ current_address += sizeof(v.m_kind);
+
+ // write the variant size
+ memcpy(current_address, &v.m_size, sizeof(v.m_size));
+ current_address += sizeof(v.m_size);
+
+ // write the variant itself
+ memcpy(current_address, v.m_data, v.m_size);
+ current_address += v.m_size;
+ }
+
+ // Update the cache entry name if there are variants...
+ if (whole_variant_size_in_bytes != 0)
+ {
+ char md5_sum[33];
+ md5sum(this->m_signature, this->m_header.signature_size, md5_sum);
+ this->m_cache_entry_name = md5_sum;
+ }
+ else
+ {
+ this->m_cache_entry_name += ".db";
+ }
+}
+
+bool FFTBinaryLookup::loadHeader(std::ifstream &file, size_t length)
+{
+ file.read ((char*)&this->m_header, sizeof(Header));
+
+ // FIXME: Consider LSB Vs MSB number representation
+ assert(getStorageMode(this->m_header.magic_key) == LSB);
+
+ if (this->m_header.whole_file_size != (int)length)
+ {
+ // the file has not been correctly initialized (yet)
+ return false;
+ }
+
+ return true;
+}
+
+bool FFTBinaryLookup::loadBinaryAndSignature(std::ifstream &file)
+{
+ {
+ this->m_binary = new unsigned char [this->m_header.binary_size];
+ const std::istream& res = file.read((char*)this->m_binary, this->m_header.binary_size);
+ if (!res.good())
+ return false;
+ }
+
+ {
+ this->m_signature = new char [this->m_header.signature_size];
+ const std::istream& res = file.read((char*)this->m_signature, this->m_header.signature_size);
+
+ if (!res.good())
+ return false;
+
+ this->m_variants.clear();
+
+ char * current = this->m_signature;
+ for (int i=0 ; i<this->m_header.signature_size ; ++i)
+ {
+ Variant v;
+ v.m_kind = *(VariantKind*) current;
+ i += sizeof(int);
+ current += sizeof(int);
+
+ v.m_size = *(size_t*) current;
+ i += sizeof(size_t);
+ current += sizeof(size_t);
+
+ v.m_data = new char[v.m_size];
+ memcpy(v.m_data, current, v.m_size);
+ i += v.m_size;
+ current += v.m_size;
+
+ this->m_variants.push_back(v);
+ }
+ }
+
+ return true;
+}
+
+bool FFTBinaryLookup::tryLoadCacheFile()
+{
+ // may create empty file or may wait until file is ready
+ const std::string & filename = this->m_path + this->m_cache_entry_name;
+ std::ifstream file (filename.c_str(), std::ios_base::binary);
+
+ if (file.is_open())
+ {
+ file.seekg (0, file.end);
+ size_t length = file.tellg();
+ file.seekg (0, file.beg);
+
+ if (length == 0)
+ {
+ // the file is corrupted, so return false
+ return false;
+ }
+
+ bool st;
+ st = loadHeader(file, length);
+
+ if (! st)
+ return false;
+
+ st = loadBinaryAndSignature(file);
+
+ if (! st)
+ return false;
+
+ file.close();
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+}
+
+bool FFTBinaryLookup::found()
+{
+ // if we could not create the directory, it is useless to
+ if (! this->m_cache_enabled)
+ {
+ return false; // not found
+ }
+
+ this->finalizeVariant(); // serialize variant and cumpute checksum on it
+ // also compute the tree to search from the cache entry (this->m_cache_entry_name, cache path ??)
+
+ if (tryLoadCacheFile())
+ {
+ cl_int err = buildFromBinary(this->m_binary,
+ this->m_header.binary_size);
+
+ // return false if the buildFromBinary failed, true else
+ return err==CL_SUCCESS;
+ }
+
+ return false;
+}
+
+static cl_int getSingleBinaryFromProgram(cl_program program,
+ std::vector<unsigned char*> & binary)
+{
+ // 3 - Determine the size of each program binary
+ size_t size;
+ cl_int err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
+ sizeof(size_t),
+ &size, NULL);
+ if (err != CL_SUCCESS)
+ {
+ std::cerr << "Error querying for program binary sizes" << std::endl;
+ return err;
+ }
+
+ binary.resize(size);
+ binary[0] = new unsigned char[size];
+
+ unsigned char * binary_address[1] = { binary[0] };
+
+ // 4 - Get all of the program binaries
+ err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, 1 * sizeof(unsigned char*),
+ binary_address, NULL);
+
+
+ if (err != CL_SUCCESS)
+ {
+#if CAPS_DEBUG
+ std::cerr << "Error querying for program binaries" << std::endl;
+#endif
+ return err;
+ }
+
+ return CL_SUCCESS;
+}
+
+cl_int FFTBinaryLookup::writeCacheFile(std::vector<unsigned char*> &data)
+{
+ if (! this->m_cache_enabled)
+ {
+ return 0;
+ }
+
+ // exclusive open to ensure that only one thread will write the file
+ const std::string & filename = this->m_path + this->m_cache_entry_name;
+
+ CacheEntry cache_file(filename);
+ bool created = cache_file.exclusive_create();
+
+ // try to exclusively create the cache file on the disk
+ if (created)
+ {
+ // if it was created by the current thread, this one will write into cache file
+ cache_file.close();
+
+ const std::string & filename = this->m_path + this->m_cache_entry_name;
+ std::ofstream file (filename.c_str(), std::ios_base::binary);
+
+ file.write((char*)&this->m_header, sizeof(m_header));
+ file.write((char*)data[0], this->m_header.binary_size);
+ file.write((char*)this->m_signature, this->m_header.signature_size);
+ file.close();
+
+#if ENABLE_SOURCE_DUMP
+ const std::string & srcFilename = this->m_path + this->m_cache_entry_name + ".cl";
+ std::ofstream srcFile (srcFilename.c_str());
+ srcFile << this->m_source;
+
+ srcFile.close();
+#endif
+
+ return CL_SUCCESS;
+ }
+
+ // other thread do not write the cache file
+ return 1;
+}
+
+cl_int FFTBinaryLookup::populateCache()
+{
+ // FIXME: support MSB
+ this->m_header.magic_key[0] = 'C';
+ this->m_header.magic_key[1] = 'L';
+ this->m_header.magic_key[2] = 'B';
+ this->m_header.magic_key[3] = '\0';
+
+ std::vector<unsigned char*> data;
+ cl_int err = getSingleBinaryFromProgram(this->m_program, data);
+
+ if (err != CL_SUCCESS)
+ {
+ return err;
+ }
+
+ this->m_header.header_size = sizeof(Header);
+ this->m_header.binary_size = data.size();
+ this->m_header.whole_file_size = this->m_header.header_size + this->m_header.binary_size + this->m_header.signature_size;
+
+ writeCacheFile(data); // ignore return code, because it does nothing if
+ // the file could not be written (i.e the current
+ // thread did not create the file
+
+ return CL_SUCCESS;
+}
+
+cl_int FFTBinaryLookup::buildFromSource(const char * source)
+{
+ cl_int err;
+ this->m_program = FFTBinaryLookup::buildProgramFromSource(source,
+ this->m_context,
+ this->m_device,
+ err);
+
+ if (err != CL_SUCCESS)
+ {
+ return err;
+ }
+
+ // write to the cache
+ this->populateCache();
+
+ return CL_SUCCESS;
+}
+
+cl_int FFTBinaryLookup::buildFromLoadedBinary(const void * data,
+ size_t len)
+{
+ cl_int err;
+ this->m_program = FFTBinaryLookup::buildProgramFromBinary((char*) data,
+ len,
+ this->m_context,
+ this->m_device,
+ err);
+
+ return err;
+}
+
+cl_int FFTBinaryLookup::buildFromBinary(const void * data,
+ size_t len)
+{
+ cl_int err = buildFromLoadedBinary(data, len);
+ if (err != CL_SUCCESS)
+ return err;
+
+ // write to the cache
+ this->populateCache();
+
+ return CL_SUCCESS;
+}
+
+cl_program FFTBinaryLookup::buildProgramFromSource(const char * source,
+ cl_context context,
+ cl_device_id device,
+ cl_int & err,
+ const char * options)
+{
+ cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &err);
+
+ if (err != CL_SUCCESS)
+ return NULL;
+
+ err = clBuildProgram(program,
+ 1, /* FIXME: 1 device */
+ &device,
+ options,
+ NULL,
+ NULL);
+
+ if (err != CL_SUCCESS)
+ return NULL;
+
+ return program;
+}
+
+
+
+cl_program FFTBinaryLookup::buildProgramFromBinary(const char * data,
+ size_t data_size,
+ cl_context context,
+ cl_device_id device,
+ cl_int & err,
+ const char * options)
+{
+ cl_program program = clCreateProgramWithBinary(context,
+ 1, // num_device
+ &device, // device_list
+ &data_size, // lengths
+ (const unsigned char **)&data,
+ NULL,
+ &err);
+ if (err != CL_SUCCESS)
+ {
+ // FIXME: emit an internal message for OPENCL errors
+ return NULL;
+ }
+
+ err = clBuildProgram(program,
+ 1, /* FIXME: 1 device */
+ &device,
+ options,
+ NULL,
+ NULL);
+
+ if (err != CL_SUCCESS)
+ {
+ return NULL;
+ }
+
+ return program;
+}
+
+cl_program FFTBinaryLookup::getProgram()
+{
+ return this->m_program;
+}
+
+void FFTBinaryLookup::setProgram(cl_program program, const char * source)
+{
+ this->m_program = program;
+ this->m_source = source;
+}
+
+
+static int make_directory(const std::string &path)
+{
+#ifdef _WIN32
+ return _mkdir (path.c_str());
+#else
+ return mkdir (path.c_str(), S_IRWXU);
+#endif
+}
+
+static void do_mkdir(const std::string &path)
+{
+ int st = make_directory (path.c_str());
+
+ if (st != 0)
+ {
+ if ( errno != EEXIST )
+ {
+ std::string tmp = "Cannot not create directory '" + std::string(path) + "': ";
+ throw tmp;
+ }
+ }
+}
+
+cl_int FFTBinaryLookup::retrieveDeviceAndDriverInfo()
+{
+ char m_device_vendor[SIZE];
+ char m_device_name[SIZE];
+ char m_driver_version[SIZE];
+
+ cl_int err = clGetDeviceInfo(this->m_device, CL_DEVICE_VENDOR, sizeof(m_device_vendor),
+ &m_device_vendor, NULL);
+ if (err != CL_SUCCESS)
+ {
+ return err;
+ }
+
+ err = clGetDeviceInfo(this->m_device, CL_DEVICE_NAME, sizeof(m_device_name),
+ &m_device_name, NULL);
+ if (err != CL_SUCCESS)
+ {
+ return err;
+ }
+
+ err = clGetDeviceInfo(this->m_device, CL_DRIVER_VERSION, sizeof(m_driver_version),
+ &m_driver_version, NULL);
+ if (err != CL_SUCCESS)
+ {
+ return err;
+ }
+
+#if CAPS_DEBUG
+ fprintf(stderr, "device vendor = %s\n", this->m_device_vendor);
+ fprintf(stderr, "device name = %s\n", this->m_device_name);
+ fprintf(stderr, "driver version = %s\n", this->m_driver_version);
+#endif
+
+ try
+ {
+ const std::string & root = (std::string(cache_path) + m_device_vendor + sep());
+ do_mkdir(root.c_str());
+
+ const std::string & root2 = (root + m_device_name + sep());
+ do_mkdir(root2.c_str());
+
+ const std::string & root3 = (root2 + m_driver_version + sep());
+ do_mkdir(root3.c_str());
+
+ const std::string & root4 = (root3 + this->m_cache_entry_name + sep());
+ do_mkdir(root4.c_str());
+
+ this->m_path = root4;
+
+ return CL_SUCCESS;
+ }
+ catch (std::string & e)
+ {
+ fprintf(stderr, "%s\n", e.c_str());
+ cache_enabled = false;
+ this->m_cache_enabled = false;
+
+ return CL_INVALID_VALUE;
+ }
+}
diff --git a/src/library/fft_binary_lookup.h b/src/library/fft_binary_lookup.h
new file mode 100644
index 0000000..2819751
--- /dev/null
+++ b/src/library/fft_binary_lookup.h
@@ -0,0 +1,276 @@
+/* ************************************************************************
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef __CLFFT_CLFFT_BINARY_LOOKUP__
+#define __CLFFT_CLFFT_BINARY_LOOKUP__
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <string>
+#include <vector>
+
+#include "generator.h"
+#include "plan.h"
+
+//
+// FFTBinaryLookup defines an API to manage the kernel cache on the disk
+//
+// The FFTBinaryLookup object provides methods to:
+// * check if a cache file exists on the disk or not
+// * fill-up the signature to characterize the program beeing built on the disk
+// * build a cl_program from a string kernel or from a binary
+//
+// A cache entry is a file stored on the disk which contains 3 sections:
+// * A header section (providing information about file structure)
+// * The binary contained in the cl_program
+// * A signature which provides additionnal informations about the kernel
+// and allows to characterize the kernel in the disk cache
+//
+// The environment variable CLFFT_CACHE_PATH defines the location of the
+// cache on the disk. If the variable CLFFT_CACHE_PATH is not defined, no
+// cache file is written on the disk, but the cl_program can be built and
+// remains on memory
+//
+// Concerning multithreading, the policy is that every thread build the
+// cl_program from the source, but only the first one writes it on the
+// disk. Other threads continue with the cl_program in memory.
+//
+// A typical cache query shall be composed of the following steps:
+//
+// (1) Create a local instance of FFTBinaryLookup
+//
+// (2) Specify the additional characteristics (i.e. variants) of the
+// requested program. Those information combined with the program
+// name and the OpenCL context and device shall form a unique
+// signature for the binary program.
+//
+// (3) Perform the effective search by calling the 'found' method
+//
+// (4) if the search was successfull then cl_program can be retreived
+// by a call to the 'getProgram' method
+//
+// (5) if the search was not successfull then a cl_program
+// must be created and populated in the cache by a call
+// to the 'setProgram' method.
+//
+// (6) Destroy the FFTBinaryLookup local instance.
+//
+// For instance, that could be
+//
+// cl_program program ;
+//
+// The program name is part of the signature and shall be unique
+// const char * program_name = "... my unique program name ... " ;
+//
+// FFTBinaryLookup bl(context, device, program_name);
+//
+// // Specify additionnal information used to build a
+// // signature signature for that cache entry
+//
+// bl.variantInt( vectorSize );
+// bl.variantInt( hasBorder );
+// ...
+//
+// // Perform the query
+// if ( bl.found() )
+// {
+// // Success! use the cl_program retreived from the cache
+// program = bl.getProgram();
+// }
+// else
+// {
+// // Failure! we need to build the program ourself
+// program = build_the_program(context,device,vectorSize,...) ;
+// // Inform the lookup object of the program
+// bl.setProgram(program);
+// // And populate the cache
+// bl.populateCache()
+// }
+//
+// Remark: The members buildFromSource, buildFromBinary etc are utility
+// functions that can be used to build the cl_program from either
+// sources or binary (e.g. SPIR). Their use is optionnal.
+//
+//
+class FFTBinaryLookup
+ {
+public:
+ // Constructor
+ // \param ctxt the context for which the cl_program should be built
+ // \param device the device for which the cl_program should be built
+ // \param kernel_name the kernel identifier
+ FFTBinaryLookup(const clfftGenerators gen, const clfftPlanHandle plHandle, cl_context ctxt, cl_device_id device);
+ ~FFTBinaryLookup();
+
+ // Methods to fill up the signature of the cache entry
+ void variantInt(int num);
+ void variantDouble(double num);
+ void variantCompileOptions(const std::string & opts);
+ void variantRaw(const void * data, size_t bytes);
+
+ // Indicates whether or not the cache entry was found on the disk
+ // If the cache entry was found and is complete on the disk, its content
+ // is loaded
+ // \return true if a cache entry was found, false else
+ bool found();
+
+ // Build a cl_program from the source code and init attributes
+ // of the current structure
+ // so that the program can be accessed with the getProgram method
+ // Write the file to the cache
+ cl_int buildFromSource(const char * source);
+
+ // Build a cl_program from the source code and init attributes
+ // so that the program can be accessed with the getProgram method
+ // Write the file to the cache
+ cl_int buildFromBinary(const void * data,
+ size_t len);
+
+ // Returns the cl_program built from binary or loaded from disk
+ cl_program getProgram();
+
+ // Set the current m_program to the given program
+ void setProgram(cl_program program, const char * source);
+
+ // Build a cl_program from a text
+ static cl_program buildProgramFromSource(const char * filename,
+ cl_context context,
+ cl_device_id device,
+ cl_int & err,
+ const char * options = 0);
+
+ // Build a cl_program from binary
+ static cl_program buildProgramFromBinary(const char * data,
+ size_t data_size,
+ cl_context context,
+ cl_device_id device,
+ cl_int & err,
+ const char * options = 0);
+
+ // Initialize the whole cache file information (magic_key, header and program)
+ // and dump on the disk
+ cl_int populateCache();
+
+private:
+
+ // Serialize variants and compute the checksum to load the file from cache
+ void finalizeVariant();
+
+ // Build a cl_program from the source code and init attributes
+ // so that the program can be accessed with the getProgram method
+ // Do not write the file to the cache
+ cl_int buildFromLoadedBinary(const void * data,
+ size_t len);
+
+ // Try to retrieve the header of the cache file
+ // Returns: ok if the header sections was successfully loaded, false else
+ bool loadHeader(std::ifstream &file, size_t length);
+
+ // Try to retrieve the cl_program and its signature in file
+ // Returns: ok if the binary and signature sections were successfully loaded, false else
+ bool loadBinaryAndSignature(std::ifstream &file);
+
+ // Try to create a file associated to the current program/variant in the cache folder
+ // Returns true if the file was successfully opened and loaded, false else
+ bool tryLoadCacheFile();
+
+ // Dump the file on the disk with the name stored in this->m_cache_entry_name
+ cl_int writeCacheFile(std::vector<unsigned char*> &data);
+
+ // Retrieve device name, device vendor and driver number by calling
+ // clGetDeviceInfo
+ cl_int retrieveDeviceAndDriverInfo();
+
+ // Cache entry name
+ std::string m_cache_entry_name;
+
+ // Path for the cache entry name
+ std::string m_path;
+
+ // Header structure of a cache entry
+ typedef struct Header_
+ {
+ char magic_key[4]; // = |C|L|F|\0, useful to know that we are loading a clfft cache entry
+ int whole_file_size; // the whole file of the size to know if the current file is complete or not
+ int header_size; // = sizeof(Header)
+ int binary_size; // kernel binary size
+ int signature_size; // variant information
+ } Header;
+
+ Header m_header;
+
+ cl_context m_context;
+ cl_device_id m_device;
+
+ cl_program m_program;
+
+ std::string m_source;
+
+ unsigned char * m_binary;
+ char * m_signature;
+
+ enum VariantKind
+ {
+ INT,
+ DOUBLE,
+ STRING,
+ DATA
+ };
+
+ struct Variant
+ {
+ Variant();
+ Variant(VariantKind kind, char * data, size_t size);
+
+ ~Variant();
+
+ VariantKind m_kind;
+ size_t m_size;
+ char * m_data;
+
+ static char * serialize(VariantKind kind, char * data, size_t size);
+ static Variant unserialize(char * data);
+ };
+
+ // Cache entry, useful to abstract Windows and linux
+ // cache entry file descriptor
+ struct CacheEntry
+ {
+ CacheEntry(const std::string & filename);
+ bool exclusive_create();
+ void close();
+ bool successful_creation();
+
+ private:
+ std::string m_filename;
+ bool m_successful_creation;
+ void * m_handle;
+ };
+
+ // Variants
+ std::vector<Variant> m_variants;
+
+ // Indicates whether the cache should be used or not
+ bool m_cache_enabled;
+};
+
+#undef SIZE
+
+#endif
diff --git a/src/library/generator.copy.cpp b/src/library/generator.copy.cpp
index 1221882..3315e0d 100644
--- a/src/library/generator.copy.cpp
+++ b/src/library/generator.copy.cpp
@@ -19,6 +19,71 @@
#include <math.h>
#include <list>
#include "generator.stockham.h"
+#include "action.h"
+
+FFTGeneratedCopyAction::FFTGeneratedCopyAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+ : FFTCopyAction(plHandle, plan, queue, err)
+{
+ if (err != CLFFT_SUCCESS)
+ {
+ // FFTCopyAction() failed, exit
+ fprintf(stderr, "FFTCopyAction() failed!\n");
+ return;
+ }
+
+ // Initialize the FFTAction::FFTKernelGenKeyParams member
+ err = this->initParams();
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedCopyAction::initParams() failed!\n");
+ return;
+ }
+
+ FFTRepo &fftRepo = FFTRepo::getInstance();
+
+ err = this->generateKernel(fftRepo, queue);
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedCopyAction::generateKernel failed\n");
+ return;
+ }
+
+ err = compileKernels( queue, plHandle, plan);
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedCopyAction::compileKernels failed\n");
+ return;
+ }
+
+ err = CLFFT_SUCCESS;
+}
+
+bool FFTGeneratedCopyAction::buildForwardKernel()
+{
+ clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+ clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+ bool r2c_transform = (inputLayout == CLFFT_REAL);
+ bool h2c = (inputLayout == CLFFT_HERMITIAN_PLANAR) || (inputLayout == CLFFT_HERMITIAN_INTERLEAVED);
+ bool c2h = (outputLayout == CLFFT_HERMITIAN_PLANAR) || (outputLayout == CLFFT_HERMITIAN_INTERLEAVED);
+
+ return (r2c_transform || c2h) || (!(h2c || c2h));
+}
+
+bool FFTGeneratedCopyAction::buildBackwardKernel()
+{
+ clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+ clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+ bool c2r_transform = (outputLayout == CLFFT_REAL);
+ bool h2c = (inputLayout == CLFFT_HERMITIAN_PLANAR) || (inputLayout == CLFFT_HERMITIAN_INTERLEAVED);
+ bool c2h = (outputLayout == CLFFT_HERMITIAN_PLANAR) || (outputLayout == CLFFT_HERMITIAN_INTERLEAVED);
+
+ return (c2r_transform || h2c) || (!(h2c || c2h));
+}
using namespace StockhamGenerator;
@@ -30,7 +95,7 @@ namespace CopyGenerator
{
size_t N;
size_t Nt;
- const FFTKernelGenKeyParams params;
+ const FFTGeneratedCopyAction::Signature & params;
bool h2c, c2h;
bool general;
@@ -59,7 +124,7 @@ namespace CopyGenerator
}
public:
- CopyKernel( const FFTKernelGenKeyParams ¶msVal) :
+ CopyKernel( const FFTGeneratedCopyAction::Signature ¶msVal) :
params(paramsVal)
{
@@ -338,66 +403,58 @@ namespace CopyGenerator
};
-template<>
-clfftStatus FFTPlan::GetKernelGenKeyPvt<Copy> (FFTKernelGenKeyParams & params) const
+clfftStatus FFTGeneratedCopyAction::initParams ()
{
// Query the devices in this context for their local memory sizes
// How we generate a kernel depends on the *minimum* LDS size for all devices.
//
const FFTEnvelope * pEnvelope = NULL;
- OPENCL_V(const_cast<FFTPlan*>(this)->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
+ OPENCL_V(this->plan->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
BUG_CHECK (NULL != pEnvelope);
- ::memset( ¶ms, 0, sizeof( params ) );
- params.fft_precision = this->precision;
- params.fft_placeness = this->placeness;
- params.fft_inputLayout = this->inputLayout;
- params.fft_MaxWorkGroupSize = this->envelope.limit_WorkGroupSize;
+ this->signature.fft_precision = this->plan->precision;
+ this->signature.fft_placeness = this->plan->placeness;
+ this->signature.fft_inputLayout = this->plan->inputLayout;
+ this->signature.fft_MaxWorkGroupSize = this->plan->envelope.limit_WorkGroupSize;
- ARG_CHECK (this->inStride.size() == this->outStride.size())
+ ARG_CHECK (this->plan->inStride.size() == this->plan->outStride.size())
- params.fft_outputLayout = this->outputLayout;
+ this->signature.fft_outputLayout = this->plan->outputLayout;
- params.fft_DataDim = this->length.size() + 1;
+ this->signature.fft_DataDim = this->plan->length.size() + 1;
int i = 0;
- for(i = 0; i < (params.fft_DataDim - 1); i++)
+ for(i = 0; i < (this->signature.fft_DataDim - 1); i++)
{
- params.fft_N[i] = this->length[i];
- params.fft_inStride[i] = this->inStride[i];
- params.fft_outStride[i] = this->outStride[i];
+ this->signature.fft_N[i] = this->plan->length[i];
+ this->signature.fft_inStride[i] = this->plan->inStride[i];
+ this->signature.fft_outStride[i] = this->plan->outStride[i];
}
- params.fft_inStride[i] = this->iDist;
- params.fft_outStride[i] = this->oDist;
+ this->signature.fft_inStride[i] = this->plan->iDist;
+ this->signature.fft_outStride[i] = this->plan->oDist;
- params.fft_fwdScale = this->forwardScale;
- params.fft_backScale = this->backwardScale;
+ this->signature.fft_fwdScale = this->plan->forwardScale;
+ this->signature.fft_backScale = this->plan->backwardScale;
return CLFFT_SUCCESS;
}
-template<>
-clfftStatus FFTPlan::GetWorkSizesPvt<Copy> (std::vector<size_t> & globalWS, std::vector<size_t> & localWS) const
+clfftStatus FFTGeneratedCopyAction::getWorkSizes (std::vector<size_t> & globalWS, std::vector<size_t> & localWS)
{
- FFTKernelGenKeyParams fftParams;
- OPENCL_V( this->GetKernelGenKeyPvt<Copy>( fftParams ), _T("GetKernelGenKey() failed!") );
-
bool h2c, c2h;
- h2c = ( (fftParams.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) ||
- (fftParams.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
- c2h = ( (fftParams.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) ||
- (fftParams.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+ h2c = ( (this->signature.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) || (this->signature.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) );
+ c2h = ( (this->signature.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) || (this->signature.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) );
bool general = !(h2c || c2h);
- size_t count = this->batchsize;
+ size_t count = this->plan->batchsize;
- switch(fftParams.fft_DataDim)
+ switch(this->signature.fft_DataDim)
{
case 5: assert(false);
- case 4: count *= fftParams.fft_N[2];
- case 3: count *= fftParams.fft_N[1];
+ case 4: count *= this->signature.fft_N[2];
+ case 3: count *= this->signature.fft_N[1];
case 2:
{
if(general)
@@ -406,7 +463,7 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Copy> (std::vector<size_t> & globalWS, std:
}
else
{
- count *= (1 + fftParams.fft_N[0]/2);
+ count *= (1 + this->signature.fft_N[0]/2);
}
}
break;
@@ -419,40 +476,30 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Copy> (std::vector<size_t> & globalWS, std:
return CLFFT_SUCCESS;
}
-template<>
-clfftStatus FFTPlan::GetMax1DLengthPvt<Copy> (size_t * longest) const
-{
- return FFTPlan::GetMax1DLengthPvt<Stockham>(longest);
-}
using namespace CopyGenerator;
-template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
+clfftStatus FFTGeneratedCopyAction::generateKernel(FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
{
- FFTKernelGenKeyParams params;
- OPENCL_V( this->GetKernelGenKeyPvt<Copy> (params), _T("GetKernelGenKey() failed!") );
bool h2c, c2h;
- h2c = ( (params.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) ||
- (params.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
- c2h = ( (params.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) ||
- (params.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ) ? true : false;
+ h2c = ( (this->signature.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) || (this->signature.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED) );
+ c2h = ( (this->signature.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) || (this->signature.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED) );
bool general = !(h2c || c2h);
std::string programCode;
- Precision pr = (params.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
+ Precision pr = (this->signature.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
switch(pr)
{
case P_SINGLE:
{
- CopyKernel<P_SINGLE> kernel(params);
+ CopyKernel<P_SINGLE> kernel(this->signature);
kernel.GenerateKernel(programCode);
} break;
case P_DOUBLE:
{
- CopyKernel<P_DOUBLE> kernel(params);
+ CopyKernel<P_DOUBLE> kernel(this->signature);
kernel.GenerateKernel(programCode);
} break;
}
@@ -466,15 +513,15 @@ clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo, const cl_command_
status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
- OPENCL_V( fftRepo.setProgramCode( Copy, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+ OPENCL_V( fftRepo.setProgramCode( this->getGenerator(), this->getSignatureData(), programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
if(general)
{
- OPENCL_V( fftRepo.setProgramEntryPoints( Copy, params, "copy_general", "copy_general", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+ OPENCL_V( fftRepo.setProgramEntryPoints( this->getGenerator(), this->getSignatureData(), "copy_general", "copy_general", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
}
else
{
- OPENCL_V( fftRepo.setProgramEntryPoints( Copy, params, "copy_c2h", "copy_h2c", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+ OPENCL_V( fftRepo.setProgramEntryPoints( this->getGenerator(), this->getSignatureData(), "copy_c2h", "copy_h2c", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
}
return CLFFT_SUCCESS;
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 08f173c..bd45288 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -19,6 +19,73 @@
#include <math.h>
#include "generator.stockham.h"
#include <list>
+#include "action.h"
+
+
+FFTGeneratedStockhamAction::FFTGeneratedStockhamAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+ : FFTStockhamAction(plHandle, plan, queue, err)
+{
+ if (err != CLFFT_SUCCESS)
+ {
+ // FFTAction() failed, exit
+ fprintf(stderr, "FFTStockhamAction() failed!\n");
+ return;
+ }
+
+ // Initialize the FFTAction::FFTKernelGenKeyParams member
+ err = this->initParams();
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedStockhamAction::initParams() failed!\n");
+ return;
+ }
+
+ FFTRepo &fftRepo = FFTRepo::getInstance();
+
+ err = this->generateKernel(fftRepo, queue);
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedStockhamAction::generateKernel failed\n");
+ return;
+ }
+
+ err = compileKernels( queue, plHandle, plan);
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedStockhamAction::compileKernels failed\n");
+ return;
+ }
+
+ err = CLFFT_SUCCESS;
+}
+
+bool FFTGeneratedStockhamAction::buildForwardKernel()
+{
+ clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+ clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+ bool r2c_transform = (inputLayout == CLFFT_REAL);
+ bool c2r_transform = (outputLayout == CLFFT_REAL);
+ bool real_transform = (r2c_transform || c2r_transform);
+
+ return (!real_transform) || r2c_transform;
+}
+
+bool FFTGeneratedStockhamAction::buildBackwardKernel()
+{
+ clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+ clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+ bool r2c_transform = (inputLayout == CLFFT_REAL);
+ bool c2r_transform = (outputLayout == CLFFT_REAL);
+ bool real_transform = (r2c_transform || c2r_transform);
+
+ return (!real_transform) || c2r_transform;
+}
+
// FFT Stockham Autosort Method
//
@@ -3182,64 +3249,64 @@ namespace StockhamGenerator
using namespace StockhamGenerator;
-template<>
-clfftStatus FFTPlan::GetKernelGenKeyPvt<Stockham> (FFTKernelGenKeyParams & params) const
+clfftStatus FFTGeneratedStockhamAction::initParams ()
{
// Query the devices in this context for their local memory sizes
// How we generate a kernel depends on the *minimum* LDS size for all devices.
//
const FFTEnvelope * pEnvelope = NULL;
- OPENCL_V(const_cast<FFTPlan*>(this)->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
+ OPENCL_V(this->plan->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
BUG_CHECK (NULL != pEnvelope);
- ::memset( ¶ms, 0, sizeof( params ) );
- params.fft_precision = this->precision;
- params.fft_placeness = this->placeness;
- params.fft_inputLayout = this->inputLayout;
- params.fft_MaxWorkGroupSize = this->envelope.limit_WorkGroupSize;
+ // Remainder: params was properly cleared by its constructor
+ // clearing it again would destroy datasize and id!!
+ this->signature.fft_precision = this->plan->precision;
+ this->signature.fft_placeness = this->plan->placeness;
+ this->signature.fft_inputLayout = this->plan->inputLayout;
+ this->signature.fft_MaxWorkGroupSize = this->plan->envelope.limit_WorkGroupSize;
- ARG_CHECK(this->length.size() > 0);
- ARG_CHECK(this->inStride.size() > 0);
- ARG_CHECK(this->outStride.size() > 0);
+ ARG_CHECK(this->plan->length.size() > 0);
+ ARG_CHECK(this->plan->inStride.size() > 0);
+ ARG_CHECK(this->plan->outStride.size() > 0);
- ARG_CHECK (this->inStride.size() == this->outStride.size())
+ ARG_CHECK (this->plan->inStride.size() == this->plan->outStride.size())
- bool real_transform = ((this->inputLayout == CLFFT_REAL) || (this->outputLayout == CLFFT_REAL));
+ bool real_transform = ((this->plan->inputLayout == CLFFT_REAL) || (this->plan->outputLayout == CLFFT_REAL));
- if ( (CLFFT_INPLACE == this->placeness) && (!real_transform) ) {
+ if ( (CLFFT_INPLACE == this->plan->placeness) && (!real_transform) ) {
// If this is an in-place transform the
// input and output layout, dimensions and strides
// *MUST* be the same.
//
- ARG_CHECK (this->inputLayout == this->outputLayout)
- params.fft_outputLayout = this->inputLayout;
- for (size_t u = this->inStride.size(); u-- > 0; ) {
- ARG_CHECK (this->inStride[u] == this->outStride[u]);
+ ARG_CHECK (this->plan->inputLayout == this->plan->outputLayout)
+ this->signature.fft_outputLayout = this->plan->inputLayout;
+ for (size_t u = this->plan->inStride.size(); u-- > 0; ) {
+ ARG_CHECK (this->plan->inStride[u] == this->plan->outStride[u]);
}
} else {
- params.fft_outputLayout = this->outputLayout;
+ this->signature.fft_outputLayout = this->plan->outputLayout;
}
- params.fft_DataDim = this->length.size() + 1;
+ this->signature.fft_DataDim = this->plan->length.size() + 1;
int i = 0;
- for(i = 0; i < (params.fft_DataDim - 1); i++)
+ for(i = 0; i < (this->signature.fft_DataDim - 1); i++)
{
- params.fft_N[i] = this->length[i];
- params.fft_inStride[i] = this->inStride[i];
- params.fft_outStride[i] = this->outStride[i];
+ this->signature.fft_N[i] = this->plan->length[i];
+ this->signature.fft_inStride[i] = this->plan->inStride[i];
+ this->signature.fft_outStride[i] = this->plan->outStride[i];
}
- params.fft_inStride[i] = this->iDist;
- params.fft_outStride[i] = this->oDist;
+ this->signature.fft_inStride[i] = this->plan->iDist;
+ this->signature.fft_outStride[i] = this->plan->oDist;
- params.fft_RCsimple = this->RCsimple;
+ this->signature.fft_RCsimple = this->plan->RCsimple;
- params.blockCompute = this->blockCompute;
- params.blockComputeType = this->blockComputeType;
+ this->signature.blockCompute = this->plan->blockCompute;
+ this->signature.blockComputeType = this->plan->blockComputeType;
- params.fft_twiddleFront = this->twiddleFront;
+ this->signature.fft_twiddleFront = this->plan->twiddleFront;
size_t wgs, nt;
#ifdef PARMETERS_TO_BE_READ
@@ -3249,106 +3316,101 @@ clfftStatus FFTPlan::GetKernelGenKeyPvt<Stockham> (FFTKernelGenKeyParams & param
nt = pr.numTransformsPerWg;
#else
size_t t_wgs, t_nt;
- Precision pr = (params.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
+ Precision pr = (this->signature.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
switch(pr)
{
case P_SINGLE:
{
KernelCoreSpecs<P_SINGLE> kcs;
- kcs.GetWGSAndNT(params.fft_N[0], t_wgs, t_nt);
- if(params.blockCompute)
+ kcs.GetWGSAndNT(this->signature.fft_N[0], t_wgs, t_nt);
+ if(this->signature.blockCompute)
{
- params.blockSIMD = Kernel<P_SINGLE>::BlockSizes::BlockWorkGroupSize(params.fft_N[0]);
- params.blockLDS = Kernel<P_SINGLE>::BlockSizes::BlockLdsSize(params.fft_N[0]);
+ this->signature.blockSIMD = Kernel<P_SINGLE>::BlockSizes::BlockWorkGroupSize(this->signature.fft_N[0]);
+ this->signature.blockLDS = Kernel<P_SINGLE>::BlockSizes::BlockLdsSize(this->signature.fft_N[0]);
}
} break;
case P_DOUBLE:
{
KernelCoreSpecs<P_DOUBLE> kcs;
- kcs.GetWGSAndNT(params.fft_N[0], t_wgs, t_nt);
- if(params.blockCompute)
+ kcs.GetWGSAndNT(this->signature.fft_N[0], t_wgs, t_nt);
+ if(this->signature.blockCompute)
{
- params.blockSIMD = Kernel<P_DOUBLE>::BlockSizes::BlockWorkGroupSize(params.fft_N[0]);
- params.blockLDS = Kernel<P_DOUBLE>::BlockSizes::BlockLdsSize(params.fft_N[0]);
+ this->signature.blockSIMD = Kernel<P_DOUBLE>::BlockSizes::BlockWorkGroupSize(this->signature.fft_N[0]);
+ this->signature.blockLDS = Kernel<P_DOUBLE>::BlockSizes::BlockLdsSize(this->signature.fft_N[0]);
}
} break;
}
- if((t_wgs != 0) && (t_nt != 0) && (this->envelope.limit_WorkGroupSize >= 256))
+ if((t_wgs != 0) && (t_nt != 0) && (this->plan->envelope.limit_WorkGroupSize >= 256))
{
wgs = t_wgs;
nt = t_nt;
}
else
- DetermineSizes(this->envelope.limit_WorkGroupSize, params.fft_N[0], wgs, nt);
+ DetermineSizes(this->plan->envelope.limit_WorkGroupSize, this->signature.fft_N[0], wgs, nt);
#endif
- assert((nt * params.fft_N[0]) >= wgs);
- assert((nt * params.fft_N[0])%wgs == 0);
+ assert((nt * this->signature.fft_N[0]) >= wgs);
+ assert((nt * this->signature.fft_N[0])%wgs == 0);
- params.fft_R = (nt * params.fft_N[0])/wgs;
- params.fft_SIMD = wgs;
+ this->signature.fft_R = (nt * this->signature.fft_N[0])/wgs;
+ this->signature.fft_SIMD = wgs;
- if (this->large1D != 0) {
- ARG_CHECK (params.fft_N[0] != 0)
- ARG_CHECK ((this->large1D % params.fft_N[0]) == 0)
- params.fft_3StepTwiddle = true;
- ARG_CHECK ( this->large1D == (params.fft_N[1] * params.fft_N[0]) );
+ if (this->plan->large1D != 0) {
+ ARG_CHECK (this->signature.fft_N[0] != 0)
+ ARG_CHECK ((this->plan->large1D % this->signature.fft_N[0]) == 0)
+ this->signature.fft_3StepTwiddle = true;
+ ARG_CHECK ( this->plan->large1D == (this->signature.fft_N[1] * this->signature.fft_N[0]) );
}
- params.fft_fwdScale = this->forwardScale;
- params.fft_backScale = this->backwardScale;
+ this->signature.fft_fwdScale = this->plan->forwardScale;
+ this->signature.fft_backScale = this->plan->backwardScale;
return CLFFT_SUCCESS;
}
-template<>
-clfftStatus FFTPlan::GetWorkSizesPvt<Stockham> (std::vector<size_t> & globalWS, std::vector<size_t> & localWS) const
+clfftStatus FFTGeneratedStockhamAction::getWorkSizes (std::vector<size_t> & globalWS, std::vector<size_t> & localWS)
{
// How many complex numbers in the input mutl-dimensional array?
//
unsigned long long count = 1;
- for (unsigned u = 0; u < length.size(); ++u) {
- count *= std::max<size_t> (1, this->length[ u ]);
+ for (unsigned u = 0; u < this->plan->length.size(); ++u) {
+ count *= std::max<size_t> (1, this->plan->length[ u ]);
}
- count *= this->batchsize;
+ count *= this->plan->batchsize;
- FFTKernelGenKeyParams fftParams;
- // Translate the user plan into the structure that we use to map plans to clPrograms
- OPENCL_V( this->GetKernelGenKeyPvt<Stockham>( fftParams ), _T("GetKernelGenKey() failed!") );
- if(fftParams.blockCompute)
+ if(this->signature.blockCompute)
{
- count = DivRoundingUp<unsigned long long> (count, fftParams.blockLDS);
- count = count * fftParams.blockSIMD;
+ count = DivRoundingUp<unsigned long long> (count, this->signature.blockLDS);
+ count = count * this->signature.blockSIMD;
globalWS.push_back( static_cast< size_t >( count ) );
- localWS.push_back( fftParams.blockSIMD );
+ localWS.push_back( this->signature.blockSIMD );
return CLFFT_SUCCESS;
}
- count = DivRoundingUp<unsigned long long> (count, fftParams.fft_R); // count of WorkItems
- count = DivRoundingUp<unsigned long long> (count, fftParams.fft_SIMD); // count of WorkGroups
+ count = DivRoundingUp<unsigned long long> (count, this->signature.fft_R); // count of WorkItems
+ count = DivRoundingUp<unsigned long long> (count, this->signature.fft_SIMD); // count of WorkGroups
// for real transforms we only need half the work groups since we do twice the work in 1 work group
- if( !(fftParams.fft_RCsimple) && ((fftParams.fft_inputLayout == CLFFT_REAL) || (fftParams.fft_outputLayout == CLFFT_REAL)) )
+ if( !(this->signature.fft_RCsimple) && ((this->signature.fft_inputLayout == CLFFT_REAL) || (this->signature.fft_outputLayout == CLFFT_REAL)) )
count = DivRoundingUp<unsigned long long> (count, 2);
- count = std::max<unsigned long long> (count, 1) * fftParams.fft_SIMD;
+ count = std::max<unsigned long long> (count, 1) * this->signature.fft_SIMD;
// .. count of WorkItems, rounded up to next multiple of fft_SIMD.
// 1 dimension work group size
globalWS.push_back( static_cast< size_t >( count ) );
- localWS.push_back( fftParams.fft_SIMD );
+ localWS.push_back( this->signature.fft_SIMD );
return CLFFT_SUCCESS;
}
-template<>
-clfftStatus FFTPlan::GetMax1DLengthPvt<Stockham> (size_t * longest) const
+clfftStatus FFTPlan::GetMax1DLengthStockham (size_t * longest) const
{
// TODO The caller has already acquired the lock on *this
// However, we shouldn't depend on it.
@@ -3370,12 +3432,8 @@ clfftStatus FFTPlan::GetMax1DLengthPvt<Stockham> (size_t * longest) const
return CLFFT_SUCCESS;
}
-template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
+clfftStatus FFTGeneratedStockhamAction::generateKernel(FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
{
- FFTKernelGenKeyParams params;
- OPENCL_V( this->GetKernelGenKeyPvt<Stockham> (params), _T("GetKernelGenKey() failed!") );
-
cl_int status = CL_SUCCESS;
cl_device_id Device = NULL;
status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
@@ -3386,17 +3444,17 @@ clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_comm
OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
std::string programCode;
- Precision pr = (params.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
+ Precision pr = (this->signature.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
switch(pr)
{
case P_SINGLE:
{
- Kernel<P_SINGLE> kernel(params);
+ Kernel<P_SINGLE> kernel(this->signature);
kernel.GenerateKernel(programCode, Device);
} break;
case P_DOUBLE:
{
- Kernel<P_DOUBLE> kernel(params);
+ Kernel<P_DOUBLE> kernel(this->signature);
kernel.GenerateKernel(programCode, Device);
} break;
}
@@ -3405,8 +3463,8 @@ clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_comm
ReadKernelFromFile(programCode);
#endif
- OPENCL_V( fftRepo.setProgramCode( Stockham, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
- OPENCL_V( fftRepo.setProgramEntryPoints( Stockham, params, "fft_fwd", "fft_back", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+ OPENCL_V( fftRepo.setProgramCode( this->getGenerator(), this->getSignatureData(), programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+ OPENCL_V( fftRepo.setProgramEntryPoints( this->getGenerator(), this->getSignatureData(), "fft_fwd", "fft_back", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
return CLFFT_SUCCESS;
}
diff --git a/src/library/generator.transpose.gcn.cpp b/src/library/generator.transpose.gcn.cpp
index 1d4a46d..c1a3952 100644
--- a/src/library/generator.transpose.gcn.cpp
+++ b/src/library/generator.transpose.gcn.cpp
@@ -28,6 +28,74 @@
#include "generator.transpose.gcn.h"
#include "generator.stockham.h"
+#include "action.h"
+
+FFTGeneratedTransposeGCNAction::FFTGeneratedTransposeGCNAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+ : FFTTransposeGCNAction(plHandle, plan, queue, err)
+{
+ if (err != CLFFT_SUCCESS)
+ {
+ // FFTTransposeGCNAction() failed, exit
+ fprintf(stderr, "FFTTransposeGCNAction() failed!\n");
+ return;
+ }
+
+ // Initialize the FFTAction::FFTKernelGenKeyParams member
+ err = this->initParams();
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedTransposeGCNAction::initParams() failed!\n");
+ return;
+ }
+
+ FFTRepo &fftRepo = FFTRepo::getInstance();
+
+ err = this->generateKernel(fftRepo, queue);
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedTransposeGCNAction::generateKernel failed\n");
+ return;
+ }
+
+ err = compileKernels( queue, plHandle, plan);
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedTransposeGCNAction::compileKernels failed\n");
+ return;
+ }
+
+ err = CLFFT_SUCCESS;
+}
+
+
+bool FFTGeneratedTransposeGCNAction::buildForwardKernel()
+{
+ clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+ clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+ bool r2c_transform = (inputLayout == CLFFT_REAL);
+ bool c2r_transform = (outputLayout == CLFFT_REAL);
+ bool real_transform = (r2c_transform || c2r_transform);
+
+ return (!real_transform) || r2c_transform;
+}
+
+bool FFTGeneratedTransposeGCNAction::buildBackwardKernel()
+{
+ clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+ clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+ bool r2c_transform = (inputLayout == CLFFT_REAL);
+ bool c2r_transform = (outputLayout == CLFFT_REAL);
+ bool real_transform = (r2c_transform || c2r_transform);
+
+ return (!real_transform) || c2r_transform;
+}
+
+
// A structure that represents a bounding box or tile, with convenient names for the row and column addresses
// local work sizes
struct tile
@@ -142,7 +210,7 @@ const std::string pmImagOut( "pmImagOut" );
const std::string pmComplexIn( "pmComplexIn" );
const std::string pmComplexOut( "pmComplexOut" );
-static clfftStatus genTransposePrototype( const FFTKernelGenKeyParams& params, const tile& lwSize, const std::string& dtPlanar, const std::string& dtComplex,
+static clfftStatus genTransposePrototype( const FFTGeneratedTransposeGCNAction::Signature & params, const tile& lwSize, const std::string& dtPlanar, const std::string& dtComplex,
const std::string &funcName, std::stringstream& transKernel, std::string& dtInput, std::string& dtOutput )
{
@@ -231,7 +299,7 @@ static clfftStatus genTransposePrototype( const FFTKernelGenKeyParams& params, c
return CLFFT_SUCCESS;
}
-static clfftStatus genTransposeKernel( const FFTKernelGenKeyParams& params, std::string& strKernel, const tile& lwSize, const size_t reShapeFactor,
+static clfftStatus genTransposeKernel( const FFTGeneratedTransposeGCNAction::Signature & params, std::string& strKernel, const tile& lwSize, const size_t reShapeFactor,
const size_t loopCount, const size_t outRowPadding )
{
strKernel.reserve( 4096 );
@@ -497,67 +565,67 @@ static clfftStatus genTransposeKernel( const FFTKernelGenKeyParams& params, std:
return CLFFT_SUCCESS;
}
-template<>
-clfftStatus FFTPlan::GetKernelGenKeyPvt<Transpose_GCN> (FFTKernelGenKeyParams & params) const
+
+clfftStatus FFTGeneratedTransposeGCNAction::initParams ()
{
- ::memset( ¶ms, 0, sizeof( params ) );
- params.fft_precision = this->precision;
- params.fft_placeness = this->placeness;
- params.fft_inputLayout = this->inputLayout;
- params.fft_outputLayout = this->outputLayout;
- params.fft_3StepTwiddle = false;
-
- params.transOutHorizontal = this->transOutHorizontal; // using the twiddle front flag to specify horizontal write
+
+ this->signature.fft_precision = this->plan->precision;
+ this->signature.fft_placeness = this->plan->placeness;
+ this->signature.fft_inputLayout = this->plan->inputLayout;
+ this->signature.fft_outputLayout = this->plan->outputLayout;
+ this->signature.fft_3StepTwiddle = false;
+
+ this->signature.transOutHorizontal = this->plan->transOutHorizontal; // using the twiddle front flag to specify horizontal write
// we do this so as to reuse flags in FFTKernelGenKeyParams
// and to avoid making a new one
- ARG_CHECK( this->inStride.size( ) == this->outStride.size( ) );
+ ARG_CHECK( this->plan->inStride.size( ) == this->plan->outStride.size( ) );
- if( CLFFT_INPLACE == params.fft_placeness )
+ if( CLFFT_INPLACE == this->signature.fft_placeness )
{
// If this is an in-place transform the
// input and output layout, dimensions and strides
// *MUST* be the same.
//
- ARG_CHECK( params.fft_inputLayout == params.fft_outputLayout )
+ ARG_CHECK( this->signature.fft_inputLayout == this->signature.fft_outputLayout )
- for( size_t u = this->inStride.size(); u-- > 0; )
+ for( size_t u = this->plan->inStride.size(); u-- > 0; )
{
- ARG_CHECK( this->inStride[u] == this->outStride[u] );
+ ARG_CHECK( this->plan->inStride[u] == this->plan->outStride[u] );
}
}
- params.fft_DataDim = this->length.size() + 1;
+ this->signature.fft_DataDim = this->plan->length.size() + 1;
int i = 0;
- for(i = 0; i < (params.fft_DataDim - 1); i++)
+ for(i = 0; i < (this->signature.fft_DataDim - 1); i++)
{
- params.fft_N[i] = this->length[i];
- params.fft_inStride[i] = this->inStride[i];
- params.fft_outStride[i] = this->outStride[i];
+ this->signature.fft_N[i] = this->plan->length[i];
+ this->signature.fft_inStride[i] = this->plan->inStride[i];
+ this->signature.fft_outStride[i] = this->plan->outStride[i];
}
- params.fft_inStride[i] = this->iDist;
- params.fft_outStride[i] = this->oDist;
-
- if (this->large1D != 0) {
- ARG_CHECK (params.fft_N[0] != 0)
- ARG_CHECK ((this->large1D % params.fft_N[0]) == 0)
- params.fft_3StepTwiddle = true;
- ARG_CHECK ( this->large1D == (params.fft_N[1] * params.fft_N[0]) );
+ this->signature.fft_inStride[i] = this->plan->iDist;
+ this->signature.fft_outStride[i] = this->plan->oDist;
+
+ if (this->plan->large1D != 0) {
+ ARG_CHECK (this->signature.fft_N[0] != 0)
+ ARG_CHECK ((this->plan->large1D % this->signature.fft_N[0]) == 0)
+ this->signature.fft_3StepTwiddle = true;
+ ARG_CHECK ( this->plan->large1D == (this->signature.fft_N[1] * this->signature.fft_N[0]) );
}
// Query the devices in this context for their local memory sizes
// How we generate a kernel depends on the *minimum* LDS size for all devices.
//
const FFTEnvelope * pEnvelope = NULL;
- OPENCL_V( this->GetEnvelope( &pEnvelope ), _T( "GetEnvelope failed" ) );
+ OPENCL_V( this->plan->GetEnvelope( &pEnvelope ), _T( "GetEnvelope failed" ) );
BUG_CHECK( NULL != pEnvelope );
// TODO: Since I am going with a 2D workgroup size now, I need a better check than this 1D use
// Check: CL_DEVICE_MAX_WORK_GROUP_SIZE/CL_KERNEL_WORK_GROUP_SIZE
// CL_DEVICE_MAX_WORK_ITEM_SIZES
- params.fft_R = 1; // Dont think i'll use
- params.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
+ this->signature.fft_R = 1; // Dont think i'll use
+ this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
return CLFFT_SUCCESS;
}
@@ -577,13 +645,10 @@ size_t loopCount = 0;
// OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
// Feed this generator the FFTPlan, and it returns the generated program as a string
-template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Transpose_GCN> ( FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
+clfftStatus FFTGeneratedTransposeGCNAction::generateKernel ( FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
{
- FFTKernelGenKeyParams params;
- OPENCL_V( this->GetKernelGenKeyPvt<Transpose_GCN>( params ), _T( "GetKernelGenKey() failed!" ) );
- switch( params.fft_precision )
+ switch( this->signature.fft_precision )
{
case CLFFT_SINGLE:
case CLFFT_SINGLE_FAST:
@@ -600,7 +665,7 @@ clfftStatus FFTPlan::GenerateKernelPvt<Transpose_GCN> ( FFTRepo& fftRepo, const
}
std::string programCode;
- OPENCL_V( genTransposeKernel( params, programCode, lwSize, reShapeFactor, loopCount, outRowPadding ), _T( "GenerateTransposeKernel() failed!" ) );
+ OPENCL_V( genTransposeKernel( this->signature, programCode, lwSize, reShapeFactor, loopCount, outRowPadding ), _T( "GenerateTransposeKernel() failed!" ) );
cl_int status = CL_SUCCESS;
cl_device_id Device = NULL;
@@ -612,39 +677,38 @@ clfftStatus FFTPlan::GenerateKernelPvt<Transpose_GCN> ( FFTRepo& fftRepo, const
OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
- OPENCL_V( fftRepo.setProgramCode( Transpose_GCN, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+ OPENCL_V( fftRepo.setProgramCode( Transpose_GCN, this->getSignatureData(), programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
// Note: See genFunctionPrototype( )
- if( params.fft_3StepTwiddle )
+ if( this->signature.fft_3StepTwiddle )
{
- OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_GCN, params, "transpose_gcn_tw_fwd", "transpose_gcn_tw_back", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+ OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_GCN, this->getSignatureData(), "transpose_gcn_tw_fwd", "transpose_gcn_tw_back", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
}
else
{
- OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_GCN, params, "transpose_gcn", "transpose_gcn", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+ OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_GCN, this->getSignatureData(), "transpose_gcn", "transpose_gcn", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
}
return CLFFT_SUCCESS;
}
-template<>
-clfftStatus FFTPlan::GetWorkSizesPvt<Transpose_GCN>( std::vector< size_t >& globalWS, std::vector< size_t >& localWS ) const
+
+clfftStatus FFTGeneratedTransposeGCNAction::getWorkSizes( std::vector< size_t >& globalWS, std::vector< size_t >& localWS )
{
- FFTKernelGenKeyParams parameters;
- OPENCL_V( this->GetKernelGenKeyPvt<Transpose_GCN>( parameters ), _T( "GetKernelGenKey() failed!" ) );
+
// We need to make sure that the global work size is evenly divisible by the local work size
// Our transpose works in tiles, so divide tiles in each dimension to get count of blocks, rounding up for remainder items
- size_t numBlocksX = NumBlocksX(parameters.fft_N[ 0 ]);
- size_t numBlocksY = DivRoundingUp( parameters.fft_N[ 1 ], lwSize.y / reShapeFactor * loopCount );
+ size_t numBlocksX = NumBlocksX(this->signature.fft_N[ 0 ]);
+ size_t numBlocksY = DivRoundingUp( this->signature.fft_N[ 1 ], lwSize.y / reShapeFactor * loopCount );
size_t numWIX = numBlocksX * lwSize.x;
// Batches of matrices are lined up along the Y axis, 1 after the other
- size_t numWIY = numBlocksY * lwSize.y * this->batchsize;
+ size_t numWIY = numBlocksY * lwSize.y * this->plan->batchsize;
// fft_DataDim has one more dimension than the actual fft data, which is devoted to batch.
// dim from 2 to fft_DataDim - 2 are lined up along the Y axis
- for(int i = 2; i < parameters.fft_DataDim - 1; i++)
+ for(int i = 2; i < this->signature.fft_DataDim - 1; i++)
{
- numWIY *= parameters.fft_N[i];
+ numWIY *= this->signature.fft_N[i];
}
diff --git a/src/library/generator.transpose.vliw.cpp b/src/library/generator.transpose.vliw.cpp
index f715c7d..b075314 100644
--- a/src/library/generator.transpose.vliw.cpp
+++ b/src/library/generator.transpose.vliw.cpp
@@ -23,6 +23,75 @@
#include "stdafx.h"
#include <math.h>
#include "generator.transpose.vliw.h"
+#include "action.h"
+
+
+FFTGeneratedTransposeVLIWAction::FFTGeneratedTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+ : FFTTransposeVLIWAction(plHandle, plan, queue, err)
+{
+ if (err != CLFFT_SUCCESS)
+ {
+ // FFTTransposeVLIWAction() failed, exit
+ fprintf(stderr, "FFTTransposeVLIWAction() failed!\n");
+ return;
+ }
+
+ // Initialize the FFTAction::FFTKernelGenKeyParams member
+ err = this->initParams();
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedTransposeVLIWAction::initParams() failed!\n");
+ return;
+ }
+
+ FFTRepo &fftRepo = FFTRepo::getInstance();
+
+ err = this->generateKernel(fftRepo, queue);
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedTransposeVLIWAction::generateKernel failed\n");
+ return;
+ }
+
+ err = compileKernels( queue, plHandle, plan);
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedTransposeVLIWAction::compileKernels failed\n");
+ return;
+ }
+
+ err = CLFFT_SUCCESS;
+}
+
+
+bool FFTGeneratedTransposeVLIWAction::buildForwardKernel()
+{
+ clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+ clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+ bool r2c_transform = (inputLayout == CLFFT_REAL);
+ bool c2r_transform = (outputLayout == CLFFT_REAL);
+ bool real_transform = (r2c_transform || c2r_transform);
+
+ return (!real_transform) || r2c_transform;
+}
+
+bool FFTGeneratedTransposeVLIWAction::buildBackwardKernel()
+{
+ clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+ clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+ bool r2c_transform = (inputLayout == CLFFT_REAL);
+ bool c2r_transform = (outputLayout == CLFFT_REAL);
+ bool real_transform = (r2c_transform || c2r_transform);
+
+ return (!real_transform) || c2r_transform;
+}
+
+
#define QUOTEMARK(x) #x
@@ -75,7 +144,7 @@ typedef enum inputoutputflag_
ENDTRANSIO
} transio;
-static clfftStatus GenerateTransposeKernel (FFTKernelGenKeyParams & params,
+static clfftStatus GenerateTransposeKernel (FFTGeneratedTransposeVLIWAction::Signature & params,
std::string & kernel)
{
kernel.reserve (8000);
@@ -736,84 +805,78 @@ static clfftStatus GenerateTransposeKernel (FFTKernelGenKeyParams & params,
return CLFFT_SUCCESS;
}
-template<>
-clfftStatus FFTPlan::GetKernelGenKeyPvt<Transpose_VLIW> (FFTKernelGenKeyParams & params) const
+clfftStatus FFTGeneratedTransposeVLIWAction::initParams ()
{
// Query the devices in this context for their local memory sizes
// How we generate a kernel depends on the *minimum* LDS size for all devices.
//
const FFTEnvelope * pEnvelope = NULL;
- OPENCL_V(const_cast<FFTPlan*>(this)->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
+ OPENCL_V(this->plan->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
BUG_CHECK (NULL != pEnvelope);
- ::memset( ¶ms, 0, sizeof( params ) );
- params.fft_precision = this->precision;
- params.fft_placeness = this->placeness;
- params.fft_inputLayout = this->inputLayout;
- ARG_CHECK (this->inStride.size() == this->outStride.size())
+ this->signature.fft_precision = this->plan->precision;
+ this->signature.fft_placeness = this->plan->placeness;
+ this->signature.fft_inputLayout = this->plan->inputLayout;
+
+ ARG_CHECK (this->plan->inStride.size() == this->plan->outStride.size())
- if (CLFFT_INPLACE == this->placeness) {
+ if (CLFFT_INPLACE == this->plan->placeness) {
// If this is an in-place transform the
// input and output layout, dimensions and strides
// *MUST* be the same.
//
- ARG_CHECK (this->inputLayout == this->outputLayout)
- params.fft_outputLayout = this->inputLayout;
- for (size_t u = this->inStride.size(); u-- > 0; ) {
- ARG_CHECK (this->inStride[u] == this->outStride[u]);
+ ARG_CHECK (this->plan->inputLayout == this->plan->outputLayout)
+ this->signature.fft_outputLayout = this->plan->inputLayout;
+ for (size_t u = this->plan->inStride.size(); u-- > 0; ) {
+ ARG_CHECK (this->plan->inStride[u] == this->plan->outStride[u]);
}
} else {
- params.fft_outputLayout = this->outputLayout;
+ this->signature.fft_outputLayout = this->plan->outputLayout;
}
//we only support 2D transpose
- switch (this->inStride.size()) {
+ switch (this->plan->inStride.size()) {
// 2-D array is a 3-D data structure
// 2-D unit is a speical case of 2-D array.
case 2:
- ARG_CHECK(this->length .size() > 1);
- ARG_CHECK(this->outStride.size() > 1);
- params.fft_DataDim = 3;
- params.fft_N[0] = this->length[0];
- params.fft_N[1] = this->length[1];
- params.fft_inStride[0] = this->inStride[0];
- params.fft_inStride[1] = this->inStride[1];
- params.fft_inStride[2] = this->iDist;
- params.fft_outStride[0] = this->outStride[0];
- params.fft_outStride[1] = this->outStride[1];
- params.fft_outStride[2] = this->oDist;
+ ARG_CHECK(this->plan->length .size() > 1);
+ ARG_CHECK(this->plan->outStride.size() > 1);
+ this->signature.fft_DataDim = 3;
+ this->signature.fft_N[0] = this->plan->length[0];
+ this->signature.fft_N[1] = this->plan->length[1];
+ this->signature.fft_inStride[0] = this->plan->inStride[0];
+ this->signature.fft_inStride[1] = this->plan->inStride[1];
+ this->signature.fft_inStride[2] = this->plan->iDist;
+ this->signature.fft_outStride[0] = this->plan->outStride[0];
+ this->signature.fft_outStride[1] = this->plan->outStride[1];
+ this->signature.fft_outStride[2] = this->plan->oDist;
break;
default:
ARG_CHECK (false);
}
//ToDO: work group size setup
- params.fft_R = 32; // divide the element into 32x32 blocks
- params.fft_SIMD = 64; //work group size
+ this->signature.fft_R = 32; // divide the element into 32x32 blocks
+ this->signature.fft_SIMD = 64; //work group size
return CLFFT_SUCCESS;
}
-template<>
-clfftStatus FFTPlan::GetWorkSizesPvt<Transpose_VLIW> (std::vector<size_t> & globalWS, std::vector<size_t> & localWS) const
+clfftStatus FFTGeneratedTransposeVLIWAction::getWorkSizes (std::vector<size_t> & globalWS, std::vector<size_t> & localWS)
{
- // How many numbers per workitem in the generated kernel?
- FFTKernelGenKeyParams fftParams;
- // Translate the user plan into the structure that we use to map plans to clPrograms
- OPENCL_V( this->GetKernelGenKeyPvt<Transpose_VLIW>( fftParams ), _T("GetKernelGenKey() failed!") );
unsigned long long count, count0, count1;
- count0 = DivRoundingUp<unsigned long long> (this->length[0], fftParams.fft_R);
- count1 = DivRoundingUp<unsigned long long> (this->length[1], fftParams.fft_R);
+ count0 = DivRoundingUp<unsigned long long> (this->plan->length[0], this->signature.fft_R);
+ count1 = DivRoundingUp<unsigned long long> (this->plan->length[1], this->signature.fft_R);
count = (count0>count1) ? count0 : count1;
count = (count * (count+1)) /2;
- count *= fftParams.fft_SIMD;
- count *= this->batchsize;
+ count *= this->signature.fft_SIMD;
+ count *= this->plan->batchsize;
globalWS.push_back( static_cast< size_t >( count ) );
- localWS.push_back( fftParams.fft_SIMD );
+ localWS.push_back( this->signature.fft_SIMD );
return CLFFT_SUCCESS;
}
@@ -821,14 +884,11 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Transpose_VLIW> (std::vector<size_t> & glob
// OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
// Feed this generator the FFTPlan, and it returns the generated program as a string
-template<>
-clfftStatus FFTPlan::GenerateKernelPvt<Transpose_VLIW> ( FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const
+clfftStatus FFTGeneratedTransposeVLIWAction::generateKernel ( FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
{
- FFTKernelGenKeyParams params;
- OPENCL_V( this->GetKernelGenKeyPvt<Transpose_VLIW> (params), _T("GetKernelGenKey() failed!") );
std::string programCode;
- OPENCL_V( GenerateTransposeKernel( params, programCode ), _T( "GenerateTransposeKernel() failed!" ) );
+ OPENCL_V( GenerateTransposeKernel( this->signature, programCode ), _T( "GenerateTransposeKernel() failed!" ) );
cl_int status = CL_SUCCESS;
cl_device_id Device = NULL;
@@ -839,8 +899,8 @@ clfftStatus FFTPlan::GenerateKernelPvt<Transpose_VLIW> ( FFTRepo& fftRepo, const
status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
- OPENCL_V( fftRepo.setProgramCode( Transpose_VLIW, params, programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
- OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_VLIW, params, "fft_trans", "fft_trans", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
+ OPENCL_V( fftRepo.setProgramCode( Transpose_VLIW, this->getSignatureData(), programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
+ OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_VLIW, this->getSignatureData(), "fft_trans", "fft_trans", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
return CLFFT_SUCCESS;
}
diff --git a/src/library/lifetime.cpp b/src/library/lifetime.cpp
index 7548e9a..f53f670 100644
--- a/src/library/lifetime.cpp
+++ b/src/library/lifetime.cpp
@@ -34,6 +34,8 @@ clfftStatus clfftSetup( const clfftSetupData* sData )
// First invocation of this function will allocate the FFTRepo singleton; thereafter the object always exists
FFTRepo& fftRepo = FFTRepo::getInstance( );
+ clfftInitBinaryCache();
+
// Discover and load the timer module if present
fftRepo.timerHandle = LoadSharedLibrary( "lib", "StatTimer", true );
if( fftRepo.timerHandle )
diff --git a/src/library/mainpage.h b/src/library/mainpage.h
index 70e014c..5256a65 100644
--- a/src/library/mainpage.h
+++ b/src/library/mainpage.h
@@ -35,11 +35,11 @@ components as a pair contiguous in memory) formats.
@section InstallFFT Installation of clFFT library
@subsection DownBinaries Downloadable Binaries
-AMD provides clFFT library pre-compiled packages for recent versions of Microsoft Windows operating systems
-and several flavors of Linux.
+clFFT library pre-compiled packages for recent versions of Microsoft Windows operating systems
+and several flavors of Linux are available.
-The downloadable binary packages are freely available from AMD at
-http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-math-libraries/
+The downloadable binary packages are freely available at
+https://github.com/clMathLibraries/clFFT/releases
Once the appropriate package for the respective OS has finished downloading,
uncompress the package using the native tools available on the platform in a
diff --git a/src/library/md5sum.c b/src/library/md5sum.c
new file mode 100644
index 0000000..d5f9417
--- /dev/null
+++ b/src/library/md5sum.c
@@ -0,0 +1,312 @@
+/*
+ * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
+ * MD5 Message-Digest Algorithm (RFC 1321).
+ *
+ * Homepage:
+ * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
+ *
+ * Author:
+ * Alexander Peslyak, better known as Solar Designer <solar at openwall.com>
+ *
+ * This software was written by Alexander Peslyak in 2001. No copyright is
+ * claimed, and the software is hereby placed in the public domain.
+ * In case this attempt to disclaim copyright and place the software in the
+ * public domain is deemed null and void, then the software is
+ * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
+ * general public under the following terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted.
+ *
+ * There's ABSOLUTELY NO WARRANTY, express or implied.
+ *
+ * (This is a heavily cut-down "BSD license".)
+ *
+ * This differs from Colin Plumb's older public domain implementation in that
+ * no exactly 32-bit integer data type is required (any 32-bit or wider
+ * unsigned integer data type will do), there's no compile-time endianness
+ * configuration, and the function prototypes match OpenSSL's. No code from
+ * Colin Plumb's implementation has been reused; this comment merely compares
+ * the properties of the two independent implementations.
+ *
+ * The primary goals of this implementation are portability and ease of use.
+ * It is meant to be fast, but not as fast as possible. Some known
+ * optimizations are not included to reduce source code size and avoid
+ * compile-time configuration.
+ */
+
+#ifndef HAVE_OPENSSL
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "md5sum.h"
+
+/*
+ * The basic MD5 functions.
+ *
+ * F and G are optimized compared to their RFC 1321 definitions for
+ * architectures that lack an AND-NOT instruction, just like in Colin Plumb's
+ * implementation.
+ */
+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y))))
+#define H(x, y, z) (((x) ^ (y)) ^ (z))
+#define H2(x, y, z) ((x) ^ ((y) ^ (z)))
+#define I(x, y, z) ((y) ^ ((x) | ~(z)))
+
+/*
+ * The MD5 transformation for all four rounds.
+ */
+#define STEP(f, a, b, c, d, x, t, s) \
+ (a) += f((b), (c), (d)) + (x) + (t); \
+ (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \
+ (a) += (b);
+
+/*
+ * SET reads 4 input bytes in little-endian byte order and stores them
+ * in a properly aligned word in host byte order.
+ *
+ * The check for little-endian architectures that tolerate unaligned
+ * memory accesses is just an optimization. Nothing will break if it
+ * doesn't work.
+ */
+#if defined(__i386__) || defined(__x86_64__) || defined(__vax__)
+#define SET(n) \
+ (*(MD5_u32plus *)&ptr[(n) * 4])
+#define GET(n) \
+ SET(n)
+#else
+#define SET(n) \
+ (ctx->block[(n)] = \
+ (MD5_u32plus)ptr[(n) * 4] | \
+ ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \
+ ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \
+ ((MD5_u32plus)ptr[(n) * 4 + 3] << 24))
+#define GET(n) \
+ (ctx->block[(n)])
+#endif
+
+/*
+ * This processes one or more 64-byte data blocks, but does NOT update
+ * the bit counters. There are no alignment requirements.
+ */
+static const void *body(MD5_CTX *ctx, const void *data, unsigned long size)
+{
+ const unsigned char *ptr;
+ MD5_u32plus a, b, c, d;
+ MD5_u32plus saved_a, saved_b, saved_c, saved_d;
+
+ ptr = (const unsigned char *)data;
+
+ a = ctx->a;
+ b = ctx->b;
+ c = ctx->c;
+ d = ctx->d;
+
+ do {
+ saved_a = a;
+ saved_b = b;
+ saved_c = c;
+ saved_d = d;
+
+/* Round 1 */
+ STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7)
+ STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12)
+ STEP(F, c, d, a, b, SET(2), 0x242070db, 17)
+ STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22)
+ STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7)
+ STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12)
+ STEP(F, c, d, a, b, SET(6), 0xa8304613, 17)
+ STEP(F, b, c, d, a, SET(7), 0xfd469501, 22)
+ STEP(F, a, b, c, d, SET(8), 0x698098d8, 7)
+ STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12)
+ STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17)
+ STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22)
+ STEP(F, a, b, c, d, SET(12), 0x6b901122, 7)
+ STEP(F, d, a, b, c, SET(13), 0xfd987193, 12)
+ STEP(F, c, d, a, b, SET(14), 0xa679438e, 17)
+ STEP(F, b, c, d, a, SET(15), 0x49b40821, 22)
+
+/* Round 2 */
+ STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5)
+ STEP(G, d, a, b, c, GET(6), 0xc040b340, 9)
+ STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14)
+ STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20)
+ STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5)
+ STEP(G, d, a, b, c, GET(10), 0x02441453, 9)
+ STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14)
+ STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20)
+ STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5)
+ STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9)
+ STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14)
+ STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20)
+ STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5)
+ STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9)
+ STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14)
+ STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20)
+
+/* Round 3 */
+ STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4)
+ STEP(H2, d, a, b, c, GET(8), 0x8771f681, 11)
+ STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16)
+ STEP(H2, b, c, d, a, GET(14), 0xfde5380c, 23)
+ STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4)
+ STEP(H2, d, a, b, c, GET(4), 0x4bdecfa9, 11)
+ STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16)
+ STEP(H2, b, c, d, a, GET(10), 0xbebfbc70, 23)
+ STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4)
+ STEP(H2, d, a, b, c, GET(0), 0xeaa127fa, 11)
+ STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16)
+ STEP(H2, b, c, d, a, GET(6), 0x04881d05, 23)
+ STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4)
+ STEP(H2, d, a, b, c, GET(12), 0xe6db99e5, 11)
+ STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16)
+ STEP(H2, b, c, d, a, GET(2), 0xc4ac5665, 23)
+
+/* Round 4 */
+ STEP(I, a, b, c, d, GET(0), 0xf4292244, 6)
+ STEP(I, d, a, b, c, GET(7), 0x432aff97, 10)
+ STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15)
+ STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21)
+ STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6)
+ STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10)
+ STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15)
+ STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21)
+ STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6)
+ STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10)
+ STEP(I, c, d, a, b, GET(6), 0xa3014314, 15)
+ STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21)
+ STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6)
+ STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10)
+ STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15)
+ STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21)
+
+ a += saved_a;
+ b += saved_b;
+ c += saved_c;
+ d += saved_d;
+
+ ptr += 64;
+ } while (size -= 64);
+
+ ctx->a = a;
+ ctx->b = b;
+ ctx->c = c;
+ ctx->d = d;
+
+ return ptr;
+}
+
+void MD5_Init(MD5_CTX *ctx)
+{
+ ctx->a = 0x67452301;
+ ctx->b = 0xefcdab89;
+ ctx->c = 0x98badcfe;
+ ctx->d = 0x10325476;
+
+ ctx->lo = 0;
+ ctx->hi = 0;
+}
+
+void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size)
+{
+ MD5_u32plus saved_lo;
+ unsigned long used, available;
+
+ saved_lo = ctx->lo;
+ if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo)
+ ctx->hi++;
+ ctx->hi += size >> 29;
+
+ used = saved_lo & 0x3f;
+
+ if (used) {
+ available = 64 - used;
+
+ if (size < available) {
+ memcpy(&ctx->buffer[used], data, size);
+ return;
+ }
+
+ memcpy(&ctx->buffer[used], data, available);
+ data = (const unsigned char *)data + available;
+ size -= available;
+ body(ctx, ctx->buffer, 64);
+ }
+
+ if (size >= 64) {
+ data = body(ctx, data, size & ~(unsigned long)0x3f);
+ size &= 0x3f;
+ }
+
+ memcpy(ctx->buffer, data, size);
+}
+
+void MD5_Final(unsigned char *result, MD5_CTX *ctx)
+{
+ unsigned long used, available;
+
+ used = ctx->lo & 0x3f;
+
+ ctx->buffer[used++] = 0x80;
+
+ available = 64 - used;
+
+ if (available < 8) {
+ memset(&ctx->buffer[used], 0, available);
+ body(ctx, ctx->buffer, 64);
+ used = 0;
+ available = 64;
+ }
+
+ memset(&ctx->buffer[used], 0, available - 8);
+
+ ctx->lo <<= 3;
+ ctx->buffer[56] = ctx->lo;
+ ctx->buffer[57] = ctx->lo >> 8;
+ ctx->buffer[58] = ctx->lo >> 16;
+ ctx->buffer[59] = ctx->lo >> 24;
+ ctx->buffer[60] = ctx->hi;
+ ctx->buffer[61] = ctx->hi >> 8;
+ ctx->buffer[62] = ctx->hi >> 16;
+ ctx->buffer[63] = ctx->hi >> 24;
+
+ body(ctx, ctx->buffer, 64);
+
+ result[0] = ctx->a;
+ result[1] = ctx->a >> 8;
+ result[2] = ctx->a >> 16;
+ result[3] = ctx->a >> 24;
+ result[4] = ctx->b;
+ result[5] = ctx->b >> 8;
+ result[6] = ctx->b >> 16;
+ result[7] = ctx->b >> 24;
+ result[8] = ctx->c;
+ result[9] = ctx->c >> 8;
+ result[10] = ctx->c >> 16;
+ result[11] = ctx->c >> 24;
+ result[12] = ctx->d;
+ result[13] = ctx->d >> 8;
+ result[14] = ctx->d >> 16;
+ result[15] = ctx->d >> 24;
+
+ memset(ctx, 0, sizeof(*ctx));
+}
+
+#endif
+
+void md5sum(const void * data, unsigned long size, char * md5string)
+{
+ unsigned char digest[16];
+ int i;
+ MD5_CTX context;
+ MD5_Init(&context);
+ MD5_Update(&context, data, size);
+ MD5_Final(digest, &context);
+
+ for(i = 0; i < 16; ++i)
+ sprintf(&md5string[i*2], "%02x", (unsigned int)digest[i]);
+}
+
diff --git a/src/library/md5sum.h b/src/library/md5sum.h
new file mode 100644
index 0000000..2f3d739
--- /dev/null
+++ b/src/library/md5sum.h
@@ -0,0 +1,50 @@
+/*
+ * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
+ * MD5 Message-Digest Algorithm (RFC 1321).
+ *
+ * Homepage:
+ * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
+ *
+ * Author:
+ * Alexander Peslyak, better known as Solar Designer <solar at openwall.com>
+ *
+ * This software was written by Alexander Peslyak in 2001. No copyright is
+ * claimed, and the software is hereby placed in the public domain.
+ * In case this attempt to disclaim copyright and place the software in the
+ * public domain is deemed null and void, then the software is
+ * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
+ * general public under the following terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted.
+ *
+ * There's ABSOLUTELY NO WARRANTY, express or implied.
+ *
+ * See md5.c for more information.
+ */
+
+#ifdef HAVE_OPENSSL
+#include <openssl/md5.h>
+#elif !defined(_MD5_H)
+#define _MD5_H
+
+/* Any 32-bit or wider unsigned integer data type will do */
+typedef unsigned int MD5_u32plus;
+
+typedef struct {
+ MD5_u32plus lo, hi;
+ MD5_u32plus a, b, c, d;
+ unsigned char buffer[64];
+ MD5_u32plus block[16];
+} MD5_CTX;
+
+extern void MD5_Init(MD5_CTX *ctx);
+extern void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size);
+extern void MD5_Final(unsigned char *result, MD5_CTX *ctx);
+
+void md5sum (const void * data, unsigned long size, char * md5sum);
+
+
+
+#endif
+
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 60389ad..1cfe054 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -26,6 +26,8 @@
#include "plan.h"
#include "generator.stockham.h"
#include "../include/convenienceFunctions.h"
+#include "action.h"
+#include "fft_binary_lookup.h"
using std::vector;
@@ -33,16 +35,6 @@ const std::string beginning_of_binary( "<[
const std::string end_of_binary( "<[�_I_may_be_a_sorry_case,_but_I_don't_write_jokes_in_base_13_�]>" );
const std::string end_of_file( "<[�_You're_off_the_edge_of_the_map,_mate._Here_there_be_monsters_�]>" );
-// This operator is used to sort FFTKernelGenKeyParams structs inside of a std::map
-bool operator<( const FFTKernelGenKeyParams& lhs, const FFTKernelGenKeyParams& rhs)
-{
- int ret = ::memcmp( &lhs, &rhs, sizeof( FFTKernelGenKeyParams ) );
-
- if( ret < 0 )
- return true;
-
- return false;
-}
// Returns CLFFT_SUCCESS if the fp64 is present, CLFFT_DEVICE_NO_DOUBLE if it is not found.
clfftStatus checkDevExt( std::string ext, const cl_device_id &device )
@@ -213,128 +205,84 @@ clfftStatus clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context contex
break;
}
+ fftPlan->plHandle = *plHandle;
+
return CLFFT_SUCCESS;
}
-
-// **************** TODO TODO TODO ***********************
-// Making CompileKernels function take in command queue parameter so we can build for 1 particular device only;
-// this may not be desirable for persistent plans, where we may have to compile for all devices in the context;
-// make changes appropriately before enabling persistent plans and then remove this comment
-
-// Compile the kernels that this plan uses, and store into the plan
-clfftStatus CompileKernels( const cl_command_queue commQueueFFT, const clfftPlanHandle plHandle, const clfftGenerators gen, FFTPlan* fftPlan )
+std::string getKernelName(const clfftGenerators gen, const clfftPlanHandle plHandle, bool withPlHandle)
{
- cl_int status = 0;
- size_t deviceListSize = 0;
-
- FFTRepo& fftRepo = FFTRepo::getInstance( );
-
-
- // create a cl program executable for the device associated with command queue
- // Get the device
- cl_device_id &q_device = fftPlan->bakeDevice;
- //clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &q_device, NULL);
-
- FFTKernelGenKeyParams fftParams;
- OPENCL_V( fftPlan->GetKernelGenKey( fftParams ), _T("GetKernelGenKey() failed!") );
+ // Logic to define a sensible filename
+ const std::string kernelPrefix( "clfft.kernel." );
+ std::string generatorName;
+ std::stringstream kernelPath;
+
+ switch( gen )
+ {
+ case Stockham: generatorName = "Stockham"; break;
+ case Transpose_GCN: generatorName = "Transpose"; break;
+ case Transpose_VLIW: generatorName = "Transpose"; break;
+ case Copy: generatorName = "Copy"; break;
+ }
- cl_program program;
- if( fftRepo.getclProgram( gen, fftParams, program, q_device, fftPlan->context ) == CLFFT_INVALID_PROGRAM )
- {
+ kernelPath << kernelPrefix << generatorName ;
- std::string programCode;
- OPENCL_V( fftRepo.getProgramCode( gen, fftParams, programCode, q_device, fftPlan->context ), _T( "fftRepo.getProgramCode failed." ) );
+ if (withPlHandle)
+ kernelPath << plHandle;
- const char* source = programCode.c_str();
- program = clCreateProgramWithSource( fftPlan->context, 1, &source, NULL, &status );
- OPENCL_V( status, _T( "clCreateProgramWithSource failed." ) );
+ kernelPath << ".cl";
- // create a cl program executable for the device associated with command queue
+ return kernelPath.str();
+}
-#if defined(DEBUGGING)
- status = clBuildProgram( program, 1, &q_device, "-g -cl-opt-disable", NULL, NULL); // good for debugging kernels
-// if you have trouble creating smbols that GDB can pick up to set a breakpoint after kernels are loaded into memory
-// this can be used to stop execution to allow you to set a breakpoint in a kernel after kernel symbols are in memory.
-#ifdef DEBUG_BREAK_GDB
- __debugbreak();
-#endif
-#else
- status = clBuildProgram( program, 1, &q_device, NULL, NULL, NULL);
-#endif
- if( status != CL_SUCCESS )
+clfftStatus selectAction(FFTPlan * fftPlan, FFTAction *& action, cl_command_queue* commQueueFFT)
+{
+ // set the action we are baking a leaf
+ clfftStatus err;
+ switch (fftPlan->gen)
+ {
+ case Stockham:
{
- if( status == CL_BUILD_PROGRAM_FAILURE )
- {
- size_t buildLogSize = 0;
- OPENCL_V( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
- _T( "clGetProgramBuildInfo failed" ) );
-
- vector< char > buildLog( buildLogSize );
- ::memset( &buildLog[ 0 ], 0x0, buildLogSize );
-
- OPENCL_V( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
- _T( "clGetProgramBuildInfo failed" ) );
-
- std::cerr << "\n\t\t\tBUILD LOG\n";
- std::cerr << "************************************************\n";
- std::cerr << &buildLog[ 0 ] << std::endl;
- std::cerr << "************************************************\n";
- }
-
- OPENCL_V( status, _T( "clBuildProgram failed" ) );
+ // Instantiate the default stockham generator
+ action = new FFTGeneratedStockhamAction (fftPlan->plHandle, fftPlan, *commQueueFFT, err);
+ OPENCL_V( err, "FFTGeneratedStockhamAction() failed");
}
+ break;
- fftRepo.setclProgram( gen, fftParams, program, q_device, fftPlan->context );
-
- // For real transforms we comppile either forward or backward kernel
- bool r2c_transform = (fftParams.fft_inputLayout == CLFFT_REAL);
- bool c2r_transform = (fftParams.fft_outputLayout == CLFFT_REAL);
- bool h2c = (gen == Copy) && ((fftParams.fft_inputLayout == CLFFT_HERMITIAN_PLANAR) || (fftParams.fft_inputLayout == CLFFT_HERMITIAN_INTERLEAVED));
- bool c2h = (gen == Copy) && ((fftParams.fft_outputLayout == CLFFT_HERMITIAN_PLANAR) || (fftParams.fft_outputLayout == CLFFT_HERMITIAN_INTERLEAVED));
- bool generalCopy = !(h2c || c2h) && (gen == Copy);
- bool complexTransform = ( !(r2c_transform || c2r_transform) && (gen != Copy) );
-
- // get a kernel object handle for a kernel with the given name
- cl_kernel kernel;
- if( complexTransform || r2c_transform || c2h || generalCopy)
+ case Transpose_GCN:
{
- if( fftRepo.getclKernel( program, CLFFT_FORWARD, kernel ) == CLFFT_INVALID_KERNEL )
- {
- std::string entryPoint;
- OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_FORWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
-
- kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
- OPENCL_V( status, _T( "clCreateKernel failed" ) );
-
- fftRepo.setclKernel( program, CLFFT_FORWARD, kernel );
- }
+ action = new FFTGeneratedTransposeGCNAction(fftPlan->plHandle, fftPlan, *commQueueFFT, err);
+ OPENCL_V( err, "FFTGeneratedTransposeGCNAction() failed");
}
+ break;
- if( complexTransform || c2r_transform || h2c || generalCopy)
+ case Transpose_VLIW:
{
- if( fftRepo.getclKernel( program, CLFFT_BACKWARD, kernel ) == CLFFT_INVALID_KERNEL )
- {
- std::string entryPoint;
- OPENCL_V( fftRepo.getProgramEntryPoint( gen, fftParams, CLFFT_BACKWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
-
- kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
- OPENCL_V( status, _T( "clCreateKernel failed" ) );
+ action = new FFTGeneratedTransposeVLIWAction(fftPlan->plHandle, fftPlan, *commQueueFFT, err);
+ OPENCL_V( err, "FFTGeneratedTransposeVLIWAction() failed");
+ }
+ break;
- fftRepo.setclKernel( program, CLFFT_BACKWARD, kernel );
- }
+ case Copy:
+ {
+ action = new FFTGeneratedCopyAction (fftPlan->plHandle, fftPlan, *commQueueFFT, err);
+ OPENCL_V( err, "FFTGeneratedCopyAction() failed");
}
- }
+ break;
+ default:
+ {
+ assert(false);
+ OPENCL_V( CLFFT_NOTIMPLEMENTED, "selectAction() failed");
+ }
+ }
- return CLFFT_SUCCESS;
+ return CLFFT_SUCCESS;
}
-
-
inline size_t PrecisionWidth(clfftPrecision pr)
{
switch(pr)
@@ -345,6 +293,8 @@ inline size_t PrecisionWidth(clfftPrecision pr)
}
}
+
+
clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_command_queue* commQueueFFT,
void (CL_CALLBACK *pfn_notify)( clfftPlanHandle plHandle, void *user_data ), void* user_data )
{
@@ -427,8 +377,9 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
if(fftPlan->gen == Copy)
{
- OPENCL_V( fftPlan->GenerateKernel( fftRepo, *commQueueFFT ), _T( "GenerateKernel() failed" ) );
- OPENCL_V( CompileKernels( *commQueueFFT, plHandle, fftPlan->gen, fftPlan ), _T( "CompileKernels() failed" ) );
+ clfftStatus err;
+ fftPlan->action = new FFTGeneratedCopyAction(plHandle, fftPlan, *commQueueFFT, err);
+ OPENCL_V( err, _T( "FFTGeneratedCopyAction() failed" ) );
fftPlan->baked = true;
return CLFFT_SUCCESS;
}
@@ -1559,8 +1510,9 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
//break;
if (fftPlan->transflag) //Transpose for 2D
{
- OPENCL_V( fftPlan->GenerateKernel( fftRepo, *commQueueFFT ), _T( "GenerateTransposeProgram() failed" ) );
- OPENCL_V( CompileKernels( *commQueueFFT, plHandle, fftPlan->gen, fftPlan ), _T( "CompileKernels() failed" ) );
+ clfftStatus err;
+ fftPlan->action = new FFTGeneratedTransposeVLIWAction(plHandle, fftPlan, *commQueueFFT, err);
+ OPENCL_V( err, "FFTGeneratedTransposeVLIWAction failed");
fftPlan->baked = true;
return CLFFT_SUCCESS;
@@ -2484,11 +2436,8 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
}
}
- // For the radices that we have factored, we need to load/compile and build the appropriate OpenCL kernels
- OPENCL_V( fftPlan->GenerateKernel( fftRepo, *commQueueFFT ), _T( "GenerateKernel() failed" ) );
-
- // For the radices that we have factored, we need to load/compile and build the appropriate OpenCL kernels
- OPENCL_V( CompileKernels( *commQueueFFT, plHandle, fftPlan->gen, fftPlan ), _T( "CompileKernels() failed" ) );
+
+ clfftStatus err = selectAction(fftPlan, fftPlan->action, commQueueFFT);
// Allocate resources
OPENCL_V( fftPlan->AllocateBuffers (), _T("AllocateBuffers() failed"));
@@ -2560,428 +2509,6 @@ clfftStatus FFTPlan::ConstructAndEnqueueConstantBuffers( cl_command_queue* commQ
return CLFFT_SUCCESS;
}
-//TODO caching kernel binaries for later reload
-#if 0
-typedef std::pair<std::string, clfftPlanHandle> plan_tree_node_t;
-typedef std::vector< std::pair<std::string, clfftPlanHandle> > plan_tree_t;
-
-void make_plan_tree( plan_tree_t & tree, std::string name, clfftPlanHandle handle )
-{
- tree.push_back( plan_tree_node_t(name, handle) );
-
- FFTPlan* plan = NULL;
- FFTRepo& repo = FFTRepo::getInstance();
- lockRAII* lock = NULL;
- clfftStatus status = repo.getPlan( handle, plan, lock );
- if( status != CLFFT_SUCCESS )
- {
- throw( "make_plan_tree failure: repo.getPlan" );
- }
-
- if( plan->planX )
- {
- std::string subplan(name);
- subplan += "X";
- make_plan_tree(tree, subplan, plan->planX );
- }
-
- if( plan->planY )
- {
- std::string subplan(name);
- subplan += "Y";
- make_plan_tree(tree, subplan, plan->planY );
- }
-
- if( plan->planZ )
- {
- std::string subplan(name);
- subplan += "Z";
- make_plan_tree(tree, subplan, plan->planZ );
- }
-}
-
-clfftStatus clfftWritePlanToDisk( clfftPlanHandle plan_handle, const char* filename )
-{
- plan_tree_t plan_tree;
- make_plan_tree( plan_tree, "plan", plan_handle );
-
- std::ofstream planfile;
- planfile.open(filename, std::ios::binary);
-
- while( !plan_tree.empty() )
- {
- plan_tree_node_t node( *plan_tree.begin() );
- plan_tree.erase( plan_tree.begin() );
-
- FFTPlan* plan = NULL;
- FFTRepo& repo = FFTRepo::getInstance();
- lockRAII* lock = NULL;
- OPENCL_V(repo.getPlan( node.second, plan, lock ), _T("getPlan failure"));
-
- // print the name of the node (plan, planX, planXX, planY, plan XY, etc)
- planfile << node.first << " ";
-
- planfile << "dimensions " << plan->dim << " " << plan->length.size();
- // dimensions must be listed first because clfftReadPlanFromDisk
- // will need to use dimensions for reading in strides and such
-
- for( int i = 0; i < plan->length.size(); ++i )
- {
- planfile << " " << plan->length[i];
- }
-
- planfile << " batch " << plan->batchsize;
-
- planfile << " instride " << plan->inStride.size();
- for( int i = 0; i < plan->inStride.size(); ++i )
- {
- planfile << " " << plan->inStride[i];
- }
-
- planfile << " outstride " << plan->outStride.size();
- for( int i = 0; i < plan->outStride.size(); ++i )
- {
- planfile << " " << plan->outStride[i];
- }
-
- planfile << " in-out-distances " << plan->iDist << " " << plan->oDist;
- planfile << " in-out-layouts " << plan->inputLayout << " " << plan->outputLayout;
- planfile << " resultlocation " << plan->placeness;
- planfile << " precision " << plan->precision;
- planfile << " forwardscale " << float_as_hex<double>(plan->forwardScale);
- planfile << " backwardscale " << float_as_hex<double>(plan->backwardScale);
- // we need to stash scales as hex so that we don't have any roundoff error
- // clfftReadPlanFromDisk will read the hex back in as float
-
- planfile << " gen " << plan->gen;
- planfile << " tmpBufSize " << plan->tmpBufSize;
- planfile << " large1D " << plan->large1D;
- planfile << " large2D " << plan->large2D;
-
- if( plan->baked == true )
- {
- planfile << " number-of-devices " << plan->number_of_devices;
-
- if( plan->number_of_devices > 0 )
- {
- planfile << " binary-sizes";
- for( int i = 0; i < plan->number_of_devices; i++ )
- {
- planfile << " " << *(plan->binary_sizes.get() + i);
- }
-
- planfile << " binaries ";
- for( int i = 0; i < plan->number_of_devices; i++ )
- {
- planfile << beginning_of_binary;
- planfile.write( plan->binaries[i].get(), plan->binary_sizes[i] );
- planfile << end_of_binary;
- }
- }
- else
- {
- planfile << " ";
- }
- }
- }
-
- planfile << " " << end_of_file;
- planfile.close();
- return CLFFT_SUCCESS;
-}
-
-void FFTPlan::ResetBinarySizes()
-{
- binary_sizes.reset(new size_t[number_of_devices]);
-}
-
-void FFTPlan::ResetBinaries()
-{
- binaries.clear();
- for( int i = 0; i < number_of_devices; i++ )
- {
- binaries.push_back( std::unique_ptr<char[]>(new char[binary_sizes[i]] ) );
- }
-}
-
-std::string pop_next_word( std::string & str )
-{
- size_t next_space = str.find_first_of(' ');
-
- std::string next_word( str.substr( 0, next_space ) );
- str.erase( 0, next_space+1 ); // we need the extra +1 to munch off the space
-
- return next_word;
-}
-
-int my_string_to_int( std::string str )
-{
- int i;
- std::stringstream string_to_int( str );
- string_to_int >> i;
- return i;
-}
-
-bool start_of_a_plan( std::string word )
-{
- if( word.substr(0,4) == "plan" )
- return true;
- else
- return false;
-}
-
-clfftStatus clfftReadPlanFromDisk( clfftPlanHandle plan_handle, const char* filename )
-{
- plan_tree_t tree;
-
- FFTPlan* plan = NULL;
- FFTRepo& repo = FFTRepo::getInstance();
- lockRAII* lock = NULL;
- OPENCL_V(repo.getPlan( plan_handle, plan, lock ), _T("getPlan failure"));
-
- std::ifstream planfile;
- planfile.open(filename, std::ios::in | std::ios::binary);
-
- unsigned int dimensions = 0;
- std::string next_word;
-
- while( planfile >> next_word )
- {
- if( start_of_a_plan( next_word ) )
- {
- if( next_word.length() > 4 )
- // if true, this is not a base plan
- {
- clfftDim temp_dimension = CLFFT_1D;
- size_t temp_lengths[3] = {1,1,1};
-
- // let's create the plan to represent the child plan
- clfftPlanHandle child_plan;
- OPENCL_V(clfftCreateDefaultPlan( &child_plan, plan->context, temp_dimension, temp_lengths ),
- "clfftReadPlanFromDisk(): error calling clfftCreateDefaultPlan()");
-
- tree.push_back( plan_tree_node_t( next_word, child_plan ) );
-
- // we need to update the planX, Y, or Z pointer to point at the child plan
- char child_plan_name = next_word.rbegin()[0]; // this tells us if this is planX, Y, or Z
- next_word.erase( next_word.end()-1 ); // this tells us the parent plan
- std::string parent_plan_name = next_word;
-
- clfftPlanHandle parent_plan = 0;
-
- for( int i = 0; i < tree.size(); i++ )
- {
- if( tree[i].first == parent_plan_name )
- {
- parent_plan = tree[i].second;
- }
- }
-
- plan = NULL;
- OPENCL_V(repo.getPlan( parent_plan, plan, lock ), _T("getPlan failure"));
-
- if( child_plan_name == 'X' )
- plan->planX = child_plan;
- else if( child_plan_name == 'Y' )
- plan->planY = child_plan;
- else if( child_plan_name == 'Z' )
- plan->planZ = child_plan;
- else
- OPENCL_V(CLFFT_INVALID_PLAN, "clfftReadPlanFromDisk(): could not identify child plan" );
-
- // our child plan is now the active plan
- plan = NULL;
- OPENCL_V(repo.getPlan( child_plan, plan, lock ), _T("getPlan failure"));
- plan_handle = child_plan;
- }
- else
- // if this is a base plan, we don't need to do anything fancy.
- // just add the node to the tree
- {
- tree.push_back( plan_tree_node_t( next_word, plan_handle ) );
- }
-
- plan->readFromFile = true;
- }
- else if( next_word == "dimensions" )
- {
- size_t lengths[3];
-
- // read number of dimensions
- planfile >> dimensions;
-
- // number of length values that follow (subplans have some really strange things going on,
- // so this might not always match the dimension of the transform)
- size_t number_of_lengths = 0;
- planfile >> number_of_lengths;
-
- OPENCL_V( clfftSetPlanDim(plan_handle, static_cast<clfftDim>(dimensions)), _T("clfftReadPlanFromDisk: clfftSetPlanDim") );
-
- for( unsigned int i = 0; i < number_of_lengths; ++i )
- {
- planfile >> lengths[i]; // read one dimension
-
- // We have to explicitly set the lengths instead of using clfftSetPlanLength here.
- // Because the number of values to add might be greater than the number of dimensions in plan->dimension,
- // we don't want to miss out on any super awesome numbers getting added to plan->length with clfftSetPlanLength
- if( i >= plan->length.size() ) plan->length.push_back(1);
- plan->length[i] = lengths[i];
- }
- }
- else if( next_word == "batch" )
- {
- unsigned int batch;
- planfile >> batch;
-
- OPENCL_V( clfftSetPlanBatchSize(plan_handle, batch), _T("clfftReadPlanFromDisk: clfftSetPlanBatchSize") );
- }
- else if( next_word == "instride" )
- {
- size_t strides[3];
-
- // number of stride values that follow (subplans have some really strange things going on,
- // so this might not always match the dimension of the transform)
- size_t number_of_strides = 0;
- planfile >> number_of_strides;
-
- for( unsigned int i = 0; i < number_of_strides; ++i )
- {
- planfile >> strides[i]; // read one dimension
-
- // We have to explicitly set inStride instead of using clfftSetPlanInStride here.
- // Because the number of values to add might be greater than the number of dimensions in plan->dimension,
- // we don't want to miss out on any super awesome numbers getting added to plan->inStride with clfftSetPlanInStride
- if( i >= plan->inStride.size() ) plan->inStride.push_back(1);
- plan->inStride[i] = strides[i];
- }
- }
- else if( next_word == "outstride" )
- {
- size_t strides[3];
-
- // number of stride values that follow (subplans have some really strange things going on,
- // so this might not always match the dimension of the transform)
- size_t number_of_strides = 0;
- planfile >> number_of_strides;
-
- for( unsigned int i = 0; i < number_of_strides; ++i )
- {
- planfile >> strides[i]; // read one dimension
-
- // We have to explicitly set outStride instead of using clfftSetPlanOutStride here.
- // Because the number of values to add might be greater than the number of dimensions in plan->dimension,
- // we don't want to miss out on any super awesome numbers getting added to plan->outStride with clfftSetPlanOutStride
- if( i >= plan->outStride.size() ) plan->outStride.push_back(1);
- plan->outStride[i] = strides[i];
- }
- }
- else if( next_word == "in-out-distances" )
- {
- size_t indistance, outdistance;
- planfile >> indistance >> outdistance;
-
- OPENCL_V( clfftSetPlanDistance( plan_handle, indistance, outdistance ), _T("clfftReadPlanFromDisk: clfftSetPlanDistance" ) );
- }
- else if( next_word == "in-out-layouts" )
- {
- size_t inlayout, outlayout;
- planfile >> inlayout >> outlayout;
-
- OPENCL_V( clfftSetLayout( plan_handle, static_cast<clfftLayout>(inlayout), static_cast<clfftLayout>(outlayout) ), _T("clfftReadPlanFromDisk: clfftSetLayout") );
- }
- else if( next_word == "resultlocation" )
- {
- size_t location;
- planfile >> location;
-
- OPENCL_V( clfftSetResultLocation( plan_handle, static_cast<clfftResultLocation>(location) ), _T("clfftReadPlanFromDisk: clfftSetResultLocation") );
- }
- else if( next_word == "precision" )
- {
- size_t precision;
- planfile >> precision;
-
- OPENCL_V( clfftSetPlanPrecision( plan_handle, static_cast<clfftPrecision>(precision) ), _T("clfftReadPlanFromDisk: clfftSetPlanPrecision") );
- }
- else if( next_word == "forwardscale" || next_word == "backwardscale" )
- {
- size_t scale;
- planfile >> scale;
-
- if( next_word == "forwardscale" )
- {
- OPENCL_V( clfftSetPlanScale( plan_handle, CLFFT_FORWARD, hex_as_float<float>((unsigned int)scale) ), _T("clfftReadPlanFromDisk: clfftSetPlanScale") );
- }
- else
- {
- OPENCL_V( clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, hex_as_float<float>((unsigned int)scale) ), _T("clfftReadPlanFromDisk: clfftSetPlanScale") );
- }
- }
- else if( next_word == "gen" )
- {
- int gen_read;
- planfile >> gen_read;
- plan->gen = static_cast<clfftGenerators>(gen_read);
- }
- else if( next_word == "tmpBufSize" )
- {
- planfile >> plan->tmpBufSize;
- }
- else if( next_word == "large1D" )
- {
- planfile >> plan->large1D;
- }
- else if( next_word == "large2D" )
- {
- planfile >> plan->large2D;
- }
- else if( next_word == "number-of-devices" )
- {
- planfile >> plan->number_of_devices;
- }
- else if( next_word == "binary-sizes" )
- {
- plan->ResetBinarySizes();
- for( int i = 0; i < plan->number_of_devices; i++ )
- {
- planfile >> plan->binary_sizes[i];
- }
- }
- else if( next_word == "binaries" )
- {
- plan->ResetBinaries();
-
- size_t number_of_devices = plan->number_of_devices;
-
- while( static_cast<char>(planfile.peek()) == ' ' )
- planfile.ignore();
-
- // consume the beginning of binary message. the binary will begin with the character immediately following
- std::unique_ptr<char[]> beginning_message( new char[beginning_of_binary.size()] );
- planfile.read( beginning_message.get(), beginning_of_binary.size() );
-
- for( int i = 0; i < plan->number_of_devices; i++ )
- {
- planfile.read( plan->binaries[i].get(), plan->binary_sizes[i] );
- }
-
- std::unique_ptr<char[]> end_message( new char[end_of_binary.size()] );
- planfile.read( end_message.get(), end_of_binary.size() );
- }
- else if( next_word == end_of_file )
- {
- // we're at the end of the file
- }
- else
- {
- std::cout << next_word << std::endl;
- OPENCL_V( CLFFT_INVALID_ARG_VALUE, _T("clfftReadPlanFromDisk: unrecognized parameter") );
- }
- }
-
- return CLFFT_SUCCESS;
-}
-#endif
clfftStatus clfftDestroyPlan( clfftPlanHandle* plHandle )
{
@@ -3179,47 +2706,13 @@ clfftStatus FFTPlan::ReleaseBuffers ()
return result;
}
-clfftStatus FFTPlan::GetWorkSizes (std::vector<size_t> & globalws, std::vector<size_t> & localws) const
-{
- switch(gen)
- {
- case Stockham: return GetWorkSizesPvt<Stockham>( globalws, localws );
- case Transpose_VLIW: return GetWorkSizesPvt<Transpose_VLIW>( globalws, localws );
- case Transpose_GCN: return GetWorkSizesPvt<Transpose_GCN>( globalws, localws );
- case Copy: return GetWorkSizesPvt<Copy>( globalws, localws );
- default: assert( false ); return CLFFT_NOTIMPLEMENTED;
- }
-}
-
-clfftStatus FFTPlan::GetKernelGenKey (FFTKernelGenKeyParams & params) const
-{
- switch(gen)
- {
- case Stockham: return GetKernelGenKeyPvt<Stockham>(params);
- case Transpose_VLIW: return GetKernelGenKeyPvt<Transpose_VLIW>(params);
- case Transpose_GCN: return GetKernelGenKeyPvt<Transpose_GCN>( params );
- case Copy: return GetKernelGenKeyPvt<Copy>( params );
- default: assert(false); return CLFFT_NOTIMPLEMENTED;
- }
-}
-clfftStatus FFTPlan::GenerateKernel (FFTRepo & fftRepo, const cl_command_queue commQueueFFT) const
-{
- switch(gen)
- {
- case Stockham: return GenerateKernelPvt<Stockham>(fftRepo, commQueueFFT);
- case Transpose_VLIW: return GenerateKernelPvt<Transpose_VLIW>(fftRepo, commQueueFFT);
- case Transpose_GCN: return GenerateKernelPvt<Transpose_GCN>( fftRepo, commQueueFFT );
- case Copy: return GenerateKernelPvt<Copy>( fftRepo, commQueueFFT );
- default: assert(false); return CLFFT_NOTIMPLEMENTED;
- }
-}
clfftStatus FFTPlan::GetMax1DLength (size_t *longest ) const
{
switch(gen)
{
- case Stockham: return GetMax1DLengthPvt<Stockham>(longest);
+ case Stockham: return GetMax1DLengthStockham(longest);
//No restriction for Transpose_VLIW kernel
case Transpose_VLIW: *longest = 4096; return CLFFT_SUCCESS;
case Transpose_GCN: *longest = 4096; return CLFFT_SUCCESS;
diff --git a/src/library/plan.h b/src/library/plan.h
index 56f5df4..787755c 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -23,6 +23,8 @@
#include "lock.h"
#include "generator.h"
+std::string getKernelName(const clfftGenerators gen, const clfftPlanHandle plHandle, bool withPlHandle);
+
namespace ARBITRARY {
// TODO: These arbitrary parameters should be tuned for the type of GPU
// being used. These values are probably OK for Radeon 58xx and 68xx.
@@ -168,6 +170,130 @@ struct FFTKernelGenKeyParams {
// Sorting operator for struct FFTKernelGenKeyParams, such that it can be used in a map
bool operator<( const FFTKernelGenKeyParams& lhs, const FFTKernelGenKeyParams& rhs);
+class FFTPlan;
+class FFTRepo;
+
+// Action ID
+enum FFTActionImplID
+{
+ FFT_DEFAULT_STOCKHAM_ACTION,
+ FFT_DEFAULT_TRANSPOSE_ACTION,
+ FFT_DEFAULT_COPY_ACTION,
+ FFT_STATIC_STOCKHAM_ACTION
+};
+
+//
+// FFTKernelSignatureHeader
+//
+// This structure is a wrapper for the FFTKernelSignature.
+// It stores the signature size and the action ID. This ensure that every
+// FFTKernelSignature (even with an empty DATA) is unique
+//
+// This class is used as the return type of FFTAction::getSignatureData()
+//
+struct FFTKernelSignatureHeader
+{
+ int datasize;
+ FFTActionImplID id;
+
+ //clfftLayout fft_inputLayout;
+ //clfftLayout fft_outputLayout;
+
+ FFTKernelSignatureHeader(int size_, FFTActionImplID id_)
+ {
+ // Set to 0 the whole signature structure
+ ::memset(this, 0, size_);
+ datasize = size_;
+ id = id_;
+ }
+};
+
+//
+// FFTKernelSignature
+//
+// This struct represents the signature of an action. An action signature
+// stores (by inheritage):
+// - the action ID
+// - its signature data size
+// - a set of bytes caracterizes a FFT action
+//
+// This template class FFTKernelSignature provides a simple mechanism to
+// build a proper signature (see also in src/library/repo.h) from any POD type.
+//
+// It is used as a key in the different cache mecanisms (binary cache and
+// dynamic cache managed by FFTRepo)
+//
+template <typename DATA, FFTActionImplID ID>
+struct FFTKernelSignature : public FFTKernelSignatureHeader, public DATA
+{
+ FFTKernelSignature()
+ : FFTKernelSignatureHeader(sizeof(FFTKernelSignature<DATA, ID>), ID)
+ {
+ }
+};
+
+
+
+//
+// FFTAction is the base class for all actions used to implement FFT computes
+//
+// An action basically implements some OpenCL related actions, for instance:
+// - Generating a kernel source code from kernel characteristics
+// - Computing the work-group local sizes according to a kernel
+// - Enqueuing arguments to the kernel call
+//
+// Kernels generated and compiled by an action are stored in the different
+// cache mechanism (see repo.h for the dynamic cache and fft_binary_lookup.h
+// for disk cache for more information). Each cache entry can be obtained from
+// the action signature which is set of byte characterizing the action itself.
+//
+// At the moment, FFTAction only implements the enqueue method which is
+// inherited by every action subclasses. But it may change in the time. There
+// are no clear rules and the choices made until now are still subject to
+// change.
+//
+class FFTAction
+{
+public:
+ FFTAction(FFTPlan * plan, clfftStatus & err);
+
+ virtual clfftStatus enqueue(clfftPlanHandle plHandle,
+ clfftDirection dir,
+ cl_uint numQueuesAndEvents,
+ cl_command_queue* commQueues,
+ cl_uint numWaitEvents,
+ const cl_event* waitEvents,
+ cl_event* outEvents,
+ cl_mem* clInputBuffers,
+ cl_mem* clOutputBuffers);
+
+protected:
+
+ virtual clfftGenerators getGenerator() = 0;
+
+ clfftStatus compileKernels ( const cl_command_queue commQueueFFT, const clfftPlanHandle plHandle, FFTPlan* fftPlan);
+ clfftStatus writeKernel ( const clfftPlanHandle plHandle, const clfftGenerators gen, const FFTKernelSignatureHeader* data, const cl_context& context, const cl_device_id &device);
+
+ virtual clfftStatus generateKernel ( FFTRepo & fftRepo, const cl_command_queue commQueueFFT) = 0;
+ virtual clfftStatus getWorkSizes ( std::vector<size_t> & globalws, std::vector<size_t> & localws) = 0;
+
+ virtual const FFTKernelSignatureHeader * getSignatureData() = 0;
+
+ FFTPlan * plan;
+
+private:
+
+ clfftStatus selectBufferArguments(FFTPlan * plan,
+ cl_mem* clInputBuffers,
+ cl_mem* clOutputBuffers,
+ std::vector< cl_mem > &inputBuff,
+ std::vector< cl_mem > &outputBuff);
+
+ virtual bool buildForwardKernel() = 0;
+ virtual bool buildBackwardKernel() = 0;
+};
+
+
// The "envelope" is a set of limits imposed by the hardware
// This will depend on the GPU(s) in the OpenCL context.
// If there are multiple devices, this should be the least
@@ -194,23 +320,11 @@ struct FFTEnvelope {
}
};
-class FFTRepo;
// This class contains objects that are specific to a particular FFT transform, and the data herein is useful
// for us to know ahead of transform time such that we can optimize for these settings
class FFTPlan
{
- template <clfftGenerators G>
- clfftStatus GetWorkSizesPvt (std::vector<size_t> & globalws, std::vector<size_t> & localws) const;
-
- template <clfftGenerators G>
- clfftStatus GetKernelGenKeyPvt (FFTKernelGenKeyParams & params) const;
-
- template <clfftGenerators G>
- clfftStatus GenerateKernelPvt (FFTRepo& fftRepo, const cl_command_queue& commQueueFFT ) const;
-
- template <clfftGenerators G>
- clfftStatus GetMax1DLengthPvt (size_t *longest ) const;
public:
@@ -301,6 +415,12 @@ public:
BlockComputeType blockComputeType;
+ clfftPlanHandle plHandle;
+
+ // The action
+ FFTAction * action;
+
+
FFTPlan ()
: baked (false)
, dim (CLFFT_1D)
@@ -340,6 +460,8 @@ public:
, planCopy(0)
, const_buffer( NULL )
, gen(Stockham)
+ , action(0)
+ , plHandle(0)
{};
@@ -348,9 +470,6 @@ public:
clfftStatus AllocateBuffers ();
clfftStatus ReleaseBuffers ();
- clfftStatus GetWorkSizes (std::vector<size_t> & globalws, std::vector<size_t> & localws) const;
- clfftStatus GetKernelGenKey (FFTKernelGenKeyParams & params) const;
- clfftStatus GenerateKernel (FFTRepo & fftRepo, const cl_command_queue commQueueFFT) const;
clfftStatus GetMax1DLength (size_t *longest ) const;
void ResetBinarySizes();
@@ -362,6 +481,8 @@ public:
clfftStatus GetEnvelope (const FFTEnvelope **) const;
clfftStatus SetEnvelope ();
+ clfftStatus GetMax1DLengthStockham (size_t *longest ) const;
+
~FFTPlan ()
{
ReleaseBuffers ();
diff --git a/src/library/private.h b/src/library/private.h
index 000ab65..fecb84f 100644
--- a/src/library/private.h
+++ b/src/library/private.h
@@ -90,6 +90,8 @@
}
#endif
+void clfftInitBinaryCache();
+
// This header file is not visible to clients, and contains internal structures and functions for use
// by the FFT library. Since this header is private to this implementation, there is no need to keep
// strict C compliance.
@@ -270,6 +272,8 @@ inline tstring clfftErrorStatusAsString( const cl_int& status )
// This is used to either wrap an OpenCL function call, or to explicitly check a variable for an OpenCL error condition.
// If an error occurs, we issue a return statement to exit the calling function.
+#if defined( _DEBUG )
+
#define OPENCL_V( fn, msg ) \
{ \
clfftStatus vclStatus = static_cast< clfftStatus >( fn ); \
@@ -288,6 +292,23 @@ inline tstring clfftErrorStatusAsString( const cl_int& status )
} \
}
+#else
+
+#define OPENCL_V( fn, msg ) \
+{ \
+ clfftStatus vclStatus = static_cast< clfftStatus >( fn ); \
+ switch( vclStatus ) \
+ { \
+ case CL_SUCCESS: /**< No error */ \
+ break; \
+ default: \
+ { \
+ return vclStatus; \
+ } \
+ } \
+}
+#endif
+
static inline bool IsPo2 (size_t u) {
return (u != 0) && (0 == (u & (u-1)));
}
diff --git a/src/library/repo.cpp b/src/library/repo.cpp
index 6d44985..9e2ab8d 100644
--- a/src/library/repo.cpp
+++ b/src/library/repo.cpp
@@ -60,6 +60,11 @@ clfftStatus FFTRepo::releaseResources( )
//
for (fftRepo_iterator iProg = mapFFTs.begin( ); iProg != mapFFTs.end( ); ++iProg )
{
+ if (iProg->first.data != NULL)
+ {
+ const_cast<FFTRepoKey*>(&iProg->first)->deleteData();
+ }
+
cl_program p = iProg->second.clProgram;
iProg->second.clProgram = NULL;
if (NULL != p)
@@ -91,14 +96,13 @@ clfftStatus FFTRepo::releaseResources( )
return CLFFT_SUCCESS;
}
-clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const std::string& kernel, const cl_device_id &device, const cl_context& planContext )
+clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelSignatureHeader * data, const std::string& kernel, const cl_device_id &device, const cl_context& planContext )
{
scopedLock sLock( lockRepo, _T( "setProgramCode" ) );
- ClPair clPair = std::make_pair(planContext, device);
- std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
- fftRepoKey key = std::make_pair( gen, Params );
+ FFTRepoKey key(gen, data, planContext, device);
+ key.privatizeData();
// Prefix copyright statement at the top of generated kernels
std::stringstream ss;
@@ -127,13 +131,11 @@ clfftStatus FFTRepo::setProgramCode( const clfftGenerators gen, const FFTKernelG
return CLFFT_SUCCESS;
}
-clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, std::string& kernel, const cl_device_id &device, const cl_context& planContext )
+clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelSignatureHeader * data, std::string& kernel, const cl_device_id &device, const cl_context& planContext )
{
scopedLock sLock( lockRepo, _T( "getProgramCode" ) );
- ClPair clPair = std::make_pair(planContext, device);
- std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
- fftRepoKey key = std::make_pair( gen, Params );
+ FFTRepoKey key(gen, data, planContext, device);
fftRepo_iterator pos = mapFFTs.find( key);
if( pos == mapFFTs.end( ) )
@@ -143,14 +145,12 @@ clfftStatus FFTRepo::getProgramCode( const clfftGenerators gen, const FFTKernelG
return CLFFT_SUCCESS;
}
-clfftStatus FFTRepo::setProgramEntryPoints( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam,
+clfftStatus FFTRepo::setProgramEntryPoints( const clfftGenerators gen, const FFTKernelSignatureHeader * data,
const char * kernel_fwd, const char * kernel_back, const cl_device_id &device, const cl_context& planContext )
{
scopedLock sLock( lockRepo, _T( "setProgramEntryPoints" ) );
- ClPair clPair = std::make_pair(planContext, device);
- std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
- fftRepoKey key = std::make_pair( gen, Params );
+ FFTRepoKey key(gen, data, planContext, device);
fftRepoValue& fft = mapFFTs[ key ];
fft.EntryPoint_fwd = kernel_fwd;
@@ -159,14 +159,12 @@ clfftStatus FFTRepo::setProgramEntryPoints( const clfftGenerators gen, const FFT
return CLFFT_SUCCESS;
}
-clfftStatus FFTRepo::getProgramEntryPoint( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam,
+clfftStatus FFTRepo::getProgramEntryPoint( const clfftGenerators gen, const FFTKernelSignatureHeader * data,
clfftDirection dir, std::string& kernel, const cl_device_id &device, const cl_context& planContext )
{
scopedLock sLock( lockRepo, _T( "getProgramEntryPoint" ) );
- ClPair clPair = std::make_pair(planContext, device);
- std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
- fftRepoKey key = std::make_pair( gen, Params );
+ FFTRepoKey key(gen, data, planContext, device);
fftRepo_iterator pos = mapFFTs.find( key );
if( pos == mapFFTs.end( ) )
@@ -190,23 +188,18 @@ clfftStatus FFTRepo::getProgramEntryPoint( const clfftGenerators gen, const FFTK
return CLFFT_SUCCESS;
}
-clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& prog, const cl_device_id &device, const cl_context& planContext )
+clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelSignatureHeader * data, const cl_program& prog, const cl_device_id &device, const cl_context& planContext )
{
scopedLock sLock( lockRepo, _T( "setclProgram" ) );
- cl_int status = CL_SUCCESS;
- cl_context ProgramContext = NULL;
- status = clGetProgramInfo(prog, CL_PROGRAM_CONTEXT, sizeof(cl_context), &ProgramContext, NULL);
-
- OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
-
- ClPair clPair = std::make_pair(planContext, device);
- std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
- fftRepoKey key = std::make_pair( gen, Params );
+ FFTRepoKey key(gen, data, planContext, device);
fftRepo_iterator pos = mapFFTs.find( key );
if( pos == mapFFTs.end( ) )
+ {
+ key.privatizeData(); // the key owns the data
mapFFTs[ key ].clProgram = prog;
+ }
else {
cl_program p = pos->second.clProgram;
assert (NULL == p);
@@ -218,13 +211,11 @@ clfftStatus FFTRepo::setclProgram( const clfftGenerators gen, const FFTKernelGen
return CLFFT_SUCCESS;
}
-clfftStatus FFTRepo::getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& prog, const cl_device_id &device, const cl_context& planContext )
+clfftStatus FFTRepo::getclProgram( const clfftGenerators gen, const FFTKernelSignatureHeader * data, cl_program& prog, const cl_device_id &device, const cl_context& planContext )
{
scopedLock sLock( lockRepo, _T( "getclProgram" ) );
- ClPair clPair = std::make_pair(planContext, device);
- std::pair<FFTKernelGenKeyParams, ClPair> Params = std::make_pair(fftParam, clPair);
- fftRepoKey key = std::make_pair( gen, Params );
+ FFTRepoKey key(gen, data, planContext, device);
fftRepo_iterator pos = mapFFTs.find( key );
if( pos == mapFFTs.end( ) )
diff --git a/src/library/repo.h b/src/library/repo.h
index 9adc349..5cce489 100644
--- a/src/library/repo.h
+++ b/src/library/repo.h
@@ -27,6 +27,8 @@
+
+
// This class contains objects that we wish to retain between individual calls into the FFT interface.
// These objects will be shared across different individual FFT plans, and we wish to keep only one
// copy of these programs, objects and events. When the client decides that they either want to reset
@@ -34,6 +36,61 @@
// up as much as it can. It is implemented as a Singleton object.
class FFTRepo
{
+
+ struct FFTRepoKey
+ {
+ clfftGenerators gen;
+ const FFTKernelSignatureHeader * data;
+ cl_context context;
+ cl_device_id device;
+
+ FFTRepoKey(clfftGenerators gen_, const FFTKernelSignatureHeader * data_, cl_context context_, cl_device_id device_)
+ : gen(gen_), data(data_), context(context_), device(device_)
+ {
+
+ }
+
+ void privatizeData()
+ {
+ char * tmp = new char[data->datasize];
+ ::memcpy(tmp, data, data->datasize);
+ this->data = (FFTKernelSignatureHeader*) tmp;
+ }
+
+ void deleteData()
+ {
+ if (this->data)
+ {
+ delete this->data;
+ }
+
+ this->data = NULL;
+ }
+
+ bool operator<(const FFTRepoKey & b) const
+ {
+ const FFTRepoKey & a = *this;
+
+ if (a.gen != b.gen)
+ {
+ return a.gen < b.gen;
+ }
+ else if (a.data->datasize != b.data->datasize)
+ {
+ return a.data->datasize < b.data->datasize;
+ }
+ else if (a.context != b.context)
+ {
+ return a.context < b.context;
+ }
+ else
+ {
+ return ::memcmp(a.data, b.data, a.data->datasize) < 0;
+ }
+ }
+ };
+
+
// Structure containing all the data we need to remember for a specific invokation of a kernel
// generator
struct fftRepoValue {
@@ -51,9 +108,7 @@ class FFTRepo
// has created
//typedef std::pair< clfftGenerators, FFTKernelGenKeyParams > fftRepoKey;
- typedef std::pair< cl_context, cl_device_id > ClPair;
- typedef std::pair< clfftGenerators, std::pair<FFTKernelGenKeyParams, ClPair> > fftRepoKey;
- typedef std::map< fftRepoKey, fftRepoValue > fftRepoType;
+ typedef std::map< FFTRepoKey, fftRepoValue > fftRepoType;
typedef fftRepoType::iterator fftRepo_iterator;
@@ -140,14 +195,14 @@ public:
clfftStatus releaseResources( );
- clfftStatus setProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams&, const std::string& kernel, const cl_device_id &device, const cl_context& planContext );
- clfftStatus getProgramCode( const clfftGenerators gen, const FFTKernelGenKeyParams&, std::string& kernel, const cl_device_id &device, const cl_context& planContext );
+ clfftStatus setProgramCode( const clfftGenerators gen, const FFTKernelSignatureHeader * data, const std::string& kernel, const cl_device_id &device, const cl_context& planContext );
+ clfftStatus getProgramCode( const clfftGenerators gen, const FFTKernelSignatureHeader * data, std::string& kernel, const cl_device_id &device, const cl_context& planContext );
- clfftStatus setProgramEntryPoints( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const char * kernel_fwd, const char * kernel_back, const cl_device_id &device, const cl_context& planContext );
- clfftStatus getProgramEntryPoint( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, clfftDirection dir, std::string& kernel , const cl_device_id &device, const cl_context& planContext );
+ clfftStatus setProgramEntryPoints( const clfftGenerators gen, const FFTKernelSignatureHeader * data, const char * kernel_fwd, const char * kernel_back, const cl_device_id &device, const cl_context& planContext );
+ clfftStatus getProgramEntryPoint( const clfftGenerators gen, const FFTKernelSignatureHeader * data, clfftDirection dir, std::string& kernel , const cl_device_id &device, const cl_context& planContext );
- clfftStatus setclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, const cl_program& prog, const cl_device_id &device, const cl_context& planContext );
- clfftStatus getclProgram( const clfftGenerators gen, const FFTKernelGenKeyParams& fftParam, cl_program& prog, const cl_device_id &device, const cl_context& planContext );
+ clfftStatus setclProgram( const clfftGenerators gen, const FFTKernelSignatureHeader * data, const cl_program& prog, const cl_device_id &device, const cl_context& planContext );
+ clfftStatus getclProgram( const clfftGenerators gen, const FFTKernelSignatureHeader * data, cl_program& prog, const cl_device_id &device, const cl_context& planContext );
clfftStatus setclKernel ( cl_program prog, clfftDirection dir, const cl_kernel& kernel );
clfftStatus getclKernel ( cl_program prog, clfftDirection dir, cl_kernel& kernel );
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index 8ab7efa..8dc5e9f 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -846,488 +846,13 @@ clfftStatus clfftEnqueueTransform(
}
}
- // 1d with normal length will fall into the below category
- // add: 2d transpose kernel will fall into here too.
- vector< cl_mem > inputBuff;
- vector< cl_mem > outputBuff;
- inputBuff.reserve( 2 );
- outputBuff.reserve( 2 );
-
- // Decode the relevant properties from the plan paramter to figure out how many input/output buffers we have
- switch( fftPlan->inputLayout )
- {
- case CLFFT_COMPLEX_INTERLEAVED:
- {
- switch( fftPlan->outputLayout )
- {
- case CLFFT_COMPLEX_INTERLEAVED:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- }
-
- break;
- }
- case CLFFT_COMPLEX_PLANAR:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- // Invalid to be an inplace transform, and go from 1 to 2 buffers
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 1 ] );
- }
-
- break;
- }
- case CLFFT_HERMITIAN_INTERLEAVED:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- }
-
- break;
- }
- case CLFFT_HERMITIAN_PLANAR:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 1 ] );
- }
-
- break;
- }
- case CLFFT_REAL:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- }
-
- break;
- }
- default:
- {
- // Don't recognize output layout
- return CLFFT_INVALID_ARG_VALUE;
- }
- }
-
- break;
- }
- case CLFFT_COMPLEX_PLANAR:
- {
- switch( fftPlan->outputLayout )
- {
- case CLFFT_COMPLEX_INTERLEAVED:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- inputBuff.push_back( clInputBuffers[ 1 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- }
-
- break;
- }
- case CLFFT_COMPLEX_PLANAR:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- inputBuff.push_back( clInputBuffers[ 1 ] );
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- inputBuff.push_back( clInputBuffers[ 1 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 1 ] );
- }
-
- break;
- }
- case CLFFT_HERMITIAN_INTERLEAVED:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- inputBuff.push_back( clInputBuffers[ 1 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- }
-
- break;
- }
- case CLFFT_HERMITIAN_PLANAR:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- inputBuff.push_back( clInputBuffers[ 1 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 1 ] );
- }
-
- break;
- }
- case CLFFT_REAL:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- inputBuff.push_back( clInputBuffers[ 1 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- }
-
- break;
- }
- default:
- {
- // Don't recognize output layout
- return CLFFT_INVALID_ARG_VALUE;
- }
- }
-
- break;
- }
- case CLFFT_HERMITIAN_INTERLEAVED:
- {
- switch( fftPlan->outputLayout )
- {
- case CLFFT_COMPLEX_INTERLEAVED:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- }
-
- break;
- }
- case CLFFT_COMPLEX_PLANAR:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 1 ] );
- }
-
- break;
- }
- case CLFFT_HERMITIAN_INTERLEAVED:
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- case CLFFT_HERMITIAN_PLANAR:
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- case CLFFT_REAL:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- }
-
- break;
- }
- default:
- {
- // Don't recognize output layout
- return CLFFT_INVALID_ARG_VALUE;
- }
- }
-
- break;
- }
- case CLFFT_HERMITIAN_PLANAR:
- {
- switch( fftPlan->outputLayout )
- {
- case CLFFT_COMPLEX_INTERLEAVED:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- inputBuff.push_back( clInputBuffers[ 1 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- }
-
- break;
- }
- case CLFFT_COMPLEX_PLANAR:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- inputBuff.push_back( clInputBuffers[ 1 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 1 ] );
- }
-
- break;
- }
- case CLFFT_HERMITIAN_INTERLEAVED:
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- case CLFFT_HERMITIAN_PLANAR:
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- case CLFFT_REAL:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- inputBuff.push_back( clInputBuffers[ 1 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- }
-
- break;
- }
- default:
- {
- // Don't recognize output layout
- return CLFFT_INVALID_ARG_VALUE;
- }
- }
-
- break;
- }
- case CLFFT_REAL:
- {
- switch( fftPlan->outputLayout )
- {
- case CLFFT_COMPLEX_INTERLEAVED:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- }
-
- break;
- }
- case CLFFT_COMPLEX_PLANAR:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 1 ] );
- }
-
- break;
- }
- case CLFFT_HERMITIAN_INTERLEAVED:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- }
-
- break;
- }
- case CLFFT_HERMITIAN_PLANAR:
- {
- if( fftPlan->placeness == CLFFT_INPLACE )
- {
- return CLFFT_INVALID_ARG_VALUE;
- }
- else
- {
- inputBuff.push_back( clInputBuffers[ 0 ] );
-
- outputBuff.push_back( clOutputBuffers[ 0 ] );
- outputBuff.push_back( clOutputBuffers[ 1 ] );
- }
-
- break;
- }
- default:
- {
- // Don't recognize output layout
- return CLFFT_INVALID_ARG_VALUE;
- }
- }
-
- break;
- }
- default:
- {
- // Don't recognize output layout
- return CLFFT_INVALID_ARG_VALUE;
- }
- }
-
- // TODO: In the case of length == 1, FFT is a trivial NOP, but we still need to apply the forward and backwards tranforms
- // TODO: Are map lookups expensive to call here? We can cache a pointer to the cl_program/cl_kernel in the plan
-
- FFTKernelGenKeyParams fftParams;
- // Translate the user plan into the structure that we use to map plans to clPrograms
- OPENCL_V( fftPlan->GetKernelGenKey( fftParams ), _T("GetKernelGenKey() failed!") );
-
- cl_program prog;
- cl_kernel kern;
- OPENCL_V( fftRepo.getclProgram( fftPlan->gen, fftParams, prog, fftPlan->bakeDevice, fftPlan->context ), _T( "fftRepo.getclProgram failed" ) );
- OPENCL_V( fftRepo.getclKernel( prog, dir, kern ), _T( "fftRepo.getclKernels failed" ) );
-
-
-
- cl_uint uarg = 0;
- if (!fftPlan->transflag && !(fftPlan->gen == Copy))
- {
- // ::clSetKernelArg() is not thread safe, according to the openCL spec for the same cl_kernel object
- // TODO: Need to verify that two different plans (which would get through our lock above) with exactly the same
- // parameters would NOT share the same cl_kernel objects
-
- /* constant buffer */
- OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&fftPlan->const_buffer ), _T( "clSetKernelArg failed" ) );
- }
-
- // Input buffer(s)
- // Input may be 1 buffer (CLFFT_COMPLEX_INTERLEAVED)
- // or 2 buffers (CLFFT_COMPLEX_PLANAR)
-
- for (size_t i = 0; i < inputBuff.size(); ++i)
- {
- OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&inputBuff[i] ), _T( "clSetKernelArg failed" ) );
- }
- // Output buffer(s)
- // Output may be 0 buffers (CLFFT_INPLACE)
- // or 1 buffer (CLFFT_COMPLEX_INTERLEAVED)
- // or 2 buffers (CLFFT_COMPLEX_PLANAR)
- for (size_t o = 0; o < outputBuff.size(); ++o)
- {
- OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&outputBuff[o] ), _T( "clSetKernelArg failed" ) );
- }
-
- vector< size_t > gWorkSize;
- vector< size_t > lWorkSize;
- clfftStatus result = fftPlan->GetWorkSizes (gWorkSize, lWorkSize);
-
- // TODO: if GetWorkSizes returns CLFFT_INVALID_GLOBAL_WORK_SIZE, that means
- // that this multidimensional input data array is too large to be transformed
- // with a single call to clEnqueueNDRangeKernel. For now, we will just return
- // the error code back up the call stack.
- // The *correct* course of action would be to split the work into mutliple
- // calls to clEnqueueNDRangeKernel.
- if (CLFFT_INVALID_GLOBAL_WORK_SIZE == result)
- {
- OPENCL_V( result, _T("Work size too large for clEnqueNDRangeKernel()"));
- }
- else
- {
- OPENCL_V( result, _T("FFTPlan::GetWorkSizes failed"));
- }
- BUG_CHECK (gWorkSize.size() == lWorkSize.size());
-
- //size_t *lwSize = NULL;
- //if(fftPlan->gen != Copy) lwSize = &lWorkSize[ 0 ];
-
- status = clEnqueueNDRangeKernel( *commQueues, kern, static_cast< cl_uint >( gWorkSize.size( ) ),
- NULL, &gWorkSize[ 0 ], &lWorkSize[ 0 ], numWaitEvents, waitEvents, outEvents );
- OPENCL_V( status, _T( "clEnqueueNDRangeKernel failed" ) );
-
- if( fftRepo.pStatTimer )
- {
- fftRepo.pStatTimer->AddSample( plHandle, fftPlan, kern, numQueuesAndEvents, outEvents, gWorkSize );
- }
-
- return CLFFT_SUCCESS;
+ return fftPlan->action->enqueue(plHandle,
+ dir,
+ numQueuesAndEvents,
+ commQueues,
+ numWaitEvents,
+ waitEvents,
+ outEvents,
+ clInputBuffers,
+ clOutputBuffers);
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list