[clfft] 99/128: fixing some bugs in real2hermitian, debug continues
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:44 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 0993a32c23c0f767948b731ce7f72b58c3c6bc5e
Author: bragadeesh <bragadeesh.natarajan at amd.com>
Date: Wed Oct 7 23:57:51 2015 -0700
fixing some bugs in real2hermitian, debug continues
---
src/callback-client/CMakeLists.txt | 7 +-
src/client/CMakeLists.pack | 182 ------------------------------
src/library/generator.stockham.cpp | 5 +-
src/library/plan.cpp | 199 +++++++++++++++++++--------------
src/library/transform.cpp | 34 ++++--
src/statTimer/statisticalTimer.GPU.cpp | 3 +
src/statTimer/statisticalTimer.GPU.h | 4 +-
7 files changed, 145 insertions(+), 289 deletions(-)
diff --git a/src/callback-client/CMakeLists.txt b/src/callback-client/CMakeLists.txt
index dc63b8e..2bc8384 100644
--- a/src/callback-client/CMakeLists.txt
+++ b/src/callback-client/CMakeLists.txt
@@ -54,9 +54,4 @@ if( APPLE )
set_target_properties( clFFT-callback-client PROPERTIES INSTALL_RPATH "@loader_path/../lib${SUFFIX_LIB}")
endif()
-# CPack configuration; include the executable into the package
-install( TARGETS clFFT-callback-client
- RUNTIME DESTINATION bin${SUFFIX_BIN}
- LIBRARY DESTINATION lib${SUFFIX_LIB}
- ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
- )
+
diff --git a/src/client/CMakeLists.pack b/src/client/CMakeLists.pack
deleted file mode 100644
index 2fcf3ea..0000000
--- a/src/client/CMakeLists.pack
+++ /dev/null
@@ -1,182 +0,0 @@
-# ########################################################################
-# Copyright 2013 Advanced Micro Devices, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ########################################################################
-cmake_minimum_required( VERSION 2.6 )
-project( clFFT.Sample )
-
-# If AMDAPPSDKROOT is defined as an environment value, use that value and cache it so it's visible in the cmake-gui.
-# Otherwise, create a sensible default that the user can change
-if( DEFINED ENV{AMDAPPSDKROOT} )
- set( AMD_APP_SDK_ROOT $ENV{AMDAPPSDKROOT} CACHE PATH "Environment variable defining the root of the ATI Stream SDK" )
-else( )
- set( AMD_APP_SDK_ROOT "/Path/To/ATI_Stream_SDK" CACHE PATH "Modify this variable to point to the root of the ATI Stream SDK installation" )
-endif( )
-
-# If BOOST_ROOT is defined as an environment value, use that value and cache it so it's visible in the cmake-gui.
-# Otherwise, create a sensible default that the user can change
-if( DEFINED ENV{BOOST_ROOT} )
- set( BOOST_ROOT $ENV{BOOST_ROOT} CACHE PATH "Environment variable defining the root of the Boost installation" )
-else( )
- if( UNIX )
- set( BOOST_ROOT "/usr" CACHE PATH "Modify this variable to point to the root of the Boost installation" )
- else( )
- set( BOOST_ROOT "/Path/To/boost_x_xx_x" CACHE PATH "Modify this variable to point to the root of the Boost installation" )
- endif()
-endif( )
-
-# Currently, linux has a problem outputing both narrow and wide characters,
-# which happens in our client because openCL only supports narrow characters
-if( WIN32 )
- option( UNICODE "Build with Unicode Support" ON )
- if( UNICODE )
- message( STATUS "UNICODE build" )
- endif( )
-else()
- set( UNICODE OFF )
- message( STATUS "UNICODE feature disabled on linux" )
-endif()
-
-if( MSVC_IDE )
- set( BUILD64 ${CMAKE_CL_64} )
-else()
- option( BUILD64 "Build a 64-bit product" ON )
- if( BUILD64 )
- message( STATUS "64-bit build" )
- endif( )
-
- if( IS_DIRECTORY ${PROJECT_SOURCE_DIR}/library/test )
- option( CODE_COVERAGE "Build makefiles with code coverage instrumentation" OFF )
- if( CODE_COVERAGE )
- message( STATUS "Code coverage instrumentation on" )
- endif()
- endif()
-endif()
-
-# For linux, modify the global find property to help us find libraries like Boost in the correct paths
-if( UNIX )
- if( BUILD64 )
- set_property( GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS TRUE )
- message( STATUS "64bit build - FIND_LIBRARY_USE_LIB64_PATHS: ${FIND_LIBRARY_USE_LIB64_PATHS}" )
- else()
- set_property( GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS FALSE )
- message( STATUS "32bit build - FIND_LIBRARY_USE_LIB64_PATHS: ${FIND_LIBRARY_USE_LIB64_PATHS}" )
- endif()
-endif()
-
-# Find the absolute path to the opencl library that we need to link too; the path depends on being 64bit or 32bit
-if( BUILD64 )
- find_library( OPENCL_LIBRARIES
- NAMES OpenCL
- HINTS
- ${AMD_APP_SDK_ROOT}/lib/
- ENV AMD_APP_SDK_ROOT
- PATH_SUFFIXES x86_64 x86
- )
-else()
- find_library( OPENCL_LIBRARIES
- NAMES OpenCL
- HINTS
- ${AMD_APP_SDK_ROOT}/lib/
- ENV AMD_APP_SDK_ROOT
- PATH_SUFFIXES x86
- )
-endif()
-message( STATUS "OPENCL_LIBRARIES: ${OPENCL_LIBRARIES}" )
-
-set( Boost_USE_MULTITHREADED ON )
-set( Boost_USE_STATIC_LIBS ON )
-set( Boost_DETAILED_FAILURE_MSG ON )
-set( Boost_DEBUG ON )
-set( Boost_ADDITIONAL_VERSIONS "1.44.0" "1.44" )
-# On linux, the boost installed in the system always appears to override any user boost installs
-if( UNIX )
- set( Boost_NO_SYSTEM_PATHS TRUE )
-endif( )
-find_package( Boost 1.33.0 COMPONENTS program_options )
-message(STATUS "Boost_PROGRAM_OPTIONS_LIBRARY: ${Boost_PROGRAM_OPTIONS_LIBRARY}")
-
-# FFLAGS depend on the compiler, grab the compiler name from the path
-get_filename_component( C_COMPILER_NAME ${CMAKE_C_COMPILER} NAME_WE )
-# message( "C_COMPILER_NAME: " ${C_COMPILER_NAME} )
-# message( "CMAKE_C_COMPILER: " ${CMAKE_C_COMPILER} )
-
-# Set common compile and link options
-if( C_COMPILER_NAME STREQUAL "cl" )
- # Following options for nMake
- message( STATUS "Detected MSVS Ver: " ${MSVC_VERSION} )
- if( NOT MSVC_IDE )
- message( STATUS "Using an nMake environment to build" )
-
- endif( )
-
-elseif( C_COMPILER_NAME STREQUAL "gcc" )
- message( STATUS "Detected GNU fortran compiler." )
- # set( CMAKE_CXX_FLAGS "-std=c++0x ${CMAKE_CXX_FLAGS}" )
-
- if( BUILD64 )
- set( CMAKE_CXX_FLAGS "-m64 ${CMAKE_CXX_FLAGS}" )
- set( CMAKE_C_FLAGS "-m64 ${CMAKE_C_FLAGS}" )
- else( )
- set( CMAKE_CXX_FLAGS "-m32 ${CMAKE_CXX_FLAGS}" )
- set( CMAKE_C_FLAGS "-m32 ${CMAKE_C_FLAGS}" )
- endif( )
-else( )
- message( FATAL_ERROR "Compiler name not detected" )
-endif( )
-
-# If UNICODE is defined, pass extra definitions into
-if( UNICODE )
- add_definitions( "/DUNICODE /D_UNICODE" )
-endif( )
-
-# Print out compiler flags for viewing/debug
-message( STATUS "CMAKE_CXX_COMPILER flags: " ${CMAKE_CXX_FLAGS} )
-message( STATUS "CMAKE_CXX_COMPILER debug flags: " ${CMAKE_CXX_FLAGS_DEBUG} )
-message( STATUS "CMAKE_CXX_COMPILER release flags: " ${CMAKE_CXX_FLAGS_RELEASE} )
-message( STATUS "CMAKE_CXX_COMPILER relwithdebinfo flags: " ${CMAKE_CXX_FLAGS_RELWITHDEBINFO} )
-message( STATUS "CMAKE_EXE_LINKER link flags: " ${CMAKE_EXE_LINKER_FLAGS} )
-
-include_directories( ${Boost_INCLUDE_DIRS} ${AMD_APP_SDK_ROOT}/include ${PROJECT_SOURCE_DIR}/../include )
-
-# Set the OpenCL library include path depending on target platform
-if( BUILD64 )
- if( WIN32 )
- link_directories( ${AMD_APP_SDK_ROOT}/lib/x86_64/ ${PROJECT_SOURCE_DIR}/../lib64/import )
- elseif( UNIX )
- link_directories( ${AMD_APP_SDK_ROOT}/lib/x86_64/ ${PROJECT_SOURCE_DIR}/../lib64 )
- endif()
-else()
- if( WIN32 )
- link_directories( ${AMD_APP_SDK_ROOT}/lib/x86/ ${PROJECT_SOURCE_DIR}/../lib32/import )
- elseif( UNIX )
- link_directories( ${AMD_APP_SDK_ROOT}/lib/x86/ ${PROJECT_SOURCE_DIR}/../lib32 )
- endif()
-endif()
-
-add_executable( Client
- # sources follow
- client.cpp
- openCL.misc.cpp
- statisticalTimer.cpp
- stdafx.cpp
- client.h
- openCL.misc.h
- statisticalTimer.h
- stdafx.h
- targetver.h
- unicode.compatibility.h
- ../include/clFFT.h )
-
-target_link_libraries( clFFT.Client clFFT ${Boost_LIBRARIES} ${OPENCL_LIBRARIES})
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index c3fa5f9..40f54eb 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -2746,10 +2746,7 @@ namespace StockhamGenerator
// Set half lds for real transforms
halfLds = r2c2r ? true : halfLds;
- // Set half lds for radix7
- halfLds = (length % 7 == 0) ? true : halfLds;
-
- linearRegs = halfLds;
+ linearRegs = true;
realSpecial = params.fft_realSpecial;
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 62a3da4..a90f43d 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -605,7 +605,6 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
clLengths[0] = fftPlan->length[0]/clLengths[1];
-
// Start of block where transposes are generated; 1D FFT
while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
{
@@ -1058,13 +1057,13 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
fftPlan->baked = true;
return CLFFT_SUCCESS;
}
- else if(fftPlan->inputLayout == CLFFT_REAL)
+ else if (fftPlan->inputLayout == CLFFT_REAL)
{
- if (fftPlan->tmpBufSizeRC==0 )
+ if (fftPlan->tmpBufSizeRC == 0)
{
fftPlan->tmpBufSizeRC = length0 * length1 *
fftPlan->batchsize * fftPlan->ElementSize();
- for (size_t index=1; index < fftPlan->length.size(); index++)
+ for (size_t index = 1; index < fftPlan->length.size(); index++)
{
fftPlan->tmpBufSizeRC *= fftPlan->length[index];
}
@@ -1072,12 +1071,12 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
// column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
// transposed output
- OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
- _T( "CreateDefaultPlan Large1d column failed" ) );
+ OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1]),
+ _T("CreateDefaultPlan Large1d column failed"));
- FFTPlan* colTPlan = NULL;
- lockRAII* colLock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+ FFTPlan* colTPlan = NULL;
+ lockRAII* colLock = NULL;
+ OPENCL_V(fftRepo.getPlan(fftPlan->planX, colTPlan, colLock), _T("fftRepo.getPlan failed"));
// current plan is to create intermediate buffer, packed and interleave
// This is a column FFT, the first elements distance between each FFT is the distance of the first two
@@ -1085,39 +1084,39 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
// we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
//this part are common for both passes
- colTPlan->placeness = CLFFT_OUTOFPLACE;
- colTPlan->precision = fftPlan->precision;
- colTPlan->forwardScale = 1.0f;
+ colTPlan->placeness = CLFFT_OUTOFPLACE;
+ colTPlan->precision = fftPlan->precision;
+ colTPlan->forwardScale = 1.0f;
colTPlan->backwardScale = 1.0f;
- colTPlan->tmpBufSize = 0;
- colTPlan->batchsize = fftPlan->batchsize;
+ colTPlan->tmpBufSize = 0;
+ colTPlan->batchsize = fftPlan->batchsize;
- colTPlan->gen = fftPlan->gen;
- colTPlan->envelope = fftPlan->envelope;
+ colTPlan->gen = fftPlan->gen;
+ colTPlan->envelope = fftPlan->envelope;
//Pass large1D flag to confirm we need multiply twiddle factor
- colTPlan->large1D = fftPlan->length[0];
- colTPlan->RCsimple = true;
+ colTPlan->large1D = fftPlan->length[0];
+ colTPlan->RCsimple = true;
colTPlan->length.push_back(clLengths[0]);
// first Pass
- colTPlan->inputLayout = fftPlan->inputLayout;
- colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
- colTPlan->inStride[0] = fftPlan->inStride[0] * clLengths[0];
- colTPlan->outStride[0] = 1;
- colTPlan->iDist = fftPlan->iDist;
- colTPlan->oDist = length0 * length1;//fftPlan->length[0];
+ colTPlan->inputLayout = fftPlan->inputLayout;
+ colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ colTPlan->inStride[0] = fftPlan->inStride[0] * clLengths[0];
+ colTPlan->outStride[0] = 1;
+ colTPlan->iDist = fftPlan->iDist;
+ colTPlan->oDist = length0 * length1;//fftPlan->length[0];
colTPlan->inStride.push_back(fftPlan->inStride[0]);
colTPlan->outStride.push_back(length1);//clLengths[1]);
- for (size_t index=1; index < fftPlan->length.size(); index++)
+ for (size_t index = 1; index < fftPlan->length.size(); index++)
{
colTPlan->length.push_back(fftPlan->length[index]);
colTPlan->inStride.push_back(fftPlan->inStride[index]);
// tmp buffer is tightly packed
colTPlan->outStride.push_back(colTPlan->oDist);
- colTPlan->oDist *= fftPlan->length[index];
+ colTPlan->oDist *= fftPlan->length[index];
}
//Set callback data if set on top level plan
@@ -1128,97 +1127,127 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
colTPlan->precallUserData = fftPlan->precallUserData;
}
- OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
+ OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL), _T("BakePlan large1d first column plan failed"));
//another column FFT, size clLengths[0], batch clLengths[1], output without transpose
- OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
- _T( "CreateDefaultPlan large1D row failed" ) );
+ OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0]),
+ _T("CreateDefaultPlan large1D row failed"));
- FFTPlan* col2Plan = NULL;
- lockRAII* rowLock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+ FFTPlan* col2Plan = NULL;
+ lockRAII* rowLock = NULL;
+ OPENCL_V(fftRepo.getPlan(fftPlan->planY, col2Plan, rowLock), _T("fftRepo.getPlan failed"));
// This is second column fft, intermediate buffer is packed and interleaved
// we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
- // common part for both passes
- col2Plan->placeness = CLFFT_INPLACE;
- col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
- col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
-
- col2Plan->precision = fftPlan->precision;
- col2Plan->forwardScale = fftPlan->forwardScale;
+ col2Plan->precision = fftPlan->precision;
+ col2Plan->forwardScale = fftPlan->forwardScale;
col2Plan->backwardScale = fftPlan->backwardScale;
- col2Plan->tmpBufSize = 0;
- col2Plan->batchsize = fftPlan->batchsize;
+ col2Plan->tmpBufSize = 0;
+ col2Plan->batchsize = fftPlan->batchsize;
- col2Plan->gen = fftPlan->gen;
- col2Plan->envelope = fftPlan->envelope;
+ col2Plan->gen = fftPlan->gen;
+ col2Plan->envelope = fftPlan->envelope;
col2Plan->length.push_back(length1);
- col2Plan->inStride[0] = length1;
+ col2Plan->inStride[0] = length1;
col2Plan->inStride.push_back(1);
- col2Plan->iDist = length0 * length1;
+ col2Plan->iDist = length0 * length1;
- col2Plan->outStride[0] = length1;
- col2Plan->outStride.push_back(1);
- col2Plan->oDist = length0 * length1;
+ if (colTPlan->planX)
+ {
+ col2Plan->large1D = fftPlan->length[0];
+ col2Plan->twiddleFront = true;
+ }
- for (size_t index=1; index < fftPlan->length.size(); index++)
+ if ((fftPlan->outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
+ (fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR))
{
- col2Plan->length.push_back(fftPlan->length[index]);
- col2Plan->inStride.push_back(col2Plan->iDist);
- col2Plan->outStride.push_back(col2Plan->oDist);
- col2Plan->iDist *= fftPlan->length[index];
- col2Plan->oDist *= fftPlan->length[index];
+ col2Plan->placeness = CLFFT_INPLACE;
+ col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+
+ col2Plan->outStride[0] = length1;
+ col2Plan->outStride.push_back(1);
+ col2Plan->oDist = length0 * length1;
+
+ for (size_t index = 1; index < fftPlan->length.size(); index++)
+ {
+ col2Plan->length.push_back(fftPlan->length[index]);
+ col2Plan->inStride.push_back(col2Plan->iDist);
+ col2Plan->outStride.push_back(col2Plan->oDist);
+ col2Plan->iDist *= fftPlan->length[index];
+ col2Plan->oDist *= fftPlan->length[index];
+ }
+ }
+ else
+ {
+ col2Plan->placeness = CLFFT_OUTOFPLACE;
+ col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ col2Plan->outputLayout = fftPlan->outputLayout;
+
+ col2Plan->outStride[0] = length1*fftPlan->outStride[0];
+ col2Plan->outStride.push_back(fftPlan->outStride[0]);
+ col2Plan->oDist = fftPlan->oDist;
+
+ for (size_t index = 1; index < fftPlan->length.size(); index++)
+ {
+ col2Plan->length.push_back(fftPlan->length[index]);
+ col2Plan->inStride.push_back(col2Plan->iDist);
+ col2Plan->outStride.push_back(fftPlan->outStride[index]);
+ col2Plan->iDist *= fftPlan->length[index];
+ }
}
OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
+ if ( (fftPlan->outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
+ (fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR) )
+ {
+ // copy plan to get back to hermitian
+ OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0]),
+ _T("CreateDefaultPlan RC copy failed"));
- // copy plan to get back to hermitian
- OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0] ),
- _T( "CreateDefaultPlan RC copy failed" ) );
+ FFTPlan* copyPlan = NULL;
+ lockRAII* copyLock = NULL;
+ OPENCL_V(fftRepo.getPlan(fftPlan->planRCcopy, copyPlan, copyLock), _T("fftRepo.getPlan failed"));
- FFTPlan* copyPlan = NULL;
- lockRAII* copyLock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
+ // This is second column fft, intermediate buffer is packed and interleaved
+ // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
- // This is second column fft, intermediate buffer is packed and interleaved
- // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+ // common part for both passes
+ copyPlan->placeness = CLFFT_OUTOFPLACE;
+ copyPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ copyPlan->outputLayout = fftPlan->outputLayout;
- // common part for both passes
- copyPlan->placeness = CLFFT_OUTOFPLACE;
- copyPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
- copyPlan->outputLayout = fftPlan->outputLayout;
+ copyPlan->precision = fftPlan->precision;
+ copyPlan->forwardScale = 1.0f;
+ copyPlan->backwardScale = 1.0f;
+ copyPlan->tmpBufSize = 0;
+ copyPlan->batchsize = fftPlan->batchsize;
- copyPlan->precision = fftPlan->precision;
- copyPlan->forwardScale = 1.0f;
- copyPlan->backwardScale = 1.0f;
- copyPlan->tmpBufSize = 0;
- copyPlan->batchsize = fftPlan->batchsize;
+ copyPlan->gen = Copy;
+ copyPlan->envelope = fftPlan->envelope;
- copyPlan->gen = Copy;
- copyPlan->envelope = fftPlan->envelope;
+ copyPlan->inStride[0] = 1;
+ copyPlan->iDist = fftPlan->length[0];
- copyPlan->inStride[0] = 1;
- copyPlan->iDist = fftPlan->length[0];
+ copyPlan->outStride[0] = fftPlan->outStride[0];
+ copyPlan->oDist = fftPlan->oDist;
- copyPlan->outStride[0] = fftPlan->outStride[0];
- copyPlan->oDist = fftPlan->oDist;
+ for (size_t index = 1; index < fftPlan->length.size(); index++)
+ {
+ copyPlan->length.push_back(fftPlan->length[index]);
+ copyPlan->inStride.push_back(copyPlan->inStride[index - 1] * fftPlan->length[index - 1]);
+ copyPlan->iDist *= fftPlan->length[index];
+ copyPlan->outStride.push_back(fftPlan->outStride[index]);
+ }
- for (size_t index=1; index < fftPlan->length.size(); index++)
- {
- copyPlan->length.push_back(fftPlan->length[index]);
- copyPlan->inStride.push_back(copyPlan->inStride[index-1] * fftPlan->length[index-1]);
- copyPlan->iDist *= fftPlan->length[index];
- copyPlan->outStride.push_back(fftPlan->outStride[index]);
+ OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL), _T("BakePlan large1d RC copy plan failed"));
}
- OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
-
}
else if(fftPlan->outputLayout == CLFFT_REAL)
{
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index d88aab4..bd8bc5d 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -225,20 +225,32 @@ clfftStatus clfftEnqueueTransform(
_T("clfftEnqueueTransform large1D col pass failed"));
- // another column FFT output, INPLACE
- OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &colOutEvents,
- &copyInEvents, &(fftPlan->intBufferRC), &(fftPlan->intBufferRC), localIntBuffer ),
- _T("clfftEnqueueTransform large1D second column failed"));
- clReleaseEvent(colOutEvents);
-
cl_mem *out_local;
out_local = (fftPlan->placeness==CLFFT_INPLACE) ? clInputBuffers : clOutputBuffers;
- // copy from full complex to hermitian
- OPENCL_V( clfftEnqueueTransform( fftPlan->planRCcopy, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &copyInEvents,
- outEvents, &(fftPlan->intBufferRC), out_local, localIntBuffer ),
- _T("clfftEnqueueTransform large1D RC copy failed"));
- clReleaseEvent(copyInEvents);
+ if ((fftPlan->outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
+ (fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR))
+ {
+ // another column FFT output, INPLACE
+ OPENCL_V(clfftEnqueueTransform(fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &colOutEvents,
+ &copyInEvents, &(fftPlan->intBufferRC), &(fftPlan->intBufferRC), localIntBuffer),
+ _T("clfftEnqueueTransform large1D second column failed"));
+ clReleaseEvent(colOutEvents);
+
+ // copy from full complex to hermitian
+ OPENCL_V(clfftEnqueueTransform(fftPlan->planRCcopy, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &copyInEvents,
+ outEvents, &(fftPlan->intBufferRC), out_local, localIntBuffer),
+ _T("clfftEnqueueTransform large1D RC copy failed"));
+ clReleaseEvent(copyInEvents);
+ }
+ else
+ {
+ // another column FFT output, OUTOFPLACE
+ OPENCL_V(clfftEnqueueTransform(fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &colOutEvents,
+ outEvents, &(fftPlan->intBufferRC), out_local, localIntBuffer),
+ _T("clfftEnqueueTransform large1D second column failed"));
+ clReleaseEvent(colOutEvents);
+ }
}
else if( fftPlan->outputLayout == CLFFT_REAL )
diff --git a/src/statTimer/statisticalTimer.GPU.cpp b/src/statTimer/statisticalTimer.GPU.cpp
index 9cce59b..c4c94e1 100644
--- a/src/statTimer/statisticalTimer.GPU.cpp
+++ b/src/statTimer/statisticalTimer.GPU.cpp
@@ -582,6 +582,9 @@ GpuStatTimer::Print( )
<< std::setw( tableThird ) << mean[ t ].batchSize << std::endl;
}
+ tout << std::setw(tableFourth) << _T("Input Dist:") << std::setw(tableThird) << mean[t].iDist << std::endl;
+ tout << std::setw(tableFourth) << _T("Output Dist:") << std::setw(tableThird) << mean[t].oDist << std::endl;
+
tout << std::setw( tableFourth ) << _T( "Input Stride:" );
catLengths.str( _T( "" ) );
diff --git a/src/statTimer/statisticalTimer.GPU.h b/src/statTimer/statisticalTimer.GPU.h
index d52e7dd..9dcb161 100644
--- a/src/statTimer/statisticalTimer.GPU.h
+++ b/src/statTimer/statisticalTimer.GPU.h
@@ -54,6 +54,8 @@ struct StatData
std::vector< size_t > lengths;
std::vector< size_t > inStride;
std::vector< size_t > outStride;
+ size_t iDist;
+ size_t oDist;
std::vector< size_t > enqueueWorkSize;
std::vector< cl_event > outEvents;
@@ -66,7 +68,7 @@ struct StatData
plHandle( id ), planX( plan->planX ), planY( plan->planY ), planZ( plan->planZ ),
planTX( plan->planTX ), planTY( plan->planTY ), planTZ( plan->planTZ ),
planRCcopy( plan->planRCcopy ), planCopy( plan->planCopy ),
- inStride( plan->inStride ), outStride( plan->outStride ),
+ inStride( plan->inStride ), outStride( plan->outStride ), iDist( plan->iDist ), oDist( plan->oDist ),
lengths( plan->length ), enqueueWorkSize( gWorkSize )
{
for( cl_uint e = 0; e < nEv; ++e )
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list