[clfft] 99/128: fixing some bugs in real2hermitian, debug continues

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:44 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit 0993a32c23c0f767948b731ce7f72b58c3c6bc5e
Author: bragadeesh <bragadeesh.natarajan at amd.com>
Date:   Wed Oct 7 23:57:51 2015 -0700

    fixing some bugs in real2hermitian, debug continues
---
 src/callback-client/CMakeLists.txt     |   7 +-
 src/client/CMakeLists.pack             | 182 ------------------------------
 src/library/generator.stockham.cpp     |   5 +-
 src/library/plan.cpp                   | 199 +++++++++++++++++++--------------
 src/library/transform.cpp              |  34 ++++--
 src/statTimer/statisticalTimer.GPU.cpp |   3 +
 src/statTimer/statisticalTimer.GPU.h   |   4 +-
 7 files changed, 145 insertions(+), 289 deletions(-)

diff --git a/src/callback-client/CMakeLists.txt b/src/callback-client/CMakeLists.txt
index dc63b8e..2bc8384 100644
--- a/src/callback-client/CMakeLists.txt
+++ b/src/callback-client/CMakeLists.txt
@@ -54,9 +54,4 @@ if( APPLE )
     set_target_properties( clFFT-callback-client PROPERTIES INSTALL_RPATH "@loader_path/../lib${SUFFIX_LIB}")
 endif()
 
-# CPack configuration; include the executable into the package
-install( TARGETS clFFT-callback-client
-        RUNTIME DESTINATION bin${SUFFIX_BIN}
-        LIBRARY DESTINATION lib${SUFFIX_LIB}
-        ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
-        )
+
diff --git a/src/client/CMakeLists.pack b/src/client/CMakeLists.pack
deleted file mode 100644
index 2fcf3ea..0000000
--- a/src/client/CMakeLists.pack
+++ /dev/null
@@ -1,182 +0,0 @@
-# ########################################################################
-# Copyright 2013 Advanced Micro Devices, Inc.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-# http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ########################################################################
-cmake_minimum_required( VERSION 2.6 )
-project( clFFT.Sample )
-
-# If AMDAPPSDKROOT is defined as an environment value, use that value and cache it so it's visible in the cmake-gui.  
-# Otherwise, create a sensible default that the user can change
-if( DEFINED ENV{AMDAPPSDKROOT} )
-	set( AMD_APP_SDK_ROOT $ENV{AMDAPPSDKROOT} CACHE PATH "Environment variable defining the root of the ATI Stream SDK" )
-else( )
-	set( AMD_APP_SDK_ROOT "/Path/To/ATI_Stream_SDK" CACHE PATH "Modify this variable to point to the root of the ATI Stream SDK installation" )
-endif( )
-
-# If BOOST_ROOT is defined as an environment value, use that value and cache it so it's visible in the cmake-gui.  
-# Otherwise, create a sensible default that the user can change
-if( DEFINED ENV{BOOST_ROOT} )
-	set( BOOST_ROOT $ENV{BOOST_ROOT} CACHE PATH "Environment variable defining the root of the Boost installation" )
-else( )
-	if( UNIX )
-		set( BOOST_ROOT "/usr" CACHE PATH "Modify this variable to point to the root of the Boost installation" )
-	else( )
-		set( BOOST_ROOT "/Path/To/boost_x_xx_x" CACHE PATH "Modify this variable to point to the root of the Boost installation" )
-	endif()
-endif( )
-
-# Currently, linux has a problem outputing both narrow and wide characters,
-# which happens in our client because openCL only supports narrow characters
-if( WIN32 )
-	option( UNICODE "Build with Unicode Support" ON )
-	if( UNICODE )
-		message( STATUS "UNICODE build" )
-	endif( )
-else()
-	set( UNICODE OFF )
-	message( STATUS "UNICODE feature disabled on linux" )
-endif()
-
-if( MSVC_IDE )
-	set( BUILD64 ${CMAKE_CL_64} )
-else()
-	option( BUILD64 "Build a 64-bit product" ON )
-	if( BUILD64 )
-		message( STATUS "64-bit build" )
-	endif( )
-
-	if( IS_DIRECTORY ${PROJECT_SOURCE_DIR}/library/test )
-		option( CODE_COVERAGE "Build makefiles with code coverage instrumentation" OFF )
-		if( CODE_COVERAGE )
-			message( STATUS "Code coverage instrumentation on" )
-		endif()
-	endif()
-endif()
-
-# For linux, modify the global find property to help us find libraries like Boost in the correct paths
-if( UNIX )
-	if( BUILD64 )
-		set_property( GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS TRUE )
-		message( STATUS "64bit build - FIND_LIBRARY_USE_LIB64_PATHS: ${FIND_LIBRARY_USE_LIB64_PATHS}" )
-	else()
-		set_property( GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS FALSE )
-		message( STATUS "32bit build - FIND_LIBRARY_USE_LIB64_PATHS: ${FIND_LIBRARY_USE_LIB64_PATHS}" )
-	endif()
-endif()
-
-# Find the absolute path to the opencl library that we need to link too; the path depends on being 64bit or 32bit
-if( BUILD64 )
-	find_library( OPENCL_LIBRARIES
-		NAMES OpenCL
-		HINTS
-			${AMD_APP_SDK_ROOT}/lib/
-			ENV AMD_APP_SDK_ROOT
-		PATH_SUFFIXES x86_64 x86
-	)
-else()
-	find_library( OPENCL_LIBRARIES
-		NAMES OpenCL
-		HINTS
-			${AMD_APP_SDK_ROOT}/lib/
-			ENV AMD_APP_SDK_ROOT
-		PATH_SUFFIXES x86
-	)
-endif()
-message( STATUS "OPENCL_LIBRARIES: ${OPENCL_LIBRARIES}" )
-
-set( Boost_USE_MULTITHREADED ON )
-set( Boost_USE_STATIC_LIBS   ON )
-set( Boost_DETAILED_FAILURE_MSG   ON )
-set( Boost_DEBUG ON )
-set( Boost_ADDITIONAL_VERSIONS "1.44.0" "1.44" )
-# On linux, the boost installed in the system always appears to override any user boost installs
-if( UNIX )
-	set( Boost_NO_SYSTEM_PATHS TRUE )
-endif( )
-find_package( Boost 1.33.0 COMPONENTS program_options )
-message(STATUS "Boost_PROGRAM_OPTIONS_LIBRARY: ${Boost_PROGRAM_OPTIONS_LIBRARY}")
-
-# FFLAGS depend on the compiler, grab the compiler name from the path
-get_filename_component( C_COMPILER_NAME ${CMAKE_C_COMPILER} NAME_WE )
-# message( "C_COMPILER_NAME: " ${C_COMPILER_NAME} )
-# message( "CMAKE_C_COMPILER: " ${CMAKE_C_COMPILER} )
-
-# Set common compile and link options
-if( C_COMPILER_NAME STREQUAL "cl" )
-	# Following options for nMake
-	message( STATUS "Detected MSVS Ver: " ${MSVC_VERSION} )
-	if( NOT MSVC_IDE )
-		message( STATUS "Using an nMake environment to build" )
-
-	endif( )
-
-elseif( C_COMPILER_NAME STREQUAL "gcc" )
-	message( STATUS "Detected GNU fortran compiler." )
-	# set( CMAKE_CXX_FLAGS "-std=c++0x ${CMAKE_CXX_FLAGS}" )
-	
-	if( BUILD64 )
-		set( CMAKE_CXX_FLAGS "-m64 ${CMAKE_CXX_FLAGS}" )
-		set( CMAKE_C_FLAGS "-m64 ${CMAKE_C_FLAGS}" )
-	else( )
-		set( CMAKE_CXX_FLAGS "-m32 ${CMAKE_CXX_FLAGS}" )
-		set( CMAKE_C_FLAGS "-m32 ${CMAKE_C_FLAGS}" )
-	endif( )
-else( )
-	message( FATAL_ERROR "Compiler name not detected" )
-endif( )
-
-# If UNICODE is defined, pass extra definitions into 
-if( UNICODE )
-	add_definitions( "/DUNICODE /D_UNICODE" )
-endif( )
-
-# Print out compiler flags for viewing/debug
-message( STATUS "CMAKE_CXX_COMPILER flags: " ${CMAKE_CXX_FLAGS} )
-message( STATUS "CMAKE_CXX_COMPILER debug flags: " ${CMAKE_CXX_FLAGS_DEBUG} )
-message( STATUS "CMAKE_CXX_COMPILER release flags: " ${CMAKE_CXX_FLAGS_RELEASE} )
-message( STATUS "CMAKE_CXX_COMPILER relwithdebinfo flags: " ${CMAKE_CXX_FLAGS_RELWITHDEBINFO} )
-message( STATUS "CMAKE_EXE_LINKER link flags: " ${CMAKE_EXE_LINKER_FLAGS} )
-
-include_directories( ${Boost_INCLUDE_DIRS} ${AMD_APP_SDK_ROOT}/include ${PROJECT_SOURCE_DIR}/../include )
-
-# Set the OpenCL library include path depending on target platform
-if( BUILD64 )
-    if( WIN32 )
-	    link_directories( ${AMD_APP_SDK_ROOT}/lib/x86_64/ ${PROJECT_SOURCE_DIR}/../lib64/import )
-    elseif( UNIX )
-	    link_directories( ${AMD_APP_SDK_ROOT}/lib/x86_64/ ${PROJECT_SOURCE_DIR}/../lib64 )
-    endif()
-else()
-    if( WIN32 )
-	    link_directories( ${AMD_APP_SDK_ROOT}/lib/x86/ ${PROJECT_SOURCE_DIR}/../lib32/import )
-    elseif( UNIX )
-	    link_directories( ${AMD_APP_SDK_ROOT}/lib/x86/ ${PROJECT_SOURCE_DIR}/../lib32 )
-    endif()
-endif()
-
-add_executable( Client 
-		# sources follow
-		client.cpp
-		openCL.misc.cpp
-		statisticalTimer.cpp
-		stdafx.cpp
-		client.h
-		openCL.misc.h
-		statisticalTimer.h
-		stdafx.h
-		targetver.h
-		unicode.compatibility.h
-		../include/clFFT.h )
-
-target_link_libraries( clFFT.Client clFFT ${Boost_LIBRARIES} ${OPENCL_LIBRARIES})
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index c3fa5f9..40f54eb 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -2746,10 +2746,7 @@ namespace StockhamGenerator
 			// Set half lds for real transforms
 			halfLds = r2c2r ? true : halfLds;
 
-			// Set half lds for radix7
-			halfLds = (length % 7 == 0) ? true : halfLds;
-
-			linearRegs = halfLds;
+			linearRegs = true;
 
 			realSpecial = params.fft_realSpecial;
 
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 62a3da4..a90f43d 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -605,7 +605,6 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 
 				clLengths[0] = fftPlan->length[0]/clLengths[1];
 
-
                 // Start of block where transposes are generated; 1D FFT
 				while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
 				{
@@ -1058,13 +1057,13 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					fftPlan->baked = true;
 					return	CLFFT_SUCCESS;
 				}
-				else if(fftPlan->inputLayout == CLFFT_REAL)
+				else if (fftPlan->inputLayout == CLFFT_REAL)
 				{
-					if (fftPlan->tmpBufSizeRC==0 )
+					if (fftPlan->tmpBufSizeRC == 0)
 					{
 						fftPlan->tmpBufSizeRC = length0 * length1 *
 							fftPlan->batchsize * fftPlan->ElementSize();
-						for (size_t index=1; index < fftPlan->length.size(); index++)
+						for (size_t index = 1; index < fftPlan->length.size(); index++)
 						{
 							fftPlan->tmpBufSizeRC *= fftPlan->length[index];
 						}
@@ -1072,12 +1071,12 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 
 					// column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
 					// transposed output
-					OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
-						_T( "CreateDefaultPlan Large1d column failed" ) );
+					OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1]),
+						_T("CreateDefaultPlan Large1d column failed"));
 
-					FFTPlan* colTPlan	= NULL;
-					lockRAII* colLock	= NULL;
-					OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
+					FFTPlan* colTPlan = NULL;
+					lockRAII* colLock = NULL;
+					OPENCL_V(fftRepo.getPlan(fftPlan->planX, colTPlan, colLock), _T("fftRepo.getPlan failed"));
 
 					// current plan is to create intermediate buffer, packed and interleave
 					// This is a column FFT, the first elements distance between each FFT is the distance of the first two
@@ -1085,39 +1084,39 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					// we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
 
 					//this part are common for both passes
-					colTPlan->placeness     = CLFFT_OUTOFPLACE;
-					colTPlan->precision     = fftPlan->precision;
-					colTPlan->forwardScale  = 1.0f;
+					colTPlan->placeness = CLFFT_OUTOFPLACE;
+					colTPlan->precision = fftPlan->precision;
+					colTPlan->forwardScale = 1.0f;
 					colTPlan->backwardScale = 1.0f;
-					colTPlan->tmpBufSize    = 0;
-					colTPlan->batchsize     = fftPlan->batchsize;
+					colTPlan->tmpBufSize = 0;
+					colTPlan->batchsize = fftPlan->batchsize;
 
-					colTPlan->gen			= fftPlan->gen;
-					colTPlan->envelope			= fftPlan->envelope;
+					colTPlan->gen = fftPlan->gen;
+					colTPlan->envelope = fftPlan->envelope;
 
 					//Pass large1D flag to confirm we need multiply twiddle factor
-					colTPlan->large1D       = fftPlan->length[0];
-					colTPlan->RCsimple		= true;
+					colTPlan->large1D = fftPlan->length[0];
+					colTPlan->RCsimple = true;
 
 					colTPlan->length.push_back(clLengths[0]);
 
 					// first Pass
-					colTPlan->inputLayout   = fftPlan->inputLayout;
-					colTPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
-					colTPlan->inStride[0]   = fftPlan->inStride[0] * clLengths[0];
-					colTPlan->outStride[0]  = 1;
-					colTPlan->iDist         = fftPlan->iDist;
-					colTPlan->oDist         = length0 * length1;//fftPlan->length[0];
+					colTPlan->inputLayout = fftPlan->inputLayout;
+					colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+					colTPlan->inStride[0] = fftPlan->inStride[0] * clLengths[0];
+					colTPlan->outStride[0] = 1;
+					colTPlan->iDist = fftPlan->iDist;
+					colTPlan->oDist = length0 * length1;//fftPlan->length[0];
 					colTPlan->inStride.push_back(fftPlan->inStride[0]);
 					colTPlan->outStride.push_back(length1);//clLengths[1]);
 
-					for (size_t index=1; index < fftPlan->length.size(); index++)
+					for (size_t index = 1; index < fftPlan->length.size(); index++)
 					{
 						colTPlan->length.push_back(fftPlan->length[index]);
 						colTPlan->inStride.push_back(fftPlan->inStride[index]);
 						// tmp buffer is tightly packed
 						colTPlan->outStride.push_back(colTPlan->oDist);
-						colTPlan->oDist        *= fftPlan->length[index];
+						colTPlan->oDist *= fftPlan->length[index];
 					}
 
 					//Set callback data if set on top level plan
@@ -1128,97 +1127,127 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						colTPlan->precallUserData = fftPlan->precallUserData;
 					}
 
-					OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
+					OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL), _T("BakePlan large1d first column plan failed"));
 
 					//another column FFT, size clLengths[0], batch clLengths[1], output without transpose
-					OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D,  &clLengths[0] ),
-						_T( "CreateDefaultPlan large1D row failed" ) );
+					OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0]),
+						_T("CreateDefaultPlan large1D row failed"));
 
-					FFTPlan* col2Plan	= NULL;
-					lockRAII* rowLock	= NULL;
-					OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
+					FFTPlan* col2Plan = NULL;
+					lockRAII* rowLock = NULL;
+					OPENCL_V(fftRepo.getPlan(fftPlan->planY, col2Plan, rowLock), _T("fftRepo.getPlan failed"));
 
 					// This is second column fft, intermediate buffer is packed and interleaved
 					// we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
 
-					// common part for both passes
-					col2Plan->placeness     = CLFFT_INPLACE;
-					col2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
-					col2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
-
-					col2Plan->precision     = fftPlan->precision;
-					col2Plan->forwardScale  = fftPlan->forwardScale;
+					col2Plan->precision = fftPlan->precision;
+					col2Plan->forwardScale = fftPlan->forwardScale;
 					col2Plan->backwardScale = fftPlan->backwardScale;
-					col2Plan->tmpBufSize    = 0;
-					col2Plan->batchsize     = fftPlan->batchsize;
+					col2Plan->tmpBufSize = 0;
+					col2Plan->batchsize = fftPlan->batchsize;
 
-					col2Plan->gen			= fftPlan->gen;
-					col2Plan->envelope			= fftPlan->envelope;
+					col2Plan->gen = fftPlan->gen;
+					col2Plan->envelope = fftPlan->envelope;
 
 					col2Plan->length.push_back(length1);
 
-					col2Plan->inStride[0]  = length1;
+					col2Plan->inStride[0] = length1;
 					col2Plan->inStride.push_back(1);
-					col2Plan->iDist        = length0 * length1;
+					col2Plan->iDist = length0 * length1;
 
-					col2Plan->outStride[0] = length1;
-					col2Plan->outStride.push_back(1);
-					col2Plan->oDist         = length0 * length1;
+					if (colTPlan->planX)
+					{
+						col2Plan->large1D = fftPlan->length[0];
+						col2Plan->twiddleFront = true;
+					}
 
-					for (size_t index=1; index < fftPlan->length.size(); index++)
+					if ((fftPlan->outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
+						(fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR))
 					{
-						col2Plan->length.push_back(fftPlan->length[index]);
-						col2Plan->inStride.push_back(col2Plan->iDist);
-						col2Plan->outStride.push_back(col2Plan->oDist);
-						col2Plan->iDist   *= fftPlan->length[index];
-						col2Plan->oDist   *= fftPlan->length[index];
+						col2Plan->placeness = CLFFT_INPLACE;
+						col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+						col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+
+						col2Plan->outStride[0] = length1;
+						col2Plan->outStride.push_back(1);
+						col2Plan->oDist = length0 * length1;
+
+						for (size_t index = 1; index < fftPlan->length.size(); index++)
+						{
+							col2Plan->length.push_back(fftPlan->length[index]);
+							col2Plan->inStride.push_back(col2Plan->iDist);
+							col2Plan->outStride.push_back(col2Plan->oDist);
+							col2Plan->iDist *= fftPlan->length[index];
+							col2Plan->oDist *= fftPlan->length[index];
+						}
+					}
+					else
+					{
+						col2Plan->placeness = CLFFT_OUTOFPLACE;
+						col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+						col2Plan->outputLayout = fftPlan->outputLayout;
+
+						col2Plan->outStride[0] = length1*fftPlan->outStride[0];
+						col2Plan->outStride.push_back(fftPlan->outStride[0]);
+						col2Plan->oDist = fftPlan->oDist;
+
+						for (size_t index = 1; index < fftPlan->length.size(); index++)
+						{
+							col2Plan->length.push_back(fftPlan->length[index]);
+							col2Plan->inStride.push_back(col2Plan->iDist);
+							col2Plan->outStride.push_back(fftPlan->outStride[index]);
+							col2Plan->iDist *= fftPlan->length[index];
+						}
 					}
 
 					OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
 
+					if ( (fftPlan->outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
+						 (fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR) )
+					{
+						// copy plan to get back to hermitian
+						OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0]),
+							_T("CreateDefaultPlan RC copy failed"));
 
-					// copy plan to get back to hermitian
-					OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D,  &fftPlan->length[0] ),
-						_T( "CreateDefaultPlan RC copy failed" ) );
+						FFTPlan* copyPlan = NULL;
+						lockRAII* copyLock = NULL;
+						OPENCL_V(fftRepo.getPlan(fftPlan->planRCcopy, copyPlan, copyLock), _T("fftRepo.getPlan failed"));
 
-					FFTPlan* copyPlan	= NULL;
-					lockRAII* copyLock	= NULL;
-					OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
+						// This is second column fft, intermediate buffer is packed and interleaved
+						// we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
 
-					// This is second column fft, intermediate buffer is packed and interleaved
-					// we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+						// common part for both passes
+						copyPlan->placeness = CLFFT_OUTOFPLACE;
+						copyPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
+						copyPlan->outputLayout = fftPlan->outputLayout;
 
-					// common part for both passes
-					copyPlan->placeness     = CLFFT_OUTOFPLACE;
-					copyPlan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
-					copyPlan->outputLayout  = fftPlan->outputLayout;
+						copyPlan->precision = fftPlan->precision;
+						copyPlan->forwardScale = 1.0f;
+						copyPlan->backwardScale = 1.0f;
+						copyPlan->tmpBufSize = 0;
+						copyPlan->batchsize = fftPlan->batchsize;
 
-					copyPlan->precision     = fftPlan->precision;
-					copyPlan->forwardScale  = 1.0f;
-					copyPlan->backwardScale = 1.0f;
-					copyPlan->tmpBufSize    = 0;
-					copyPlan->batchsize     = fftPlan->batchsize;
+						copyPlan->gen = Copy;
+						copyPlan->envelope = fftPlan->envelope;
 
-					copyPlan->gen			= Copy;
-					copyPlan->envelope		= fftPlan->envelope;
 
+						copyPlan->inStride[0] = 1;
+						copyPlan->iDist = fftPlan->length[0];
 
-					copyPlan->inStride[0]  = 1;
-					copyPlan->iDist        = fftPlan->length[0];
+						copyPlan->outStride[0] = fftPlan->outStride[0];
+						copyPlan->oDist = fftPlan->oDist;
 
-					copyPlan->outStride[0] = fftPlan->outStride[0];
-					copyPlan->oDist         = fftPlan->oDist;
+						for (size_t index = 1; index < fftPlan->length.size(); index++)
+						{
+							copyPlan->length.push_back(fftPlan->length[index]);
+							copyPlan->inStride.push_back(copyPlan->inStride[index - 1] * fftPlan->length[index - 1]);
+							copyPlan->iDist *= fftPlan->length[index];
+							copyPlan->outStride.push_back(fftPlan->outStride[index]);
+						}
 
-					for (size_t index=1; index < fftPlan->length.size(); index++)
-					{
-						copyPlan->length.push_back(fftPlan->length[index]);
-						copyPlan->inStride.push_back(copyPlan->inStride[index-1] * fftPlan->length[index-1]);
-						copyPlan->iDist   *= fftPlan->length[index];
-						copyPlan->outStride.push_back(fftPlan->outStride[index]);
+						OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL), _T("BakePlan large1d RC copy plan failed"));
 					}
 
-					OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
-
 				}
 				else if(fftPlan->outputLayout == CLFFT_REAL)
 				{
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index d88aab4..bd8bc5d 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -225,20 +225,32 @@ clfftStatus clfftEnqueueTransform(
 					_T("clfftEnqueueTransform large1D col pass failed"));
 
 
-				// another column FFT output, INPLACE
-				OPENCL_V( clfftEnqueueTransform( fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &colOutEvents,
-					&copyInEvents, &(fftPlan->intBufferRC), &(fftPlan->intBufferRC), localIntBuffer ),
-					_T("clfftEnqueueTransform large1D second column failed"));
-				clReleaseEvent(colOutEvents);
-
 				cl_mem *out_local;
 				out_local = (fftPlan->placeness==CLFFT_INPLACE) ? clInputBuffers : clOutputBuffers;
 
-				// copy from full complex to hermitian
-				OPENCL_V( clfftEnqueueTransform( fftPlan->planRCcopy, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &copyInEvents,
-					outEvents, &(fftPlan->intBufferRC), out_local, localIntBuffer ),
-					_T("clfftEnqueueTransform large1D RC copy failed"));
-				clReleaseEvent(copyInEvents);
+				if ((fftPlan->outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
+					(fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR))
+				{
+					// another column FFT output, INPLACE
+					OPENCL_V(clfftEnqueueTransform(fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &colOutEvents,
+						&copyInEvents, &(fftPlan->intBufferRC), &(fftPlan->intBufferRC), localIntBuffer),
+						_T("clfftEnqueueTransform large1D second column failed"));
+					clReleaseEvent(colOutEvents);
+
+					// copy from full complex to hermitian
+					OPENCL_V(clfftEnqueueTransform(fftPlan->planRCcopy, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &copyInEvents,
+						outEvents, &(fftPlan->intBufferRC), out_local, localIntBuffer),
+						_T("clfftEnqueueTransform large1D RC copy failed"));
+					clReleaseEvent(copyInEvents);
+				}
+				else
+				{
+					// another column FFT output, OUTOFPLACE
+					OPENCL_V(clfftEnqueueTransform(fftPlan->planY, CLFFT_FORWARD, numQueuesAndEvents, commQueues, 1, &colOutEvents,
+						outEvents, &(fftPlan->intBufferRC), out_local, localIntBuffer),
+						_T("clfftEnqueueTransform large1D second column failed"));
+					clReleaseEvent(colOutEvents);
+				}
 
 			}
 			else if( fftPlan->outputLayout == CLFFT_REAL )
diff --git a/src/statTimer/statisticalTimer.GPU.cpp b/src/statTimer/statisticalTimer.GPU.cpp
index 9cce59b..c4c94e1 100644
--- a/src/statTimer/statisticalTimer.GPU.cpp
+++ b/src/statTimer/statisticalTimer.GPU.cpp
@@ -582,6 +582,9 @@ GpuStatTimer::Print( )
 					<< std::setw( tableThird )  << mean[ t ].batchSize << std::endl;
 			}
 
+			tout << std::setw(tableFourth) << _T("Input Dist:") << std::setw(tableThird) << mean[t].iDist << std::endl;
+			tout << std::setw(tableFourth) << _T("Output Dist:") << std::setw(tableThird) << mean[t].oDist << std::endl;
+
 			tout << std::setw( tableFourth ) << _T( "Input Stride:" );
 
 			catLengths.str( _T( "" ) );
diff --git a/src/statTimer/statisticalTimer.GPU.h b/src/statTimer/statisticalTimer.GPU.h
index d52e7dd..9dcb161 100644
--- a/src/statTimer/statisticalTimer.GPU.h
+++ b/src/statTimer/statisticalTimer.GPU.h
@@ -54,6 +54,8 @@ struct StatData
 	std::vector< size_t > lengths;
 	std::vector< size_t > inStride;
 	std::vector< size_t > outStride;
+	size_t iDist;
+	size_t oDist;
 	std::vector< size_t > enqueueWorkSize;
 	std::vector< cl_event > outEvents;
 
@@ -66,7 +68,7 @@ struct StatData
 		plHandle( id ), planX( plan->planX ), planY( plan->planY ), planZ( plan->planZ ),
 		planTX( plan->planTX ), planTY( plan->planTY ), planTZ( plan->planTZ ),
 		planRCcopy( plan->planRCcopy ), planCopy( plan->planCopy ),
-		inStride( plan->inStride ), outStride( plan->outStride ),
+		inStride( plan->inStride ), outStride( plan->outStride ), iDist( plan->iDist ), oDist( plan->oDist ),
 		lengths( plan->length ), enqueueWorkSize( gWorkSize )
 	{
 		for( cl_uint e = 0; e < nEv; ++e )

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list