[clfft] 27/128: adding cuda client

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:35 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit e232087da2670e5e5b8e97e2693efb92748d54ff
Author: bragadeesh <bragadeesh.natarajan at amd>
Date:   Thu Aug 20 10:35:54 2015 -0500

    adding cuda client
---
 src/cuFFT-client/CMakeLists.txt   |   41 +
 src/cuFFT-client/cuFFT-client.cpp | 2748 +++++++++++++++++++++++++++++++++++++
 2 files changed, 2789 insertions(+)

diff --git a/src/cuFFT-client/CMakeLists.txt b/src/cuFFT-client/CMakeLists.txt
new file mode 100644
index 0000000..d0cfb54
--- /dev/null
+++ b/src/cuFFT-client/CMakeLists.txt
@@ -0,0 +1,41 @@
+# ########################################################################
+# Copyright 2015 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+cmake_minimum_required(VERSION 2.8)
+find_package(CUDA REQUIRED)
+
+#  client
+set( cuFFT-client.Source	cuFFT-client.cpp )
+
+set( cuFFT-client.Files ${cuFFT-client.Source} )
+
+# Pass options to NVCC
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_20,code=sm_20; -gencode arch=compute_30,code=sm_30; -gencode arch=compute_35,code=sm_35; -gencode arch=compute_37,code=sm_37; -gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52)
+
+include_directories( ./ ../common/inc/ )
+
+CUDA_ADD_EXECUTABLE( cuFFT-client ${cuFFT-client.Files} )
+
+CUDA_ADD_CUFFT_TO_TARGET( cuFFT-client )
+
+target_link_libraries( cuFFT-client ${CUDA_LIBRARIES})
+
+# Set output directory to bin
+if( MSVC )
+	set(CUDA_GENERATED_OUTPUT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/bin/${BITNESS})
+else()
+	set(CUDA_GENERATED_OUTPUT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/bin/${BITNESS}/${CMAKE_BUILD_TYPE})
+endif()
diff --git a/src/cuFFT-client/cuFFT-client.cpp b/src/cuFFT-client/cuFFT-client.cpp
new file mode 100644
index 0000000..edc559b
--- /dev/null
+++ b/src/cuFFT-client/cuFFT-client.cpp
@@ -0,0 +1,2748 @@
+// ########################################################################
+// Copyright 2015 Advanced Micro Devices, Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// ########################################################################
+
+/* Example showing the use of CUFFT  */
+
+// includes, system
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+// includes, project
+#include <cuda_runtime.h>
+#include <cufft.h>
+#include <helper_functions.h>
+#include <helper_cuda.h>
+#include "statisticalTimer.extern.h"
+#include "client.h"
+
+#ifdef __linux
+#define strcmpi strcasecmp
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+void runTest(int argc, char **argv);
+
+template<typename T>
+bool runC2CFFT(int rank, size_t lengthX, size_t lengthY, size_t lengthZ, int direction,
+			   cufftType type, int profile_count, int batch_size, bool outPlace,
+			   int iDist, int oDist, int iStride, int oStride);
+
+template<typename In, typename Out>
+bool runR2CFFT(int rank, size_t lengthX, size_t lengthY, size_t lengthZ,
+			   cufftType type, int profile_count, int batch_size, bool outPlace,
+			   int iDist, int oDist, int iStride, int oStride);
+
+template<typename In, typename Out>
+bool runC2RFFT(int rank, size_t lengthX, size_t lengthY, size_t lengthZ,
+			   cufftType type, int profile_count, int batch_size, bool outPlace,
+			   int iDist, int oDist, int iStride, int oStride);
+
+template<typename T>
+bool runFFTTransformC2CAdv(size_t fftLength, cufftType type, int rank, int* n, size_t inmem_size, size_t outmem_size, int* inembed, int* outembed,    
+						  int direction, int profile_count, int batch_size, bool outPlace, int istride, int ostride, int idist, int odist);
+
+template<typename In, typename Out>
+bool runFFTTransformR2CAdv(size_t fftLength, cufftType type, size_t outLength, int rank, int* n, size_t inmem_size, size_t outmem_size, int* inembed, int* outembed,    
+						  int profile_count, int batch_size, bool outPlace, int istride, int ostride, int idist, int odist);
+
+template<typename In, typename Out>
+bool runFFTTransformC2RAdv(size_t fftLength, cufftType type, size_t outLength, size_t innerDimLength, int rank, int* n, size_t inmem_size, size_t outmem_size,    
+						  int* inembed, int* outembed, int profile_count, int batch_size, bool outPlace, int istride, int ostride, int idist, int odist);
+
+template<typename T>
+bool run1DFFTTransformC2C(size_t lengthX, int direction, cufftType type, int profile_count, int batch_size, bool outPlace);
+
+template<typename In, typename Out>
+bool run1DFFTTransformR2C(size_t lengthX, cufftType type, int profile_count, int batch_size, bool outPlace);
+
+template<typename In, typename Out>
+bool run1DFFTTransformC2R(size_t lengthX, cufftType type, int profile_count, int batch_size, bool outPlace);
+
+template<typename T>
+bool run2DFFTTransformC2C(size_t lengthX, size_t lengthY, int direction, cufftType type, int profile_count, int batch_size, bool outPlace);
+
+template<typename In, typename Out>
+bool run2DFFTTransformR2C(size_t lengthX, size_t lengthY, cufftType type, int profile_count, int batch_size, bool outPlace);
+
+template<typename In, typename Out>
+bool run2DFFTTransformC2R(size_t lengthX, size_t lengthY, cufftType type, int profile_count, int batch_size, bool outPlace);
+
+template<typename T>
+bool run3DFFTTransformC2C(size_t lengthX, size_t lengthY, size_t lengthZ, int direction, cufftType type, int profile_count, int batch_size, bool outPlace);
+
+template<typename In, typename Out>
+bool run3DFFTTransformR2C(size_t lengthX, size_t lengthY, size_t lengthZ, cufftType type, int profile_count, int batch_size, bool outPlace);
+
+template<typename In, typename Out>
+bool run3DFFTTransformC2R(size_t lengthX, size_t lengthY, size_t lengthZ, cufftType type, int profile_count, int batch_size, bool outPlace);
+
+template<typename In, typename Out>
+void cleanup(In *h_in_signal, In *d_in_signal, Out *h_output_signal, Out *d_output_signal, bool outplace);
+
+#define SIGNAL_SIZE        1024
+
+// Global variables
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv)
+{
+    runTest(argc, argv);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run a simple test for CUDA FFT
+////////////////////////////////////////////////////////////////////////////////
+void runTest(int argc, char **argv)
+{
+	bool isInverse = false;
+	cufftType fftType = CUFFT_C2C;
+	int intType = 1;
+	size_t lengthX = SIGNAL_SIZE;
+	size_t lengthY = 1;
+	size_t lengthZ = 1;
+	int profile_count = 5;
+	int batch_size = 1;
+	bool outPlace = false;
+	int iStride = 1;
+	int oStride = 1;
+	int iDist = 0;
+	int oDist = 0;
+	int rank = 1;
+	bool bTestResult = false;
+	bool isDouble = false;
+
+	//Parse command-line options
+	while (argv[1] && argv[1][0] == '-') {
+    if (strcmpi(argv[1], "-inv") == 0) {
+      argv++; argc--;
+      isInverse = atoi(argv[1]) ? true : false;
+    } else if (strcmpi(argv[1], "-type") == 0) {
+      argv++; argc--;
+	  intType = atoi(argv[1]);
+	} 
+	else if (strcmpi(argv[1], "-b") == 0) {
+      argv++; argc--;
+	  batch_size = atoi(argv[1]);
+	} else if (strcmpi(argv[1], "-d") == 0) {
+      argv++; argc--;
+	  isDouble = atoi(argv[1]) ? true : false;
+	} else if (strcmpi(argv[1], "-x") == 0) {
+      argv++; argc--;
+	  lengthX = atoi(argv[1]);
+	} else if (strcmpi(argv[1], "-y") == 0) {
+      argv++; argc--;
+	  lengthY = atoi(argv[1]);
+	} else if (strcmpi(argv[1], "-z") == 0) {
+      argv++; argc--;
+	  lengthZ = atoi(argv[1]);
+	} else if (strcmpi(argv[1], "-p") == 0) {
+      argv++; argc--;
+	  profile_count = atoi(argv[1]);
+	} else if (strcmpi(argv[1], "-o") == 0) {
+      argv++; argc--;
+	  outPlace = atoi(argv[1]) ? true : false;
+	} else if (strcmpi(argv[1], "-istr") == 0) {
+      argv++; argc--;
+	  iStride = atoi(argv[1]);
+	} else if (strcmpi(argv[1], "-ostr") == 0) {
+      argv++; argc--;
+	  oStride = atoi(argv[1]);
+	} else if (strcmpi(argv[1], "-idist") == 0) {
+      argv++; argc--;
+	  iDist = atoi(argv[1]);
+	} else if (strcmpi(argv[1], "-odist") == 0) {
+      argv++; argc--;
+	  oDist = atoi(argv[1]);
+	} else {
+	  if (strcmpi(argv[1], "-h") != 0)
+		fprintf(stderr, "Illegal option %s ignored\n", argv[1]);
+
+		printf("Usage:\n %s\n[-inv <0|1 Inverse transform (default: forward 1)>]\n"
+			"[-type <Type of FFT Transform:\n	1) C2C (default)\n	2) R2C\n	3) C2R>]\n"
+			  "[-b <batch size>]\n"
+			 "[-d <0|1 Use Double Precision (default: 0 i.e. Single Precision)]\n"
+			 "[-x <Specify the length of the 1st dimension of a test array>]\n"
+			 "[-y <Specify the length of the 2nd dimension of a test array>]\n"
+			 "[-z <Specify the length of the 3rd dimension of a test array>]\n"
+			 "[-p <Specify number of iterations for profiling measurements. default 5>]\n"
+			 "[-o <0|1 Out of place FFT transform (default: 0 i.e. in place)]\n"
+			 "[-istr <Specify the input stride of the innermost dimension of a test array]\n"
+			 "[-ostr <Specify the output stride of the innermost dimension of a test array]\n"
+			 "[-idist <Specify the distance between first element of two consecutive batches of input ]\n"
+			 "[-odist <Specify the distance between first element of two consecutive batches of output ]\n"
+			 "[-h (usage)]\n", argv[0]);
+		  exit(1);
+		}
+		argv++;
+		argc--;
+	  }
+
+	if (lengthX < 1 || lengthY < 1 || lengthZ < 1)
+	{
+		printf("Length of test array must be >= 1 in each dimension. Exiting..\n"); 
+		exit(EXIT_FAILURE);
+	}
+
+	switch (intType)
+	{
+	case 1: 
+		
+		fftType = isDouble ? CUFFT_Z2Z : CUFFT_C2C; 
+		break;
+	case 2: 
+		fftType = isDouble ? CUFFT_D2Z : CUFFT_R2C; 
+		break;
+	case 3: 
+		fftType = isDouble ? CUFFT_Z2D : CUFFT_C2R; 
+		break;
+	default: printf("Input Arguments ERROR!! Invalid FFT type. Use -h to check the correct options. Exiting..\n"); exit(EXIT_FAILURE);
+	}
+
+    printf("[cuFFT-ClientApp] is starting...\n");
+
+    findCudaDevice(argc, (const char **)argv);
+
+	//Find the rank
+	if (lengthY == 1 && lengthZ == 1)
+	{
+		rank = 1;
+	}
+	else if (lengthY > 1 && lengthZ == 1)
+	{
+		rank = 2;
+	}
+	else if (lengthY > 1 && lengthZ > 1)
+	{
+		rank = 3;
+	}
+
+	int direction = isInverse ? CUFFT_INVERSE : CUFFT_FORWARD;
+
+	//Switch based on the type of transform
+	switch (fftType)
+	{
+		case CUFFT_C2C: 
+		{
+			//Single C2C FFT
+			bTestResult = runC2CFFT<cufftComplex>(rank, lengthX, lengthY, lengthZ, direction, fftType , profile_count, batch_size, outPlace, iDist, oDist, iStride, oStride);
+			
+			break;
+		}	
+		case CUFFT_Z2Z: 
+		{
+			//Double C2C FFT
+			bTestResult = runC2CFFT<cufftDoubleComplex>(rank, lengthX, lengthY, lengthZ, direction, fftType , profile_count, batch_size, outPlace, iDist, oDist, iStride, oStride);
+			
+			break;
+		}	
+		case CUFFT_R2C: 
+		{
+			//R2C FFT
+			bTestResult = runR2CFFT<cufftReal, cufftComplex>(rank, lengthX, lengthY, lengthZ, fftType, profile_count, batch_size, outPlace, iDist, oDist, iStride, oStride);
+			break;
+		}
+		case CUFFT_D2Z: 
+		{
+			//Double R2C FFT
+			bTestResult = runR2CFFT<cufftDoubleReal, cufftDoubleComplex>(rank, lengthX, lengthY, lengthZ, fftType, profile_count, batch_size, outPlace, iDist, oDist, iStride, oStride);
+			break;
+		}
+		case CUFFT_C2R: 
+		{
+			//C2R FFT
+			bTestResult = runC2RFFT<cufftComplex, cufftReal>(rank, lengthX, lengthY, lengthZ, fftType, profile_count, batch_size, outPlace, iDist, oDist, iStride, oStride);
+			break;
+		}
+		case CUFFT_Z2D: 
+		{
+			//C2R FFT
+			bTestResult = runC2RFFT<cufftDoubleComplex, cufftDoubleReal>(rank, lengthX, lengthY, lengthZ, fftType, profile_count, batch_size, outPlace, iDist, oDist, iStride, oStride);
+			break;
+		}
+		default: printf("Invalid FFT type. Exiting..\n");
+			break;
+	}
+
+	if (bTestResult)
+	{
+		printf("FFT Transformation PASSED!\n");
+	}
+	else
+	{
+		printf("FFT Transformation FAILED!!\n");
+	}
+    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+/*
+* Run C2R FFT for 1D/2D/3D
+*/
+template<typename In, typename Out>
+bool runC2RFFT(int rank, size_t lengthX, size_t lengthY, size_t lengthZ,
+			   cufftType type, int profile_count, int batch_size, bool outPlace,
+			   int iDist, int oDist, int iStride, int oStride)
+{
+	bool bTestResult = false;
+
+	if (!outPlace && iStride < oStride)
+	{
+		printf("Input Arguments ERROR!! For in-place transform, output stride must be less than or equal to input stride. Exiting..\n\n");
+
+		exit(EXIT_FAILURE);
+	}
+
+	if (type == CUFFT_Z2D)
+		printf("Transforming signal Double Complex to Double Real \n");
+	else
+		printf("Transforming signal Complex to Real \n");
+
+	switch (rank)
+	{
+		case 1:
+		{
+			if (iStride == 1 && oStride == 1 && iDist == 0 && oDist == 0)
+			{
+				//Basic Data layout
+				printf("Dimension : 1\n");
+				printf("Data Layout : Basic\n");
+
+				bTestResult = run1DFFTTransformC2R<In, Out>(lengthX, type, profile_count, batch_size, outPlace); 
+			}
+			else
+			{
+				//Advanced Data layout
+
+				size_t fftLength = lengthX;
+				size_t innerDimLength = outPlace ? lengthX  : (lengthX + 2) ;
+				size_t inLength = (lengthX/2 + 1)*iStride;
+				size_t outLength = innerDimLength * oStride;
+
+				if (iDist > 0 && iDist < inLength)
+				{
+					printf("Input Arguments ERROR!! Value of idist cannot be less than product of lengths along each dimension and stride, ((x/2+1)*istr). Exiting..\n\n");
+					exit(EXIT_FAILURE);
+				}
+
+				if (oDist > 0 && (oDist < outLength))
+				{
+					printf("Input Arguments ERROR!! Value of odist cannot be less than product of lengths along each dimension and stride, ((x+2)*ostr). Exiting..\n\n");
+					exit(EXIT_FAILURE);
+				}
+								
+				iDist = iDist? iDist : (int)inLength;
+				oDist = oDist? oDist : (int)outLength;
+				
+				int n[1] = {(int)lengthX};
+
+				int inembed[1] = {(int)(lengthX/2+1)};
+				int outembed[1] = {(int)innerDimLength};
+
+				size_t inmem_size = sizeof(In) * iDist * batch_size;
+				size_t outmem_size = sizeof(Out) * oDist * batch_size;
+
+				printf("Dimension : 1\n");
+				printf("Data Layout : Advanced\n");
+
+				bTestResult = runFFTTransformC2RAdv<In, Out>(fftLength, type, outLength, innerDimLength, rank, n, inmem_size, outmem_size, 
+													inembed, outembed, profile_count, batch_size, outPlace, iStride, oStride,
+													iDist, oDist);
+			}
+			break;
+		}
+		case 2:
+		{
+			if (iStride == 1 && oStride == 1 && iDist == 0 && oDist == 0)
+			{
+				//Basic Data layout
+				printf("Dimension : 2\n");
+				printf("Data Layout : Basic\n");
+
+				bTestResult = run2DFFTTransformC2R<In, Out>(lengthX, lengthY, type, profile_count, batch_size, outPlace); 
+			}
+			else
+			{
+				//Advanced Data layout
+
+				size_t fftLength = lengthX * lengthY;
+				size_t innerDimLength = outPlace ? lengthY  : (lengthY + 2) ;
+				size_t inLength = lengthX * (lengthY/2 + 1)*iStride;
+				size_t outLength = lengthX * innerDimLength * oStride;
+
+				if (iDist > 0 && iDist < inLength)
+				{
+					printf("Input Arguments ERROR!! Value of idist cannot be less than product of lengths along each dimension and stride, (x*(y/2+1)*istr). Exiting..\n\n");
+					exit(EXIT_FAILURE);
+				}
+
+				if (oDist > 0 && (oDist < outLength))
+				{
+					printf("Input Arguments ERROR!! Value of odist cannot be less than product of lengths along each dimension and stride, (x*(y+2)*ostr). Exiting..\n\n");
+					exit(EXIT_FAILURE);
+				}
+								
+				iDist = iDist? iDist : (int)inLength;
+				oDist = oDist? oDist : (int)outLength;
+				
+				int n[2] = {(int)lengthX, (int)lengthY};
+
+				int inembed[2] = {(int)lengthX, (int)(lengthY/2+1)};
+				int outembed[2] = {(int)lengthX, (int)innerDimLength};
+
+				size_t inmem_size = sizeof(In) * iDist * batch_size;
+				size_t outmem_size = sizeof(Out) * oDist * batch_size;
+
+				printf("Dimension : 2\n");
+				printf("Data Layout : Advanced\n");
+
+				bTestResult = runFFTTransformC2RAdv<In, Out>(fftLength, type, outLength, innerDimLength, rank, n, inmem_size, outmem_size, 
+													inembed, outembed, profile_count, batch_size, outPlace, iStride, oStride,
+													iDist, oDist);
+			}
+			break;
+		}
+		case 3:
+		{
+			if (iStride == 1 && oStride == 1 && iDist == 0 && oDist == 0)
+			{
+				//Basic Data layout
+				printf("Dimension : 3\n");
+				printf("Data Layout : Basic\n");
+
+				bTestResult = run3DFFTTransformC2R<In, Out>(lengthX, lengthY, lengthZ, type, profile_count, batch_size, outPlace); 
+			}
+			else
+			{
+				//Advanced Data layout
+
+				size_t fftLength = lengthX * lengthY * lengthZ;
+				size_t innerDimLength = outPlace ? lengthZ  : (lengthZ + 2) ;
+				size_t inLength = lengthX * lengthY * (lengthZ/2 + 1)*iStride;
+				size_t outLength = lengthX * lengthY * innerDimLength * oStride;
+
+				if (iDist > 0 && iDist < inLength)
+				{
+					printf("Input Arguments ERROR!! Value of idist cannot be less than product of lengths along each dimension and stride, (x*y*(z/2+1)*istr). Exiting..\n\n");
+					exit(EXIT_FAILURE);
+				}
+
+				if (oDist > 0 && (oDist < outLength))
+				{
+					printf("Input Arguments ERROR!! Value of odist cannot be less than product of lengths along each dimension and stride, (x*y*(z+2)*ostr). Exiting..\n\n");
+					exit(EXIT_FAILURE);
+				}
+								
+				iDist = iDist? iDist : (int)inLength;
+				oDist = oDist? oDist : (int)outLength;
+				
+				int n[3] = {(int)lengthX, (int)lengthY, (int)lengthZ};
+
+				int inembed[3] = {(int)lengthX, (int)lengthY, (int)(lengthZ/2+1)};
+				int outembed[3] = {(int)lengthX, (int)lengthY, (int)innerDimLength};
+
+				size_t inmem_size = sizeof(In) * iDist * batch_size;
+				size_t outmem_size = sizeof(Out) * oDist * batch_size;
+
+				printf("Dimension : 3\n");
+				printf("Data Layout : Advanced\n");
+
+				bTestResult = runFFTTransformC2RAdv<In, Out>(fftLength, type, outLength, innerDimLength, rank, n, inmem_size, outmem_size, 
+													inembed, outembed, profile_count, batch_size, outPlace, iStride, oStride,
+													iDist, oDist);
+			}
+			break;
+		}
+	}
+
+	return bTestResult;
+}
+
+/*
+* Run R2C FFT for 1D/2D/3D
+*/
+template<typename In, typename Out>
+bool runR2CFFT(int rank, size_t lengthX, size_t lengthY, size_t lengthZ,
+			   cufftType type, int profile_count, int batch_size, bool outPlace,
+			   int iDist, int oDist, int iStride, int oStride)
+{
+	bool bTestResult = false;
+
+	if (!outPlace && (iStride < oStride || iDist < oDist))
+	{
+		printf("Input Arguments ERROR!! For in-place transform, output stride and distance must be less than or equal to input stride and distance respectively. Exiting..\n\n");
+
+		exit(EXIT_FAILURE);
+	}
+
+	if (type == CUFFT_D2Z)
+		printf("Transforming signal Double Real to Double Complex \n");
+	else
+		printf("Transforming signal Real to Complex \n");
+	
+	switch (rank)
+	{
+		case 1:
+		{
+			//1D Transform
+			if (iDist > 0 && iDist < ((lengthX+2)*iStride))
+			{
+				printf("Input Arguments ERROR!! Value of idist cannot be less than product of lengths along each dimension and stride, ((x+2)*istr). Exiting..\n\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (oDist > 0 && (oDist < ((lengthX/2 + 1)*oStride)))
+			{
+				printf("Input Arguments ERROR!! Value of odist cannot be less than product of lengths along each dimension and stride, ((x/2+1)*ostr). Exiting..\n\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (iStride == 1 && oStride == 1 && iDist == 0 && oDist == 0)
+			{
+				//Basic Data layout
+				printf("Dimension : 1\n");
+				printf("Data Layout : Basic\n");
+
+				bTestResult = run1DFFTTransformR2C<In, Out>(lengthX, type, profile_count, batch_size, outPlace);
+			}
+			else
+			{
+				//Advanced Data layout
+				iDist = iDist? iDist : (int)((lengthX+2)*iStride);
+				oDist = oDist? oDist : (int)((lengthX/2 + 1)*oStride);
+
+				if (!outPlace && oDist > (iDist/2))
+				{
+					printf("Input Arguments ERROR!! For in-place Real to Complex transform, output array distance must not be greater than half of input array distance. Exiting..\n\n");
+						
+					exit(EXIT_FAILURE);
+				}
+
+				int n[1] = {(int)lengthX};
+				size_t fftLength = lengthX;
+				size_t outLength = (lengthX/2 + 1)*oStride;
+
+				int inembed[1] = {(int)(lengthX + 2)};
+				int outembed[1] = {(int)(lengthX/2 + 1)};
+
+				size_t inmem_size = sizeof(In) * iDist * batch_size;
+				size_t outmem_size = sizeof(Out) * oDist * batch_size;
+						
+				printf("Dimension : 1\n");
+				printf("Data Layout : Advanced\n");
+	
+				bTestResult = runFFTTransformR2CAdv<In, Out>(fftLength, type, outLength, rank, n, inmem_size, outmem_size, inembed, outembed,
+													profile_count, batch_size, outPlace, iStride, oStride, iDist, oDist);
+			}
+
+			break;
+		}
+		case 2:
+		{
+			if (iDist > 0 && iDist < (lengthX *(lengthY+2)*iStride))
+			{
+				printf("Input Arguments ERROR!! Value of idist cannot be less than product of lengths along each dimension and stride, (x*(y+2)*istr). Exiting..\n\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (oDist > 0 && (oDist < (lengthX *(lengthY/2 + 1)*oStride)))
+			{
+				printf("Input Arguments ERROR!! Value of odist cannot be less than product of lengths along each dimension and stride, (x*(y/2+1)*ostr). Exiting..\n\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (iStride == 1 && oStride == 1 && iDist == 0 && oDist == 0)
+			{
+				// Basic Data layout
+				printf("Dimension : 2\n");
+				printf("Data Layout : Basic\n");
+
+				bTestResult = run2DFFTTransformR2C<In, Out>(lengthX, lengthY, type, profile_count, batch_size, outPlace);
+			}
+			else
+			{
+				//Advanced Data layout
+				iDist = iDist? iDist : (int)(lengthX * (lengthY+2)*iStride);
+				oDist = oDist? oDist : (int)(lengthX * (lengthY/2 + 1)*oStride);
+
+				if (!outPlace && oDist > (iDist/2))
+				{
+					printf("Input Arguments ERROR!! For in-place Real to Complex transform, output array distance must not be greater than half of input array distance. Exiting..\n\n");
+						
+					exit(EXIT_FAILURE);
+				}
+
+				int n[2] = {(int)lengthX, (int)lengthY};
+				size_t fftLength = lengthX * lengthY;
+				size_t outLength = lengthX * (lengthY/2 + 1)*oStride;
+
+				int inembed[2] = {(int)lengthX, (int)(lengthY+2)};
+				int outembed[2] = {(int)lengthX, (int)(lengthY/2+1)};
+
+				size_t inmem_size = sizeof(In) * iDist * batch_size;
+				size_t outmem_size = sizeof(Out) * oDist * batch_size;
+
+				printf("Dimension : 2\n");
+				printf("Data Layout : Advanced\n");
+	
+				bTestResult = runFFTTransformR2CAdv<In, Out>(fftLength, type, outLength, rank, n, inmem_size, outmem_size, inembed, outembed,
+													profile_count, batch_size, outPlace, iStride, oStride, iDist, oDist);
+			}
+
+			break;
+		}
+		case 3:
+		{
+			if (iDist > 0 && iDist < (lengthX * lengthY *(lengthZ+2)*iStride))
+			{
+				printf("Input Arguments ERROR!! Value of idist cannot be less than product of lengths along each dimension and stride, (x*y*(z+2)*istr). Exiting..\n\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (oDist > 0 && (oDist < (lengthX * lengthY *(lengthZ/2 +  1)*oStride)))
+			{
+				printf("Input Arguments ERROR!! Value of odist cannot be less than product of lengths along each dimension and stride, (x*y*(z/2+1)*ostr). Exiting..\n\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (iStride == 1 && oStride == 1 && iDist == 0 && oDist == 0)
+			{
+				// Basic Data layout
+				printf("Dimension : 3\n");
+				printf("Data Layout : Basic\n");
+
+				bTestResult = run3DFFTTransformR2C<In, Out>(lengthX, lengthY, lengthZ, type, profile_count, batch_size, outPlace);
+			}
+			else
+			{
+				//Advanced Data layout
+				iDist = iDist? iDist : (int)(lengthX * lengthY * (lengthZ+2)*iStride);
+				oDist = oDist? oDist : (int)(lengthX * lengthY * (lengthZ/2 + 1)*oStride);
+
+				if (!outPlace && oDist > (iDist/2))
+				{
+					printf("Input Arguments ERROR!! For in-place Real to Complex transform, output array distance must not be greater than half of input array distance. Exiting..\n\n");
+						
+					exit(EXIT_FAILURE);
+				}
+
+				int n[3] = {(int)lengthX, (int)lengthY, (int)lengthZ};
+				size_t fftLength = lengthX * lengthY * lengthZ;
+				size_t outLength = lengthX * lengthY * (lengthZ/2 + 1)*oStride;
+
+				int inembed[3] = {(int)lengthX, (int)lengthY, (int)(lengthZ+2)};
+				int outembed[3] = {(int)lengthX, (int)lengthY, (int)(lengthZ/2+1)};
+
+				size_t inmem_size = sizeof(In) * iDist * batch_size;
+				size_t outmem_size = sizeof(Out) * oDist * batch_size;
+
+				printf("Dimension : 3\n");
+				printf("Data Layout : Advanced\n");
+	
+				bTestResult = runFFTTransformR2CAdv<In, Out>(fftLength, type, outLength, rank, n, inmem_size, outmem_size, inembed, outembed,
+													profile_count, batch_size, outPlace, iStride, oStride, iDist, oDist);
+			}
+
+			break;
+		}
+	}
+	return bTestResult;
+}
+
+/*
+* Run C2C FFT for 1D/2D/3D
+*/
+template<typename T>
+bool runC2CFFT(int rank, size_t lengthX, size_t lengthY, size_t lengthZ, int direction,
+			   cufftType type, int profile_count, int batch_size, bool outPlace,
+			   int iDist, int oDist, int iStride, int oStride)
+{
+	bool bTestResult = false;
+
+	if (!outPlace && (iStride < oStride || iDist < oDist))
+	{
+		printf("Input Arguments ERROR!! For in-place transform, output stride and distance must be less than or equal to input stride and distance respectively. Exiting..\n\n");
+
+		exit(EXIT_FAILURE);
+	}
+
+	if (iDist > 0 && iDist < (lengthX *lengthY*lengthZ*iStride))
+	{
+		printf("Input Arguments ERROR!! Value of idist cannot be less than product of lengths along each dimension and stride, (x*y*z*istr). Exiting..\n\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if (oDist > 0 && (oDist < (lengthX *lengthY*lengthZ*oStride)))
+	{
+		printf("Input Arguments ERROR!! Value of odist cannot be less than product of lengths along each dimension and stride, (x*y*z*ostr). Exiting..\n\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if (type == CUFFT_Z2Z)
+		printf("Transforming signal Double Complex to Double Complex \n");
+	else
+		printf("Transforming signal Complex to Complex \n");
+
+	switch (rank)
+		{
+			case 1:
+			{
+				//1D Transform
+
+				if (iStride == 1 && oStride == 1 && iDist == 0 && oDist == 0)
+				{
+					//basic data layout
+					
+					printf("Dimension : 1\n");
+					printf("Data Layout : Basic\n");
+
+					bTestResult = run1DFFTTransformC2C<T>(lengthX, direction, type, profile_count, batch_size, outPlace);
+				}
+				else
+				{
+					//Advanced Data layout
+					iDist = iDist? iDist : (int)(lengthX * iStride);
+					oDist = oDist? oDist : (int)(lengthX * oStride);
+
+					int n[1] = {(int)lengthX};
+					size_t fftLength = lengthX;
+
+					int inembed[1] = {(int)lengthX};
+					int outembed[1] = {(int)lengthX};
+
+					size_t inmem_size = sizeof(T) * iDist * batch_size;
+					size_t outmem_size = sizeof(T) * oDist * batch_size;
+
+					printf("Dimension : 1\n");
+					printf("Data Layout : Advanced\n");
+	
+					bTestResult = runFFTTransformC2CAdv<T>(fftLength, type, rank, n, inmem_size, outmem_size, inembed, outembed,
+														direction, profile_count, batch_size, outPlace, iStride, oStride, iDist, oDist);
+				}
+				break;
+			}
+			case 2:
+			{
+				//2D Transform
+
+				if (iStride == 1 && oStride == 1 && iDist == 0 && oDist == 0)
+				{
+					//basic data layout
+					printf("Dimension : 2\n");
+					printf("Data Layout : Basic\n");
+
+					bTestResult = run2DFFTTransformC2C<T>(lengthX, lengthY, direction, type, profile_count, batch_size, outPlace);
+				}
+				else
+				{
+					//Advanced Data layout
+					iDist = iDist? iDist : (int)(lengthX * lengthY*iStride);
+					oDist = oDist? oDist : (int)(lengthX * lengthY*oStride);
+
+					int n[2] = {(int)lengthX, (int)lengthY};
+					size_t fftLength = lengthX * lengthY;
+
+					int inembed[2] = {(int)lengthX, (int)lengthY};
+					int outembed[2] = {(int)lengthX, (int)lengthY};
+
+					size_t inmem_size = sizeof(T) * iDist * batch_size;
+					size_t outmem_size = sizeof(T) * oDist * batch_size;
+
+					printf("Dimension : 2\n");
+					printf("Data Layout : Advanced\n");
+	
+					bTestResult = runFFTTransformC2CAdv<T>(fftLength, type, rank, n, inmem_size, outmem_size, inembed, outembed,
+														direction, profile_count, batch_size, outPlace, iStride, oStride, iDist, oDist);
+				}
+								
+				break;
+			}
+			case 3:
+			{				
+				//3D Transform
+
+				if (iStride == 1 && oStride == 1 && iDist == 0 && oDist == 0)
+				{
+					//basic data layout					
+					printf("Dimension : 3\n");
+					printf("Data Layout : Basic\n");
+
+					bTestResult = run3DFFTTransformC2C<T>(lengthX, lengthY, lengthZ, direction, type, profile_count, batch_size, outPlace);
+				}
+				else
+				{
+					//Advanced Data layout
+					iDist = iDist? iDist : (int)(lengthX * lengthY * lengthZ * iStride);
+					oDist = oDist? oDist : (int)(lengthX * lengthY * lengthZ * oStride);
+
+					int n[3] = {(int)lengthX, (int)lengthY, (int)lengthZ};
+					size_t fftLength = lengthX * lengthY*lengthZ;
+
+					int inembed[3] = {(int)lengthX, (int)lengthY, (int)lengthZ};
+					int outembed[3] = {(int)lengthX, (int)lengthY, (int)lengthZ};
+
+					size_t inmem_size = sizeof(T) * iDist * batch_size;
+					size_t outmem_size = sizeof(T) * oDist * batch_size;
+					
+					printf("Dimension : 3\n");
+					printf("Data Layout : Advanced\n");
+	
+					bTestResult = runFFTTransformC2CAdv<T>(fftLength, type, rank, n, inmem_size, outmem_size, inembed, outembed,
+														direction, profile_count, batch_size, outPlace, iStride, oStride, iDist, oDist);
+				}
+				break;
+			}
+		}
+
+	return bTestResult;
+}
+
+/*
+* Run C2R FFT Advanced Data Layout
+*/
+template<typename In, typename Out>
+bool runFFTTransformC2RAdv(size_t fftLength, cufftType type, size_t outLength, size_t innerDimLength, int rank, int* n, size_t inmem_size, size_t outmem_size,    
+						  int* inembed, int* outembed, int profile_count, int batch_size, bool outPlace, int istride, int ostride, int idist, int odist)
+{
+	size_t invectorLength = idist;
+
+	// Allocate host memory for the signal
+    In *h_in_signal = (In *)malloc(inmem_size);
+	
+	Out *h_output_signal;
+	In *d_in_signal;
+	Out *d_output_signal;
+
+    // Allocate device memory for signal
+    checkCudaErrors(cudaMalloc((void **)&d_in_signal, inmem_size));
+
+	if (outPlace)
+	{
+		h_output_signal = (Out *)malloc(outmem_size);
+
+		// Allocate device memory for out signal
+		checkCudaErrors(cudaMalloc((void **)&d_output_signal, outmem_size));
+	}
+	else
+	{
+		h_output_signal = (Out*)h_in_signal;
+		d_output_signal = (Out*)d_in_signal;
+	}
+
+    // CUFFT plan
+    cufftHandle plan;
+    checkCudaErrors(cufftPlanMany(&plan, rank, n,
+						inembed, istride, idist, // *inembed, istride, idist
+						outembed, ostride, odist,  // *onembed, ostride, odist
+						type, batch_size));
+
+	Timer tr;
+	double wtime_t = 0.0;
+
+	// Initalize the memory for the signal
+	memset(h_in_signal,0,inmem_size);
+	for (size_t j = 0; j < (invectorLength * batch_size); j+=invectorLength)
+	{
+		h_in_signal[j].x = (float)fftLength;
+	}
+
+	// Copy host memory to device
+	checkCudaErrors(cudaMemcpy(d_in_signal, h_in_signal, inmem_size,
+							   cudaMemcpyHostToDevice));
+
+	for (int i = 0; i < profile_count; i++)
+	{
+		tr.Start();
+
+		switch (type)
+		{
+		case CUFFT_Z2D:
+			checkCudaErrors(cufftExecZ2D(plan, (cufftDoubleComplex*)d_in_signal, (cufftDoubleReal*)d_output_signal));
+			break;
+		case CUFFT_C2R:
+			checkCudaErrors(cufftExecC2R(plan, (cufftComplex*)d_in_signal, (cufftReal*)d_output_signal));
+			break;
+		default: 
+			printf("Input Arguments ERROR!! Invalid FFT type. Exiting..\n"); 
+			cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+			exit(EXIT_FAILURE);
+		}
+		
+		cudaDeviceSynchronize();
+		wtime_t += tr.Sample();
+
+		//Ignore the first time sample if profiling for more than one iteration
+		if (i == 0 && profile_count > 1) wtime_t = 0.0;
+	}
+
+	double iter = (double)( profile_count > 1 ? (profile_count - 1) : profile_count);
+	double wtime = wtime_t/iter;
+	size_t totalLen = 1;
+	totalLen = fftLength;
+	double opsconst = 2.5 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	printf("\nExecution wall time: %lf ms\n", 1000.0*wtime);
+	printf("Execution gflops: %lf \n", ((double)batch_size * opsconst)/(1000000000.0*wtime));
+
+    	// Copy device memory to host
+    	checkCudaErrors(cudaMemcpy(h_output_signal, d_output_signal, outmem_size,
+                               cudaMemcpyDeviceToHost));
+
+	bool bTestResult = true;
+    	// check result only if profile count == 1 or out-place transform
+        if (profile_count == 1 || outPlace)
+	{
+		// check result
+		size_t idx = 0;
+		size_t innerDim = 0;
+		for (int i = 0; i < batch_size; i++)
+		{
+			idx = i * odist;
+
+			for (int k = 0; k < outLength; k += ostride)
+			{
+				//For in-place transform, skip the 2 elements after iterating every inner dimension length
+				if (!outPlace && (innerDim == (innerDimLength - 2)*ostride || innerDim == (innerDimLength - 1)*ostride))
+				{
+					if (innerDim == (innerDimLength - 1)*ostride)
+						innerDim = 0;
+					else
+						innerDim += ostride;
+
+					idx += ostride;
+					continue;
+				}
+
+				/*****************************************************************************************
+				* cuFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set 
+				* followed by an inverse FFT on the resulting set yields data that is equal to the input, 
+				* scaled by the number of elements. Scaling either transform by the reciprocal of the size 
+				* of the data set is left for the user to perform as seen fit
+				******************************************************************************************/
+				if (h_output_signal[idx] != fftLength)
+				{
+					printf("failed at %d, value=%f\n", idx, h_output_signal[idx]);
+					bTestResult = false;
+					//break;
+				}
+
+				idx += ostride;
+				innerDim += ostride;
+			}
+			innerDim = 0;
+		}
+	}
+	
+    //Destroy CUFFT context
+    checkCudaErrors(cufftDestroy(plan));
+
+    // cleanup memory
+	cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+
+	return bTestResult;
+}
+
+/*
+* Run R2C FFT Advanced Data Layout
+*/
+template<typename In, typename Out>
+bool runFFTTransformR2CAdv(size_t fftLength, cufftType type, size_t outLength, int rank, int* n, size_t inmem_size, size_t outmem_size, int* inembed, int* outembed,    
+						  int profile_count, int batch_size, bool outPlace, int istride, int ostride, int idist, int odist)
+{
+	size_t invectorLength = idist;
+
+	// Allocate host memory for the signal
+    In *h_in_signal = (In *)malloc(inmem_size);
+	
+	Out *h_output_signal;
+	In *d_in_signal;
+	Out *d_output_signal;
+
+    // Allocate device memory for signal
+    checkCudaErrors(cudaMalloc((void **)&d_in_signal, inmem_size));
+
+	if (outPlace)
+	{
+		h_output_signal = (Out *)malloc(outmem_size);
+
+		// Allocate device memory for out signal
+		checkCudaErrors(cudaMalloc((void **)&d_output_signal, outmem_size));
+	}
+	else
+	{
+		h_output_signal = (Out*)h_in_signal;
+		d_output_signal = (Out*)d_in_signal;
+	}
+
+    // CUFFT plan
+    cufftHandle plan;
+    checkCudaErrors(cufftPlanMany(&plan, rank, n,
+						inembed, istride, idist, // *inembed, istride, idist
+						outembed, ostride, odist,  // *onembed, ostride, odist
+						type, batch_size));
+
+	// Initalize the memory for the signal
+	for (unsigned int j = 0; j < (invectorLength * batch_size); ++j)
+	{
+		h_in_signal[j] = 1;
+	}
+
+	// Copy host memory to device
+	checkCudaErrors(cudaMemcpy(d_in_signal, h_in_signal, inmem_size,
+							   cudaMemcpyHostToDevice));
+	Timer tr;
+	double wtime_t = 0.0;
+
+	for (int i = 0; i < profile_count; i++)
+	{
+		tr.Start();
+
+		switch (type)
+		{
+		case CUFFT_D2Z:
+			checkCudaErrors(cufftExecD2Z(plan, (cufftDoubleReal*)d_in_signal, (cufftDoubleComplex*)d_output_signal));
+			break;
+		case CUFFT_R2C:
+			checkCudaErrors(cufftExecR2C(plan, (cufftReal*)d_in_signal, (cufftComplex*)d_output_signal));
+			break;
+		default: 
+			printf("Input Arguments ERROR!! Invalid FFT type. Exiting..\n"); 
+			cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+			exit(EXIT_FAILURE);
+		}
+		
+		cudaDeviceSynchronize();
+		wtime_t += tr.Sample();
+
+		//Ignore the first time sample if profiling for more than one iteration
+		if (i == 0 && profile_count > 1) wtime_t = 0.0;
+	}
+
+	double iter = (double)( profile_count > 1 ? (profile_count - 1) : profile_count);
+	double wtime = wtime_t/iter;
+	size_t totalLen = 1;
+	totalLen = fftLength;
+	double opsconst = 2.5 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	printf("\nExecution wall time: %lf ms\n", 1000.0*wtime);
+	printf("Execution gflops: %lf \n", ((double)batch_size * opsconst)/(1000000000.0*wtime));
+
+	// Copy device memory to host
+    	checkCudaErrors(cudaMemcpy(h_output_signal, d_output_signal, outmem_size,
+                               cudaMemcpyDeviceToHost));
+
+    	// check result only if profile count == 1 or outplace transform
+	bool bTestResult = true;
+	if (profile_count == 1 || outPlace)
+	{
+		int idx = 0;
+		for (int k = 0; k < batch_size; k++)
+		{
+			idx = k * odist;
+
+			for( int i = idx; i < (idx + outLength); i += ostride)
+			{
+				if (0 == (i % odist))
+				{
+					if (h_output_signal[i].x != fftLength)
+					{
+						printf("fail at %d, value %f\n", i, h_output_signal[i].x);
+						bTestResult = false;
+						break;
+					}
+				}
+				else
+				{
+					if (h_output_signal[i].x != 0)
+					{
+						printf("fail at %d, value %f\n", i, h_output_signal[i].x);
+						bTestResult = false;
+						break;
+					}
+				}
+
+				if (h_output_signal[ i ].y != 0)
+				{
+					printf("fail at %d, value %f\n", i, h_output_signal[i].y);
+					bTestResult = false;
+					break;
+				}
+			}
+		}
+	}
+	
+    //Destroy CUFFT context
+    checkCudaErrors(cufftDestroy(plan));
+
+    // cleanup memory
+	cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+
+	return bTestResult;
+}
+
+/*
+* Run C2C FFT Advanced Data Layout
+*/
+template<typename T>
+bool runFFTTransformC2CAdv(size_t fftLength, cufftType type, int rank, int* n, size_t inmem_size, size_t outmem_size, int* inembed, int* outembed,    
+						  int direction, int profile_count, int batch_size, bool outPlace, int istride, int ostride, int idist, int odist)
+{
+	size_t invectorLength = idist;
+	size_t outvectorLength = odist; 
+
+	// Allocate host memory for the signal
+    T *h_in_signal = (T *)malloc(inmem_size);
+	
+	T *h_output_signal;
+	T *d_in_signal;
+	T *d_output_signal;
+
+    // Allocate device memory for signal
+    checkCudaErrors(cudaMalloc((void **)&d_in_signal, inmem_size));
+
+	if (outPlace)
+	{
+		h_output_signal = (T *)malloc(outmem_size);
+
+		// Allocate device memory for out signal
+		checkCudaErrors(cudaMalloc((void **)&d_output_signal, outmem_size));
+	}
+	else
+	{
+		h_output_signal = h_in_signal;
+		d_output_signal = d_in_signal;
+	}
+
+    // CUFFT plan
+    cufftHandle plan;
+    checkCudaErrors(cufftPlanMany(&plan, rank, n,
+						inembed, istride, idist, // *inembed, istride, idist
+						outembed, ostride, odist,  // *onembed, ostride, odist
+						type, batch_size));
+
+	Timer tr;
+	double wtime_t = 0.0;
+
+	// Initalize the memory for the signal
+	if (direction == CUFFT_FORWARD)
+	{
+		for (size_t idx = 0; idx < (invectorLength*batch_size); ++idx)
+		{
+			h_in_signal[idx].x = 1;
+			h_in_signal[idx].y = 0;
+		}
+	}
+	else
+	{
+		//Inverse FFT
+		memset(h_in_signal, 0, inmem_size);
+
+		for (size_t idx = 0; idx < (invectorLength*batch_size); idx+=invectorLength)
+		{
+			h_in_signal[idx].x = (float) (fftLength);
+		}
+	}
+
+	// Copy host memory to device
+	checkCudaErrors(cudaMemcpy(d_in_signal, h_in_signal, inmem_size,
+							   cudaMemcpyHostToDevice));
+
+	for (int i = 0; i < profile_count; i++)
+	{
+		tr.Start();
+
+		//Execute FFT
+		switch (type)
+		{
+		case CUFFT_C2C:
+			checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_in_signal, (cufftComplex *)d_output_signal, direction));
+			break;
+		case CUFFT_Z2Z:
+			checkCudaErrors(cufftExecZ2Z(plan, (cufftDoubleComplex *)d_in_signal, (cufftDoubleComplex *)d_output_signal, direction));
+			break;
+		default: 
+			printf("Input Arguments ERROR!! Invalid FFT type. Exiting..\n"); 
+			cleanup<T, T>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+			exit(EXIT_FAILURE);
+		}
+
+		cudaDeviceSynchronize();
+		wtime_t += tr.Sample();
+
+		//Ignore the first time sample if profiling for more than one iteration
+		if (i == 0 && profile_count > 1) wtime_t = 0.0;
+	}
+
+	double iter = (double)( profile_count > 1 ? (profile_count - 1) : profile_count);
+	double wtime = wtime_t/iter;
+	size_t totalLen = 1;
+	totalLen = fftLength;
+	double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	printf("\nExecution wall time: %lf ms\n", 1000.0*wtime);
+	printf("Execution gflops: %lf \n", ((double)batch_size * opsconst)/(1000000000.0*wtime));
+
+    // Copy device memory to host
+    checkCudaErrors(cudaMemcpy(h_output_signal, d_output_signal, outmem_size,
+                               cudaMemcpyDeviceToHost));
+
+    // check result
+	bool bTestResult = true;
+	// check result only if profile count == 1 or outplace transform
+	if (profile_count == 1 || outPlace)
+	{
+		int idx = 0;
+		for (int k = 0; k < batch_size; k++)
+		{
+			idx = k * outvectorLength;
+
+			for( int i = idx; i < (idx + fftLength*ostride); i += ostride)
+			{
+				if (direction == CUFFT_FORWARD)
+				{
+					if (0 == (i % outvectorLength))
+					{
+						if (h_output_signal[i].x != fftLength)
+						{
+							printf("fail at %d, value %f\n", i, h_output_signal[i].x);
+							bTestResult = false;
+							break;
+						}
+					}
+					else
+					{
+						if (h_output_signal[i].x != 0)
+						{
+							printf("fail at %d, value %f\n", i, h_output_signal[i].x);
+							bTestResult = false;
+							break;
+						}
+					}
+
+					if (h_output_signal[ i ].y != 0)
+					{
+						printf("fail at %d, value %f\n", i, h_output_signal[i].y);
+						bTestResult = false;
+						break;
+					}
+				}
+				else
+				{
+					/*****************************************************************************************
+					* cuFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set 
+					* followed by an inverse FFT on the resulting set yields data that is equal to the input, 
+					* scaled by the number of elements. Scaling either transform by the reciprocal of the size 
+					* of the data set is left for the user to perform as seen fit
+					******************************************************************************************/
+					//Inverse FFT
+					if (h_output_signal[i].x != fftLength)
+					{
+						printf("fail at %d, value %f\n", i, h_output_signal[i].x);
+						bTestResult = false;
+						break;
+					}
+					if (h_output_signal[ i ].y != 0)
+					{
+						printf("fail at %d, value %f\n", i, h_output_signal[i].y);
+						bTestResult = false;
+						break;
+					}
+				}
+			}
+		}
+	}
+	
+    //Destroy CUFFT context
+    checkCudaErrors(cufftDestroy(plan));
+
+    // cleanup memory
+	cleanup<T, T>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+
+	return bTestResult;
+}
+
+/*
+* Run 3D C2R FFT - Basic Data Layout
+*/
+template<typename In, typename Out>
+bool run3DFFTTransformC2R(size_t lengthX, size_t lengthY, size_t lengthZ, cufftType type, int profile_count, int batch_size, bool outPlace)
+{
+	int rank = 3; //3D Transform
+
+	// Allocate host memory for the signal
+	// cuFFT only supports Hermitian Interleaved for input for C2R
+	size_t invectorLength = lengthX * lengthY * (lengthZ/2 + 1);
+	size_t innerDimLength = outPlace ? lengthZ : lengthZ + 2;
+	size_t outvectorLength = lengthX * lengthY * innerDimLength;
+	size_t fftLength = lengthX * lengthY * lengthZ;
+
+	int n[3] = {(int)lengthX, (int)lengthY, (int)lengthZ};
+
+	size_t inmem_size = sizeof(In) * (invectorLength) * batch_size;
+	size_t outmem_size = sizeof(Out) * (outvectorLength) * batch_size;
+
+    In *h_in_signal = (In *)malloc(inmem_size);
+	In *d_in_signal;
+	cufftHandle plan;
+
+	Out *h_output_signal; 
+	Out *d_output_signal;
+
+	Timer tr;
+	double wtime_t =0.0;
+
+	// Allocate device memory for signal		
+	checkCudaErrors(cudaMalloc((void **)&d_in_signal, inmem_size));
+
+	if (outPlace)
+	{
+		h_output_signal = (Out*)malloc(outmem_size);
+
+		// Allocate device memory for output signal		
+		checkCudaErrors(cudaMalloc((void **)&d_output_signal, outmem_size));
+	}
+	else
+	{
+		h_output_signal = (Out*)h_in_signal;
+		d_output_signal = (Out *)d_in_signal;
+	}
+
+	// CUFFT plan	
+	checkCudaErrors(cufftPlanMany(&plan, rank, n,
+						NULL, 1, 0, // *inembed, istride, idist
+						NULL, 1, 0,  // *onembed, ostride, odist
+						type, batch_size));
+
+	// Initalize the memory for the signal
+	memset(h_in_signal, 0, inmem_size);
+	for (size_t j = 0; j < (invectorLength * batch_size); j+= (invectorLength))
+	{
+		h_in_signal[j].x = (float) fftLength;
+	}
+
+	// Copy host memory to device
+	checkCudaErrors(cudaMemcpy(d_in_signal, h_in_signal, inmem_size,
+							   cudaMemcpyHostToDevice));
+
+	// Transform signal and kernel
+	for( int i = 0; i < profile_count; ++i )
+	{
+		tr.Start();
+
+		switch (type)
+		{
+		case CUFFT_Z2D:
+			checkCudaErrors(cufftExecZ2D(plan, (cufftDoubleComplex*)d_in_signal, (cufftDoubleReal *)d_output_signal));
+			break;
+		case CUFFT_C2R:
+			checkCudaErrors(cufftExecC2R(plan, (cufftComplex *)d_in_signal, (cufftReal *)d_output_signal));
+			break;
+		default: 
+			printf("Input Arguments ERROR!! Invalid FFT type. Exiting..\n");
+			cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+			exit(EXIT_FAILURE);
+		}
+
+		cudaDeviceSynchronize();
+		wtime_t += tr.Sample();
+	
+		//Ignore the first time sample if profiling for more than one iteration
+		if (i == 0 && profile_count > 1) wtime_t = 0.0;
+	}
+
+	double iter = (double)( profile_count > 1 ? (profile_count - 1) : profile_count);
+	double wtime = wtime_t/iter;
+	size_t totalLen = 1;
+	totalLen = fftLength;
+	double opsconst = 2.5 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	printf("\nExecution wall time: %lf ms\n", 1000.0*wtime);
+	printf("Execution gflops: %lf \n", ((double)batch_size * opsconst)/(1000000000.0*wtime));
+
+    // Copy device memory to host
+    checkCudaErrors(cudaMemcpy(h_output_signal, d_output_signal, outmem_size,
+                               cudaMemcpyDeviceToHost));
+	
+	bool bTestResult = true;
+	
+	// check result only if profile count == 1 or outplace transform
+	if (profile_count == 1 || outPlace)
+	{
+		size_t idx = 0;
+		size_t innerDim = 0;
+		for (int i = 0; i < batch_size; i++)
+		{
+			for (int k = 0; k < outvectorLength; k++)
+			{
+				//For in-place transform, skip the 2 elements after inner dimension length
+				if (!outPlace && (innerDim == (innerDimLength - 2) || innerDim == (innerDimLength - 1)))
+				{
+					if (innerDim == (innerDimLength - 1))
+						innerDim = 0;
+					else
+						innerDim++;
+
+					idx++;
+					continue;
+				}
+
+				/*****************************************************************************************
+				* cuFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set 
+				* followed by an inverse FFT on the resulting set yields data that is equal to the input, 
+				* scaled by the number of elements. Scaling either transform by the reciprocal of the size 
+				* of the data set is left for the user to perform as seen fit
+				******************************************************************************************/
+				if (h_output_signal[idx] != fftLength)
+				{
+					printf("failed at %d, value=%f\n", idx, h_output_signal[idx]);
+					bTestResult = false;
+					break;
+				}
+				idx++;
+				innerDim++;
+			}
+			innerDim = 0;
+		}
+	}
+
+    //Destroy CUFFT context
+    checkCudaErrors(cufftDestroy(plan));
+
+    // cleanup memory
+	cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+
+	return bTestResult;
+}
+
+/*
+* Run 3D R2C FFT - Basic Data Layout
+*/
+template<typename In, typename Out>
+bool run3DFFTTransformR2C(size_t lengthX, size_t lengthY, size_t lengthZ, cufftType type, int profile_count, int batch_size, bool outPlace)
+{
+	int rank = 3; //3D Transform
+
+	int n[3] = {(int)lengthX, (int)lengthY, (int)lengthZ};
+
+		// Allocate host memory for the signal
+	// cuFFT only supports Hermitian Interleaved for output of R2C
+	// Hence output size is N/2 + 1 complex. So allocate N + 2 real input
+	size_t ivectorLength = lengthX * lengthY * (lengthZ+2);
+	size_t ovectorLength = lengthX * lengthY * (lengthZ/2 + 1);
+	size_t fftLength = lengthX*lengthY*lengthZ;
+
+	size_t mem_size = sizeof(In) * ivectorLength * batch_size;
+    In *h_in_signal = (In *)malloc(mem_size);
+	In *d_in_signal;
+
+	Out *h_output_signal;
+	Out *d_output_signal;
+
+	// Allocate device memory for signal
+    checkCudaErrors(cudaMalloc((void **)&d_in_signal, mem_size));
+    
+	if (outPlace)
+	{
+		h_output_signal = (Out *)malloc(mem_size);
+
+		// Allocate device memory for signal
+		checkCudaErrors(cudaMalloc((void **)&d_output_signal, mem_size));
+	}
+	else
+	{
+		h_output_signal =  (Out*) h_in_signal;
+		d_output_signal = (Out*) d_in_signal;
+	}
+
+	// CUFFT plan
+    cufftHandle plan;
+	checkCudaErrors(cufftPlanMany(&plan, rank, n,
+						NULL, 1, (int)ivectorLength, // *inembed, istride, idist
+						NULL, 1, (int)ovectorLength,  // *onembed, ostride, odist
+						type, batch_size));
+	
+	Timer tr;
+	double wtime_t = 0.0;
+	
+	// Initalize the memory for the signal
+	for (unsigned int j = 0; j < (ivectorLength * batch_size); ++j)
+	{
+		h_in_signal[j] = 1;
+	}
+
+	// Copy host memory to device
+	checkCudaErrors(cudaMemcpy(d_in_signal, h_in_signal, mem_size,
+							   cudaMemcpyHostToDevice));
+		
+	for (int i = 0; i < profile_count; i++)
+	{
+		// Transform signal and kernel
+		tr.Start();
+
+		switch (type)
+		{
+		case CUFFT_R2C:
+			checkCudaErrors(cufftExecR2C(plan, (cufftReal*)d_in_signal, (cufftComplex*)d_output_signal));
+			break;
+		case CUFFT_D2Z:
+			checkCudaErrors(cufftExecD2Z(plan, (cufftDoubleReal*)d_in_signal, (cufftDoubleComplex*)d_output_signal));
+			break;
+		default: 
+			printf("Input Arguments ERROR!! Invalid FFT type. Exiting..\n"); 
+			cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+			exit(EXIT_FAILURE);
+		}
+
+		cudaDeviceSynchronize();
+		wtime_t += tr.Sample();
+
+		//Ignore the first time sample if profiling for more than one iteration
+		if (i == 0 && profile_count > 1) wtime_t = 0.0;
+	}
+
+	double iter = (double)( profile_count > 1 ? (profile_count - 1) : profile_count);
+	double wtime = wtime_t/iter;
+	size_t totalLen = 1;
+	totalLen = fftLength;
+	double opsconst = 2.5 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	printf("\nExecution wall time: %lf ms\n", 1000.0*wtime);
+	printf("Execution gflops: %lf \n", ((double)batch_size * opsconst)/(1000000000.0*wtime));
+
+    // Copy device memory to host
+    checkCudaErrors(cudaMemcpy(h_output_signal, d_output_signal, mem_size,
+                               cudaMemcpyDeviceToHost));
+
+	bool bTestResult = true;
+	// check result only if profile count == 1 or outplace transform
+	if (profile_count == 1 || outPlace)
+	{
+		for( int i = 0; i < ovectorLength*batch_size; ++i )
+		{
+			//Check real part of 1st element of every batch is equal to length
+			if (i == 0 || (0 == (i % ovectorLength)))
+			{
+				if (h_output_signal[i].x != fftLength)
+				{
+					bTestResult = false;
+					break;
+				}
+			}
+			else
+			{
+				if (h_output_signal[i].x != 0)
+				{
+					bTestResult = false;
+					break;
+				}
+			}
+
+			if (h_output_signal[ i ].y != 0)
+			{
+				bTestResult = false;
+				break;
+			}
+		}
+	}
+
+    //Destroy CUFFT context
+    checkCudaErrors(cufftDestroy(plan));
+
+    // cleanup memory
+	cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+
+	return bTestResult;
+}
+
+/*
+* Run 3D C2C FFT - Basic Data Layout
+*/
+template<typename T>
+bool run3DFFTTransformC2C(size_t lengthX, size_t lengthY, size_t lengthZ, int direction, cufftType type, int profile_count, int batch_size, bool outPlace)
+{
+	int rank = 3; //3D transform
+
+	int n[3] = {(int)lengthX, (int)lengthY, (int)lengthZ};
+	size_t vectorLength = lengthX * lengthY * lengthZ;
+	// Allocate host memory for the signal
+	size_t mem_size = sizeof(T) * vectorLength * batch_size;
+    T *h_in_signal = (T *)malloc(mem_size);
+	
+	T *h_output_signal;
+	T *d_in_signal;
+	T *d_output_signal;
+
+    // Allocate device memory for signal
+    checkCudaErrors(cudaMalloc((void **)&d_in_signal, mem_size));
+
+	if (outPlace)
+	{
+		h_output_signal = (T *)malloc(mem_size);
+
+		// Allocate device memory for out signal
+		checkCudaErrors(cudaMalloc((void **)&d_output_signal, mem_size));
+	}
+	else
+	{
+		h_output_signal = h_in_signal;
+		d_output_signal = d_in_signal;
+	}
+
+    // CUFFT plan
+    cufftHandle plan;
+	checkCudaErrors(cufftPlanMany(&plan, rank, n,
+							NULL, 1, (int)vectorLength, // *inembed, istride, idist
+							NULL, 1, (int)vectorLength,  // *onembed, ostride, odist
+							type, batch_size));
+
+    // Transform signal and kernel
+	
+	Timer tr;
+	double wtime_t = 0.0;
+
+	// Initalize the memory for the signal
+	if (direction == CUFFT_FORWARD)
+	{
+		for (size_t idx = 0; idx < (vectorLength*batch_size); ++idx)
+		{
+			h_in_signal[idx].x = 1;
+			h_in_signal[idx].y = 0;
+		}
+	}
+	else
+	{
+		//Inverse FFT
+		memset(h_in_signal, 0, mem_size);
+
+		for (size_t idx = 0; idx < (vectorLength*batch_size); idx+=vectorLength)
+		{
+			h_in_signal[idx].x = (float) (vectorLength);
+		}
+	}
+
+	// Copy host memory to device
+	checkCudaErrors(cudaMemcpy(d_in_signal, h_in_signal, mem_size,
+							   cudaMemcpyHostToDevice));
+
+	for (int i = 0; i < profile_count; i++)
+	{
+		tr.Start();
+
+		//Execute FFT
+		switch (type)
+		{
+		case CUFFT_C2C:
+			checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_in_signal, (cufftComplex *)d_output_signal, direction));
+			break;
+		case CUFFT_Z2Z:
+			checkCudaErrors(cufftExecZ2Z(plan, (cufftDoubleComplex *)d_in_signal, (cufftDoubleComplex *)d_output_signal, direction));
+			break;
+		default: 
+			printf("Input Arguments ERROR!! Invalid FFT type. Exiting..\n"); 
+			cleanup<T, T>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+			exit(EXIT_FAILURE);
+		}
+
+		cudaDeviceSynchronize();
+		wtime_t += tr.Sample();
+
+		//Ignore the first time sample if profiling for more than one iteration
+		if (i == 0 && profile_count > 1) wtime_t = 0.0;
+	}
+
+	double iter = (double)( profile_count > 1 ? (profile_count - 1) : profile_count);
+	double wtime = wtime_t/iter;
+	size_t totalLen = 1;
+	totalLen = vectorLength;
+	double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	printf("\nExecution wall time: %lf ms\n", 1000.0*wtime);
+	printf("Execution gflops: %lf \n", ((double)batch_size * opsconst)/(1000000000.0*wtime));
+
+    // Copy device memory to host
+    checkCudaErrors(cudaMemcpy(h_output_signal, d_output_signal, mem_size,
+                               cudaMemcpyDeviceToHost));
+
+	bool bTestResult = true;
+	
+	// check result only if profile count == 1 or outplace transform
+	if (profile_count == 1 || outPlace)
+	{
+		for( int i = 0; i < (vectorLength*batch_size); ++i )
+		{
+			if (direction == CUFFT_FORWARD)
+			{
+				if (0 == (i % vectorLength))
+				{
+					if (h_output_signal[i].x != vectorLength)
+					{
+						bTestResult = false;
+						break;
+					}
+				}
+				else
+				{
+					if (h_output_signal[i].x != 0)
+					{
+						bTestResult = false;
+						break;
+					}
+				}
+
+				if (h_output_signal[ i ].y != 0)
+				{
+					bTestResult = false;
+					break;
+				}
+			}
+			else
+			{
+				/*****************************************************************************************
+				* cuFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set 
+				* followed by an inverse FFT on the resulting set yields data that is equal to the input, 
+				* scaled by the number of elements. Scaling either transform by the reciprocal of the size 
+				* of the data set is left for the user to perform as seen fit
+				******************************************************************************************/
+				//Inverse FFT
+				if (h_output_signal[i].x != vectorLength)
+				{
+					bTestResult = false;
+					break;
+				}
+				if (h_output_signal[ i ].y != 0)
+				{
+					bTestResult = false;
+					break;
+				}
+			}
+		}
+	}
+	
+	//Destroy CUFFT context
+	checkCudaErrors(cufftDestroy(plan));
+
+    // cleanup memory
+	cleanup<T, T>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+
+	return bTestResult;
+}
+
+/*
+* Run 2D C2R FFT - Basic Data Layout
+*/
+template<typename In, typename Out>
+bool run2DFFTTransformC2R(size_t lengthX, size_t lengthY, cufftType type, int profile_count, int batch_size, bool outPlace)
+{
+	int rank = 2; //2D Transform
+
+	// Allocate host memory for the signal
+	// cuFFT only supports Hermitian Interleaved for input for C2R
+	size_t invectorLength = lengthX * (lengthY/2 + 1);
+	size_t innerDimLength = outPlace ? lengthY : (lengthY + 2);
+	size_t outvectorLength = lengthX * innerDimLength;
+
+	size_t fftLength = lengthX * lengthY;
+
+	int n[2] = {(int)lengthX, (int)lengthY};
+
+	size_t inmem_size = sizeof(In) * (invectorLength) * batch_size;
+	size_t outmem_size = sizeof(Out) * (outvectorLength) * batch_size;
+
+    In *h_in_signal = (In *)malloc(inmem_size);
+	In *d_in_signal;
+	cufftHandle plan;
+
+	Out *h_output_signal; 
+	Out *d_output_signal;
+
+	Timer tr;
+	double wtime_t =0.0;
+
+	// Allocate device memory for signal		
+	checkCudaErrors(cudaMalloc((void **)&d_in_signal, inmem_size));
+
+	if (outPlace)
+	{
+		h_output_signal = (Out*)malloc(outmem_size);
+
+		// Allocate device memory for output signal		
+		checkCudaErrors(cudaMalloc((void **)&d_output_signal, outmem_size));
+	}
+	else
+	{
+		h_output_signal = (Out*)h_in_signal;
+		d_output_signal = (Out *)d_in_signal;
+	}
+
+	// CUFFT plan	
+	checkCudaErrors(cufftPlanMany(&plan, rank, n,
+						NULL, 1, 0, // *inembed, istride, idist
+						NULL, 1, 0,  // *onembed, ostride, odist
+						type, batch_size));
+
+	// Initalize the memory for the signal
+	memset(h_in_signal, 0, inmem_size);
+	for (size_t j = 0; j < (invectorLength * batch_size); j+= invectorLength)
+	{
+		h_in_signal[j].x = (float)fftLength;
+	}
+	
+	// Copy host memory to device
+	checkCudaErrors(cudaMemcpy(d_in_signal, h_in_signal, inmem_size,
+							   cudaMemcpyHostToDevice));
+	
+	// Transform signal and kernel
+	for( int i = 0; i < profile_count; ++i )
+	{
+		tr.Start();
+
+		switch (type)
+		{
+		case CUFFT_Z2D:
+			checkCudaErrors(cufftExecZ2D(plan, (cufftDoubleComplex*)d_in_signal, (cufftDoubleReal *)d_output_signal));
+			break;
+		case CUFFT_C2R:
+			checkCudaErrors(cufftExecC2R(plan, (cufftComplex *)d_in_signal, (cufftReal *)d_output_signal));
+			break;
+		default: 
+			printf("Input Arguments ERROR!! Invalid FFT type. Exiting..\n"); 
+			cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+			exit(EXIT_FAILURE);
+		}
+
+		cudaDeviceSynchronize();
+		wtime_t += tr.Sample();
+	
+		//Ignore the first time sample if profiling for more than one iteration
+		if (i == 0 && profile_count > 1) wtime_t = 0.0;
+	}
+
+	double iter = (double)( profile_count > 1 ? (profile_count - 1) : profile_count);
+	double wtime = wtime_t/iter;
+	size_t totalLen = 1;
+	totalLen = fftLength;
+	double opsconst = 2.5 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	printf("\nExecution wall time: %lf ms\n", 1000.0*wtime);
+	printf("Execution gflops: %lf \n", ((double)batch_size * opsconst)/(1000000000.0*wtime));
+
+    // Copy device memory to host
+    checkCudaErrors(cudaMemcpy(h_output_signal, d_output_signal, outmem_size,
+                               cudaMemcpyDeviceToHost));
+	
+	bool bTestResult = true;
+	
+	// check result only if profile count == 1 or outplace transform
+	if (profile_count == 1 || outPlace)
+	{
+		int idx = 0;
+		for (int i = 0; i < batch_size; i++)
+		{
+			for( int j = 0; j < lengthX; ++j )
+			{
+				for (int k = 0; k < lengthY; k++)
+				{
+					idx = i*outvectorLength + j*innerDimLength + k;
+
+					/*****************************************************************************************
+					* cuFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set 
+					* followed by an inverse FFT on the resulting set yields data that is equal to the input, 
+					* scaled by the number of elements. Scaling either transform by the reciprocal of the size 
+					* of the data set is left for the user to perform as seen fit
+					******************************************************************************************/
+					if (h_output_signal[idx] != (float)fftLength)
+					{
+						printf("failed at %d, value=%f\n", idx, h_output_signal[idx] );
+						bTestResult = false;
+						break;				
+					}
+				}
+			}
+		}
+	}
+
+    //Destroy CUFFT context
+    checkCudaErrors(cufftDestroy(plan));
+
+    // cleanup memory
+	cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+
+	return bTestResult;
+}
+
+/*
+* Run 2D R2C FFT - Basic Data Layout
+*/
+template<typename In, typename Out>
+bool run2DFFTTransformR2C(size_t lengthX, size_t lengthY, cufftType type, int profile_count, int batch_size, bool outPlace)
+{
+	int rank = 2; //2D Transform
+
+	int n[2] = {(int)lengthX, (int)lengthY};
+	size_t ivectorLength = lengthX * (lengthY+2);
+	size_t ovectorLength = lengthX * (lengthY/2 + 1);
+	size_t fftLength = lengthX*lengthY;
+
+	// Allocate host memory for the signal
+	// cuFFT only supports Hermitian Interleaved for output of R2C
+	// Hence output size is N/2 + 1 complex. So allocate N + 2 real input
+	size_t mem_size = sizeof(In) * ivectorLength * batch_size;
+    In *h_in_signal = (In *)malloc(mem_size);
+	In *d_in_signal;
+
+	Out *h_output_signal;
+	Out *d_output_signal;
+
+	// Allocate device memory for signal
+    checkCudaErrors(cudaMalloc((void **)&d_in_signal, mem_size));
+    
+	if (outPlace)
+	{
+		h_output_signal = (Out *)malloc(mem_size);
+
+		// Allocate device memory for signal
+		checkCudaErrors(cudaMalloc((void **)&d_output_signal, mem_size));
+	}
+	else
+	{
+		h_output_signal =  (Out*) h_in_signal;
+		d_output_signal = (Out*) d_in_signal;
+	}
+
+	// CUFFT plan
+    cufftHandle plan;
+	checkCudaErrors(cufftPlanMany(&plan, rank, n,
+							NULL, 1, (int)ivectorLength, // *inembed, istride, idist
+							NULL, 1, (int)ovectorLength,  // *onembed, ostride, odist
+							type, batch_size));
+
+	// Initalize the memory for the signal
+	for (unsigned int j = 0; j < (ivectorLength * batch_size); ++j)
+	{
+		h_in_signal[j] = 1;
+	}
+
+	// Copy host memory to device
+	checkCudaErrors(cudaMemcpy(d_in_signal, h_in_signal, mem_size,
+							   cudaMemcpyHostToDevice));
+	
+	Timer tr;
+	double wtime_t = 0.0;
+			
+	for (int i = 0; i < profile_count; i++)
+	{
+		// Transform signal and kernel
+		tr.Start();
+
+		switch (type)
+		{
+		case CUFFT_D2Z:
+			checkCudaErrors(cufftExecD2Z(plan, (cufftDoubleReal*)d_in_signal, (cufftDoubleComplex*)d_output_signal));
+			break;
+		case CUFFT_R2C:
+			checkCudaErrors(cufftExecR2C(plan, (cufftReal*)d_in_signal, (cufftComplex*)d_output_signal));
+			break;
+		default: 
+			printf("Input Arguments ERROR!! Invalid FFT type. Exiting..\n"); 
+			cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+			exit(EXIT_FAILURE);
+		}
+
+		cudaDeviceSynchronize();
+		wtime_t += tr.Sample();
+
+		//Ignore the first time sample if profiling for more than one iteration
+		if (i == 0 && profile_count > 1) wtime_t = 0.0;
+	}
+
+	double iter = (double)( profile_count > 1 ? (profile_count - 1) : profile_count);
+	double wtime = wtime_t/iter;
+	size_t totalLen = 1;
+	totalLen = fftLength;
+	double opsconst = 2.5 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	printf("\nExecution wall time: %lf ms\n", 1000.0*wtime);
+	printf("Execution gflops: %lf \n", ((double)batch_size * opsconst)/(1000000000.0*wtime));
+
+    // Copy device memory to host
+    checkCudaErrors(cudaMemcpy(h_output_signal, d_output_signal, mem_size,
+                               cudaMemcpyDeviceToHost));
+
+    	bool bTestResult = true;
+	
+	// check result only if profile count == 1 or outplace transform
+	if (profile_count == 1 || outPlace)
+	{
+		for( int i = 0; i < ovectorLength*batch_size; ++i )
+		{
+			//Check real part of 1st element of every batch is equal to length
+			if (i == 0 || (0 == (i % ovectorLength)))
+			{
+				if (h_output_signal[i].x != fftLength)
+				{
+					bTestResult = false;
+					break;
+				}
+			}
+			else
+			{
+				if (h_output_signal[i].x != 0)
+				{
+					bTestResult = false;
+					break;
+				}
+			}
+
+			if (h_output_signal[ i ].y != 0)
+			{
+				bTestResult = false;
+				break;
+			}
+		}
+	}
+
+    //Destroy CUFFT context
+    checkCudaErrors(cufftDestroy(plan));
+
+    // cleanup memory
+	cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+
+	return bTestResult;
+}
+
+/*
+* Run 2D C2C FFT - Basic Data layout
+*/
+template<typename T>
+bool run2DFFTTransformC2C(size_t lengthX, size_t lengthY, int direction, cufftType type, int profile_count, int batch_size, bool outPlace)
+{
+	int rank = 2; //2D transform
+
+	int n[2] = {(int)lengthX, (int)lengthY};
+	size_t vectorLength = lengthX * lengthY;
+	// Allocate host memory for the signal
+	size_t mem_size = sizeof(T) * vectorLength * batch_size;
+    T *h_in_signal = (T *)malloc(mem_size);
+	
+	T *h_output_signal;
+	T *d_in_signal;
+	T *d_output_signal;
+
+    // Allocate device memory for signal
+    checkCudaErrors(cudaMalloc((void **)&d_in_signal, mem_size));
+
+	if (outPlace)
+	{
+		h_output_signal = (T *)malloc(mem_size);
+
+		// Allocate device memory for out signal
+		checkCudaErrors(cudaMalloc((void **)&d_output_signal, mem_size));
+	}
+	else
+	{
+		h_output_signal = h_in_signal;
+		d_output_signal = d_in_signal;
+	}
+
+    // CUFFT plan
+    cufftHandle plan;
+	checkCudaErrors(cufftPlanMany(&plan, rank, n,
+							NULL, 1, (int)vectorLength, // *inembed, istride, idist
+							NULL, 1, (int)vectorLength,  // *onembed, ostride, odist
+							type, batch_size));
+
+	// Initalize the memory for the signal
+	if (direction == CUFFT_FORWARD)
+	{
+		for (size_t idx = 0; idx < (vectorLength*batch_size); ++idx)
+		{
+			h_in_signal[idx].x = 1;
+			h_in_signal[idx].y = 0;
+		}
+	}
+	else
+	{
+		//Inverse FFT
+		memset(h_in_signal, 0, mem_size);
+
+		for (size_t idx = 0; idx < (vectorLength*batch_size); idx+=vectorLength)
+		{
+			h_in_signal[idx].x = (float) (vectorLength);
+		}
+	}
+
+	// Copy host memory to device
+	checkCudaErrors(cudaMemcpy(d_in_signal, h_in_signal, mem_size,
+							   cudaMemcpyHostToDevice));
+	
+	Timer tr;
+	double wtime_t = 0.0;
+
+	for (int i = 0; i < profile_count; i++)
+	{
+		tr.Start();
+
+		//Execute FFT
+		switch (type)
+		{
+		case CUFFT_C2C:
+			checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_in_signal, (cufftComplex *)d_output_signal, direction));
+			break;
+		case CUFFT_Z2Z:
+			checkCudaErrors(cufftExecZ2Z(plan, (cufftDoubleComplex *)d_in_signal, (cufftDoubleComplex *)d_output_signal, direction));
+			break;
+		default: 
+			printf("Input Arguments ERROR!! Invalid FFT type. Exiting..\n"); 
+			cleanup<T, T>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+			exit(EXIT_FAILURE);
+		}
+
+		cudaDeviceSynchronize();
+		wtime_t += tr.Sample();
+
+		//Ignore the first time sample if profiling for more than one iteration
+		if (i == 0 && profile_count > 1) wtime_t = 0.0;
+	}
+
+	double iter = (double)( profile_count > 1 ? (profile_count - 1) : profile_count);
+	double wtime = wtime_t/iter;
+	size_t totalLen = 1;
+	totalLen = vectorLength;
+	double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	printf("\nExecution wall time: %lf ms\n", 1000.0*wtime);
+	printf("Execution gflops: %lf \n", ((double)batch_size * opsconst)/(1000000000.0*wtime));
+
+    // Copy device memory to host
+    checkCudaErrors(cudaMemcpy(h_output_signal, d_output_signal, mem_size,
+                               cudaMemcpyDeviceToHost));
+
+	bool bTestResult = true;
+
+	// check result only if profile count == 1 or outplace transform
+	if (profile_count == 1 || outPlace)
+	{
+		for( int i = 0; i < (vectorLength*batch_size); ++i )
+		{
+			if (direction == CUFFT_FORWARD)
+			{
+				if (0 == (i % vectorLength))
+				{
+					if (h_output_signal[i].x != vectorLength)
+					{
+						bTestResult = false;
+						break;
+					}
+				}
+				else
+				{
+					if (h_output_signal[i].x != 0)
+					{
+						bTestResult = false;
+						break;
+					}
+				}
+
+				if (h_output_signal[ i ].y != 0)
+				{
+					bTestResult = false;
+					break;
+				}
+			}
+			else
+			{
+				/*****************************************************************************************
+				* cuFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set 
+				* followed by an inverse FFT on the resulting set yields data that is equal to the input, 
+				* scaled by the number of elements. Scaling either transform by the reciprocal of the size 
+				* of the data set is left for the user to perform as seen fit
+				******************************************************************************************/
+				//Inverse FFT
+				if (h_output_signal[i].x != vectorLength)
+				{
+					bTestResult = false;
+					break;
+				}
+				if (h_output_signal[ i ].y != 0)
+				{
+					bTestResult = false;
+					break;
+				}
+			}
+		}
+	}
+	
+    //Destroy CUFFT context
+    checkCudaErrors(cufftDestroy(plan));
+
+    // cleanup memory
+	cleanup<T, T>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+
+	return bTestResult;
+}
+
+/*
+* Run 1D C2R FFT - Basic Data Layout
+*/
+template<typename In, typename Out>
+bool run1DFFTTransformC2R(size_t lengthX, cufftType type, int profile_count, int batch_size, bool outPlace)
+{
+	int rank = 1; //1D Transform
+
+	// Allocate host memory for the signal
+	// cuFFT only supports Hermitian Interleaved for input for C2R
+	size_t invectorLength = lengthX/2 + 1;
+	size_t outvectorLength = outPlace ? lengthX  : lengthX + 2;
+	size_t fftLength = lengthX;
+	
+	size_t inmem_size = sizeof(In) * (invectorLength) * batch_size;
+	size_t outmem_size = sizeof(Out) * (outvectorLength) * batch_size;
+
+    In *h_in_signal = (In *)malloc(inmem_size);
+	In *d_in_signal;
+	cufftHandle plan;
+
+	Out *h_output_signal; 
+	Out *d_output_signal;
+
+	Timer tr;
+	double wtime_t =0.0;
+
+	// Allocate device memory for signal		
+	checkCudaErrors(cudaMalloc((void **)&d_in_signal, inmem_size));
+
+	if (outPlace)
+	{
+		h_output_signal = (Out*)malloc(outmem_size);
+
+		// Allocate device memory for output signal		
+		checkCudaErrors(cudaMalloc((void **)&d_output_signal, outmem_size));
+	}
+	else
+	{
+		h_output_signal = (Out*)h_in_signal;
+		d_output_signal = (Out *)d_in_signal;
+	}
+
+	// CUFFT plan	
+	checkCudaErrors(cufftPlanMany(&plan, rank, (int*)&lengthX,
+						NULL, 1, 0, // *inembed, istride, idist
+						NULL, 1, 0,  // *onembed, ostride, odist
+						type, batch_size));
+
+	// Initalize the memory for the signal
+	memset(h_in_signal, 0, inmem_size);
+	for (size_t j = 0; j < (invectorLength * batch_size); j+= (invectorLength))
+	{
+		h_in_signal[j].x = (float) fftLength;
+	}
+
+	// Copy host memory to device
+	checkCudaErrors(cudaMemcpy(d_in_signal, h_in_signal, inmem_size,
+							   cudaMemcpyHostToDevice));
+
+	for( int i = 0; i < profile_count; ++i )
+	{
+
+		tr.Start();
+
+		switch (type)
+		{
+		case CUFFT_Z2D:
+			checkCudaErrors(cufftExecZ2D(plan, (cufftDoubleComplex*)d_in_signal, (cufftDoubleReal *)d_output_signal));
+			break;
+		case CUFFT_C2R:
+			checkCudaErrors(cufftExecC2R(plan, (cufftComplex *)d_in_signal, (cufftReal *)d_output_signal));
+			break;
+		default: 
+			printf("Input Arguments ERROR!! Invalid FFT type. Exiting..\n"); 
+			cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+			exit(EXIT_FAILURE);
+		}
+
+		cudaDeviceSynchronize();
+		wtime_t += tr.Sample();
+	
+		//Ignore the first time sample if profiling for more than one iteration
+		if (i == 0 && profile_count > 1) wtime_t = 0.0;
+	}
+
+	double iter = (double)( profile_count > 1 ? (profile_count - 1) : profile_count);
+	double wtime = wtime_t/iter;
+	size_t totalLen = 1;
+	totalLen = lengthX;
+	double opsconst = 2.5 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	printf("\nExecution wall time: %lf ms\n", 1000.0*wtime);
+	printf("Execution gflops: %lf \n", ((double)batch_size * opsconst)/(1000000000.0*wtime));
+
+    // Copy device memory to host
+    checkCudaErrors(cudaMemcpy(h_output_signal, d_output_signal, outmem_size,
+                               cudaMemcpyDeviceToHost));
+	
+	bool bTestResult = true;
+	
+	// check result only if profile count == 1 or outplace transform
+	if (profile_count == 1 || outPlace)
+	{
+		int idx = 0;
+		for (int i = 0; i < batch_size; i++)
+		{
+			for( int j = 0; j < lengthX; ++j )
+			{
+				idx = i*outvectorLength +  j;
+				/*****************************************************************************************
+				* cuFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set 
+				* followed by an inverse FFT on the resulting set yields data that is equal to the input, 
+				* scaled by the number of elements. Scaling either transform by the reciprocal of the size 
+				* of the data set is left for the user to perform as seen fit
+				******************************************************************************************/
+				if (h_output_signal[idx] != fftLength)
+				{
+					printf("failed at %d, value=%f\n", idx,h_output_signal[idx]);
+					bTestResult = false;
+					break;
+				}
+			}
+		}
+	}
+
+    //Destroy CUFFT context
+    checkCudaErrors(cufftDestroy(plan));
+
+    // cleanup memory
+	cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+
+	return bTestResult;
+}
+
+/*
+* Run 1D R2C FFT - Basic Data Layout
+*/
+template<typename In, typename Out>
+bool run1DFFTTransformR2C(size_t lengthX, cufftType type, int profile_count, int batch_size, bool outPlace)
+{
+	int rank = 1; //1D Transform
+
+	// Allocate host memory for the signal
+	// cuFFT only supports Hermitian Interleaved for output of R2C
+	// Hence output size is N/2 + 1 complex. So allocate N + 2 real input
+	size_t mem_size = sizeof(In) * (lengthX+2) * batch_size;
+    In *h_in_signal = (In *)malloc(mem_size);
+	In *d_in_signal;
+
+	Out *h_output_signal;
+	Out *d_output_signal;
+
+	// Allocate device memory for signal
+    checkCudaErrors(cudaMalloc((void **)&d_in_signal, mem_size));
+    
+	if (outPlace)
+	{
+		h_output_signal = (Out *)malloc(mem_size);
+
+		// Allocate device memory for signal
+		checkCudaErrors(cudaMalloc((void **)&d_output_signal, mem_size));
+	}
+	else
+	{
+		h_output_signal =  (Out*) h_in_signal;
+		d_output_signal = (Out*) d_in_signal;
+	}
+
+	// CUFFT plan
+    cufftHandle plan;
+	checkCudaErrors(cufftPlanMany(&plan, rank, (int*)&lengthX,
+							NULL, 1, (int)(lengthX+2), // *inembed, istride, idist
+							NULL, 1, (int)(lengthX/2 + 1),  // *onembed, ostride, odist
+							type, batch_size));
+
+	// Initalize the memory for the signal
+	for (unsigned int j = 0; j < ((lengthX+2) * batch_size); ++j)
+	{
+		h_in_signal[j] = 1;
+	}
+
+	// Copy host memory to device
+	checkCudaErrors(cudaMemcpy(d_in_signal, h_in_signal, mem_size,
+							   cudaMemcpyHostToDevice));
+	
+	Timer tr;
+	double wtime_t = 0.0;
+			
+	for (int i = 0; i < profile_count; i++)
+	{
+		// Transform signal and kernel
+		tr.Start();
+
+		switch (type)
+		{
+		case CUFFT_D2Z:
+			checkCudaErrors(cufftExecD2Z(plan, (cufftDoubleReal*)d_in_signal, (cufftDoubleComplex*)d_output_signal));
+			break;
+		case CUFFT_R2C:
+			checkCudaErrors(cufftExecR2C(plan, (cufftReal*)d_in_signal, (cufftComplex*)d_output_signal));
+			break;
+		default: 
+			printf("Input Arguments ERROR!! Invalid FFT type. Exiting..\n"); 
+			cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+			exit(EXIT_FAILURE);
+		}
+
+		cudaDeviceSynchronize();
+		wtime_t += tr.Sample();
+
+		//Ignore the first time sample if profiling for more than one iteration
+		if (i == 0 && profile_count > 1) wtime_t = 0.0;
+	}
+
+	double iter = (double)( profile_count > 1 ? (profile_count - 1) : profile_count);
+	double wtime = wtime_t/iter;
+	size_t totalLen = 1;
+	totalLen = lengthX;
+	double opsconst = 2.5 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	printf("\nExecution wall time: %lf ms\n", 1000.0*wtime);
+	printf("Execution gflops: %lf \n", ((double)batch_size * opsconst)/(1000000000.0*wtime));
+
+    // Copy device memory to host
+    checkCudaErrors(cudaMemcpy(h_output_signal, d_output_signal, mem_size,
+                               cudaMemcpyDeviceToHost));
+
+    //// check result
+	printf("h_output_signal[0].x=%f, h_output_signal[0].y=%f \n", h_output_signal[0].x, h_output_signal[0].y);
+	printf("h_output_signal[1].x=%f, h_output_signal[1].y=%f \n", h_output_signal[1].x, h_output_signal[1].y);
+	if (batch_size > 1)
+	{
+		printf("h_output_signal[lengthX/2 + 1].x=%f, h_output_signal[lengthX/2 + 1].y=%f \n", h_output_signal[lengthX/2 + 1].x, h_output_signal[lengthX/2 + 1].y);
+	}
+
+	bool bTestResult = true;
+	// check result only if profile count == 1 or outplace transform
+	if (profile_count == 1 || outPlace)
+	{
+		for( int i = 0; i < (lengthX/2 + 1)*batch_size; ++i )
+		{
+			//Check real part of 1st element of every batch is equal to length
+			if (i == 0 || (0 == (i % (lengthX/2 + 1))))
+			{
+				if (h_output_signal[i].x != lengthX)
+				{
+					bTestResult = false;
+					break;
+				}
+			}
+			else
+			{
+				if (h_output_signal[i].x != 0)
+				{
+					bTestResult = false;
+					break;
+				}
+			}
+
+			if (h_output_signal[ i ].y != 0)
+			{
+				bTestResult = false;
+				break;
+			}
+		}
+	}
+
+    //Destroy CUFFT context
+    checkCudaErrors(cufftDestroy(plan));
+
+    // cleanup memory
+	cleanup<In, Out>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+
+	return bTestResult;
+}
+
+/*
+* Run 1D C2C FFT - Basic Data Layout
+*/
+template<typename T>
+bool run1DFFTTransformC2C(size_t lengthX, int direction, cufftType type, int profile_count, int batch_size, bool outPlace)
+{
+	int rank = 1; //1D transform
+
+	// Allocate host memory for the signal
+	size_t mem_size = sizeof(T) * lengthX * batch_size;
+    T *h_in_signal = (T *)malloc(mem_size);
+	
+	T *h_output_signal;
+	T *d_in_signal;
+	T *d_output_signal;
+
+    // Allocate device memory for signal
+    checkCudaErrors(cudaMalloc((void **)&d_in_signal, mem_size));
+
+	if (outPlace)
+	{
+		h_output_signal = (T *)malloc(mem_size);
+
+		// Allocate device memory for out signal
+		checkCudaErrors(cudaMalloc((void **)&d_output_signal, mem_size));
+	}
+	else
+	{
+		h_output_signal = h_in_signal;
+		d_output_signal = d_in_signal;
+	}
+
+    // CUFFT plan
+    cufftHandle plan;
+	checkCudaErrors(cufftPlanMany(&plan, rank, (int*)&lengthX,
+							NULL, 1, (int)lengthX, // *inembed, istride, idist
+							NULL, 1, (int)lengthX,  // *onembed, ostride, odist
+							type, batch_size));
+	
+
+	// Initalize the memory for the signal
+	if (direction == CUFFT_FORWARD)
+	{
+		for (size_t idx = 0; idx < (lengthX*batch_size); ++idx)
+		{
+			h_in_signal[idx].x = 1;
+			h_in_signal[idx].y = 0;
+		}
+	}
+	else
+	{
+		//Inverse FFT
+		memset(h_in_signal, 0, mem_size);
+
+		for (size_t idx = 0; idx < (lengthX*batch_size); idx+=lengthX)
+		{
+			h_in_signal[idx].x = (float) (lengthX);
+		}
+	}
+
+	// Copy host memory to device
+	checkCudaErrors(cudaMemcpy(d_in_signal, h_in_signal, mem_size,
+							   cudaMemcpyHostToDevice));
+	
+	Timer tr;
+	double wtime_t = 0.0;
+
+	for (int i = 0; i < profile_count; i++)
+	{
+		tr.Start();
+
+		//Execute FFT
+		switch (type)
+		{
+		case CUFFT_C2C:
+			checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_in_signal, (cufftComplex *)d_output_signal, direction));
+			break;
+		case CUFFT_Z2Z:
+			checkCudaErrors(cufftExecZ2Z(plan, (cufftDoubleComplex *)d_in_signal, (cufftDoubleComplex *)d_output_signal, direction));
+			break;
+		default: 
+			printf("Input Arguments ERROR!! Invalid FFT type. Exiting..\n"); 
+			cleanup<T, T>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+			exit(EXIT_FAILURE);
+		}
+
+		cudaDeviceSynchronize();
+		wtime_t += tr.Sample();
+
+		//Ignore the first time sample if profiling for more than one iteration
+		if (i == 0 && profile_count > 1) wtime_t = 0.0;
+	}
+
+	double iter = (double)( profile_count > 1 ? (profile_count - 1) : profile_count);
+	double wtime = wtime_t/iter;
+	size_t totalLen = 1;
+	totalLen = lengthX;
+	double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
+
+	printf("\nExecution wall time: %lf ms\n", 1000.0*wtime);
+	printf("Execution gflops: %lf \n", ((double)batch_size * opsconst)/(1000000000.0*wtime));
+	
+    // Copy device memory to host
+    //cufftComplex *h_output_signal = h_signal;
+    checkCudaErrors(cudaMemcpy(h_output_signal, d_output_signal, mem_size,
+                               cudaMemcpyDeviceToHost));
+
+	bool bTestResult = true;
+	// check result only if profile count == 1 or outplace transform
+	if (profile_count == 1 || outPlace)
+	{
+		for( int i = 0; i < (lengthX*batch_size); ++i )
+		{
+			if (direction == CUFFT_FORWARD)
+			{
+				if (0 == (i % lengthX))
+				{
+					if (h_output_signal[i].x != lengthX)
+					{
+						bTestResult = false;
+						break;
+					}
+				}
+				else
+				{
+					if (h_output_signal[i].x != 0)
+					{
+						bTestResult = false;
+						break;
+					}
+				}
+
+				if (h_output_signal[ i ].y != 0)
+				{
+					bTestResult = false;
+					break;
+				}
+			}
+			else
+			{
+				/*****************************************************************************************
+				* cuFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set 
+				* followed by an inverse FFT on the resulting set yields data that is equal to the input, 
+				* scaled by the number of elements. Scaling either transform by the reciprocal of the size 
+				* of the data set is left for the user to perform as seen fit
+				******************************************************************************************/
+				//Inverse FFT
+				if (h_output_signal[i].x != lengthX)
+				{
+					bTestResult = false;
+					break;
+				}
+				if (h_output_signal[ i ].y != 0)
+				{
+					bTestResult = false;
+					break;
+				}
+			}
+		}
+	}
+
+    //Destroy CUFFT context
+    checkCudaErrors(cufftDestroy(plan));
+
+    // cleanup memory
+	cleanup<T, T>(h_in_signal, d_in_signal, h_output_signal, d_output_signal, outPlace);
+
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+
+	return bTestResult;
+}
+
+
+/*
+* Clean up resources
+*/
+template<typename In, typename Out>
+void cleanup(In *h_in_signal, In *d_in_signal, Out *h_output_signal, Out *d_output_signal, bool outPlace)
+{
+	    // cleanup memory
+	if (h_in_signal)
+		free(h_in_signal);
+	if (h_output_signal && outPlace)
+		free(h_output_signal);
+    if (d_in_signal)
+		checkCudaErrors(cudaFree(d_in_signal));
+	if (d_output_signal && outPlace)
+		checkCudaErrors(cudaFree(d_output_signal));
+}
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list