[clfft] 24/74: removing VLIW transpose code
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Jan 14 19:52:14 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository clfft.
commit f069b93c9c16e8d2528af3627d8cffd4277a7d1d
Author: bragadeesh <bragadeesh.natarajan at amd>
Date: Tue Dec 1 18:59:07 2015 -0800
removing VLIW transpose code
---
src/library/CMakeLists.txt | 55 +-
src/library/action.cpp | 12 -
src/library/action.h | 55 --
src/library/generator.h | 1 -
src/library/generator.transpose.vliw.cpp | 906 -------------------------------
src/library/generator.transpose.vliw.h | 25 -
src/library/plan.cpp | 45 +-
src/library/transform.cpp | 3 +-
8 files changed, 36 insertions(+), 1066 deletions(-)
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 3025d0a..c58de4e 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -1,12 +1,12 @@
# ########################################################################
# Copyright 2013 Advanced Micro Devices, Inc.
-#
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,16 +15,15 @@
# ########################################################################
# List the names of common files to compile across all platforms
-set( clFFT.Source transform.cpp
- accessors.cpp
- plan.cpp
- repo.cpp
- generator.stockham.cpp
- generator.transpose.vliw.cpp
+set( clFFT.Source transform.cpp
+ accessors.cpp
+ plan.cpp
+ repo.cpp
+ generator.stockham.cpp
generator.transpose.gcn.cpp
- generator.transpose.square.cpp
+ generator.transpose.square.cpp
generator.copy.cpp
- lifetime.cpp
+ lifetime.cpp
fft_binary_lookup.cpp
md5sum.c
action.cpp
@@ -35,23 +34,22 @@ if( MSVC )
set( clFFT.Source ${clFFT.Source} dllmain.cpp )
endif( )
-set( clFFT.Headers private.h
+set( clFFT.Headers private.h
action.h
- repo.h
- plan.h
- lock.h
- mainpage.h
- generator.h
- generator.stockham.h
- generator.transpose.vliw.h
- generator.transpose.gcn.h
- generator.transpose.square.h
+ repo.h
+ plan.h
+ lock.h
+ mainpage.h
+ generator.h
+ generator.stockham.h
+ generator.transpose.gcn.h
+ generator.transpose.square.h
fft_binary_lookup.h
md5sum.h
- ../include/stdafx.h
- ../include/unicode.compatibility.h
- ../include/targetver.h
- ../include/clAmdFft.h
+ ../include/stdafx.h
+ ../include/unicode.compatibility.h
+ ../include/targetver.h
+ ../include/clAmdFft.h
../include/clFFT.h )
set( clFFT.Files ${clFFT.Source} ${clFFT.Headers} )
@@ -59,7 +57,7 @@ set( clFFT.Files ${clFFT.Source} ${clFFT.Headers} )
# For a rainy day, add pre-compiled header support
#if( MSVC )
# if (USE_MSVC_PCH)
-
+
# set_source_files_properties(LungAnalysisPCH.cxx
# PROPERTIES
# COMPILE_FLAGS "/YcLungAnalysisPCH.h"
@@ -71,13 +69,13 @@ set( clFFT.Files ${clFFT.Source} ${clFFT.Headers} )
# COMPILE_FLAGS "/YuLungAnalysisPCH.h"
# )
# endforeach( src_file ${UPMC_LA_SRCS} )
-
+
# list(APPEND UPMC_LA_SRCS LungAnalysisPCH.cxx)
# list(APPEND UPMC_LA_HDRS LungAnalysisPCH.h)
# endif(USE_MSVC_PCH)
#endif (MSVC)
-
+
# add_definitions( ${Boost_LIB_DIAGNOSTIC_DEFINITIONS} )
add_definitions( "/DCLFFT_EXPORTS" )
@@ -106,4 +104,3 @@ install( TARGETS clFFT
LIBRARY DESTINATION lib${SUFFIX_LIB}
ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
)
-
diff --git a/src/library/action.cpp b/src/library/action.cpp
index 60508ff..7b2cfc9 100644
--- a/src/library/action.cpp
+++ b/src/library/action.cpp
@@ -41,18 +41,6 @@ FFTCopyAction::FFTCopyAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_comman
err = CLFFT_SUCCESS;
}
-FFTTransposeVLIWAction::FFTTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
- : FFTAction(plan, err)
-{
- if (err != CLFFT_SUCCESS)
- {
- // FFTAction() failed, exit constructor
- return;
- }
-
- err = CLFFT_SUCCESS;
-}
-
FFTTransposeGCNAction::FFTTransposeGCNAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
: FFTAction(plan, err)
{
diff --git a/src/library/action.h b/src/library/action.h
index 9c14a74..1619594 100644
--- a/src/library/action.h
+++ b/src/library/action.h
@@ -54,20 +54,6 @@ public:
};
-//
-// FFTTransposeVLIWAction
-//
-// Base class for every TransposeVLIW action for the FFT.
-// Currently do nothing special. The kernel generation and compilation occurs
-// by the subclass FFTGeneratedTransposeVLIWAction
-//
-class FFTTransposeVLIWAction : public FFTAction
-{
-public:
- FFTTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
-
- clfftGenerators getGenerator() { return Transpose_VLIW; }
-};
//
@@ -185,47 +171,6 @@ public:
}
};
-//
-// FFTGeneratedTransposeVLIWAction
-//
-// Implements a TransposeVLIW action for the FFT
-// Its signature is represented by FFTKernelGenKeyParams structure
-//
-// This class implements:
-// - the generation of the kernel string
-// - the build of the kernel
-//
-// The structure FFTKernelGenKeyParams is used to characterize and generate
-// the appropriate transpose kernel. That structure is used for the signature of
-// this action. It is common to Stockham, copy and transpose methods. For
-// convenience, this structure is used for every FFTGenerated*Action class,
-// but in practice the transpose action only use a few information of that
-// structure, so a proper structure should be used instead.
-//
-class FFTGeneratedTransposeVLIWAction : public FFTTransposeVLIWAction
-{
-public:
- FFTGeneratedTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
-
- typedef FFTKernelSignature<FFTKernelGenKeyParams, FFT_DEFAULT_TRANSPOSE_ACTION> Signature;
-
-private:
- Signature signature;
-
- clfftStatus generateKernel (FFTRepo& fftRepo, const cl_command_queue commQueueFFT );
- clfftStatus getWorkSizes (std::vector<size_t> & globalws, std::vector<size_t> & localws);
- clfftStatus initParams ();
-
- bool buildForwardKernel();
- bool buildBackwardKernel();
-
-public:
-
- virtual const Signature * getSignatureData()
- {
- return &this->signature;
- }
-};
diff --git a/src/library/generator.h b/src/library/generator.h
index b27043a..b90090c 100644
--- a/src/library/generator.h
+++ b/src/library/generator.h
@@ -23,7 +23,6 @@
enum clfftGenerators
{
Stockham, // Using the Stockham autosort frameworks
- Transpose_VLIW,
Transpose_GCN,
Transpose_SQUARE,
Copy,
diff --git a/src/library/generator.transpose.vliw.cpp b/src/library/generator.transpose.vliw.cpp
deleted file mode 100644
index b075314..0000000
--- a/src/library/generator.transpose.vliw.cpp
+++ /dev/null
@@ -1,906 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-// clfft.generator.Transpose.cpp : Dynamic run-time generator of openCL transpose kernels
-//
-
-// TODO: generalize the kernel to work with any size
-
-#include "stdafx.h"
-#include <math.h>
-#include "generator.transpose.vliw.h"
-#include "action.h"
-
-
-FFTGeneratedTransposeVLIWAction::FFTGeneratedTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
- : FFTTransposeVLIWAction(plHandle, plan, queue, err)
-{
- if (err != CLFFT_SUCCESS)
- {
- // FFTTransposeVLIWAction() failed, exit
- fprintf(stderr, "FFTTransposeVLIWAction() failed!\n");
- return;
- }
-
- // Initialize the FFTAction::FFTKernelGenKeyParams member
- err = this->initParams();
-
- if (err != CLFFT_SUCCESS)
- {
- fprintf(stderr, "FFTGeneratedTransposeVLIWAction::initParams() failed!\n");
- return;
- }
-
- FFTRepo &fftRepo = FFTRepo::getInstance();
-
- err = this->generateKernel(fftRepo, queue);
-
- if (err != CLFFT_SUCCESS)
- {
- fprintf(stderr, "FFTGeneratedTransposeVLIWAction::generateKernel failed\n");
- return;
- }
-
- err = compileKernels( queue, plHandle, plan);
-
- if (err != CLFFT_SUCCESS)
- {
- fprintf(stderr, "FFTGeneratedTransposeVLIWAction::compileKernels failed\n");
- return;
- }
-
- err = CLFFT_SUCCESS;
-}
-
-
-bool FFTGeneratedTransposeVLIWAction::buildForwardKernel()
-{
- clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
- clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
-
- bool r2c_transform = (inputLayout == CLFFT_REAL);
- bool c2r_transform = (outputLayout == CLFFT_REAL);
- bool real_transform = (r2c_transform || c2r_transform);
-
- return (!real_transform) || r2c_transform;
-}
-
-bool FFTGeneratedTransposeVLIWAction::buildBackwardKernel()
-{
- clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
- clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
-
- bool r2c_transform = (inputLayout == CLFFT_REAL);
- bool c2r_transform = (outputLayout == CLFFT_REAL);
- bool real_transform = (r2c_transform || c2r_transform);
-
- return (!real_transform) || c2r_transform;
-}
-
-
-
-#define QUOTEMARK(x) #x
-
-#define PLANNAR_READ(z0, z1, z2, z3, gp) \
- ss << INDENT2 << QUOTEMARK(z0 = gp[0*HSTRIDE/4*8];\n) \
- << INDENT2 << QUOTEMARK(z1 = gp[1*HSTRIDE/4*8];\n) \
- << INDENT2 << QUOTEMARK(z2 = gp[2*HSTRIDE/4*8];\n) \
- << INDENT2 << QUOTEMARK(z3 = gp[3*HSTRIDE/4*8];\n);
-
-#define INTERLEAVED_READ(z00, z01, z10, z11, z20, z21, z30, z31) \
- ss << INDENT2 << QUOTEMARK(z00 = gp[0*HSTRIDE/4*16];\n) \
- << INDENT2 << QUOTEMARK(z01 = gp[0*HSTRIDE/4*16 + 1];\n) \
- << INDENT2 << QUOTEMARK(z10 = gp[1*HSTRIDE/4*16];\n) \
- << INDENT2 << QUOTEMARK(z11 = gp[1*HSTRIDE/4*16 + 1];\n) \
- << INDENT2 << QUOTEMARK(z20 = gp[2*HSTRIDE/4*16];\n) \
- << INDENT2 << QUOTEMARK(z21 = gp[2*HSTRIDE/4*16 + 1];\n) \
- << INDENT2 << QUOTEMARK(z30 = gp[3*HSTRIDE/4*16];\n) \
- << INDENT2 << QUOTEMARK(z31 = gp[3*HSTRIDE/4*16 + 1];\n);
-
-#define PLANNAR_WRITE(z0, z1, z2, z3, gp) \
- ss << INDENT2 << QUOTEMARK(gp[0*VSTRIDE/4*8] = z0;\n) \
- << INDENT2 << QUOTEMARK(gp[1*VSTRIDE/4*8] = z1;\n) \
- << INDENT2 << QUOTEMARK(gp[2*VSTRIDE/4*8] = z2;\n) \
- << INDENT2 << QUOTEMARK(gp[3*VSTRIDE/4*8] = z3;\n);
-
-#define INTERLEAVED_WRITE(z00, z01, z10, z11, z20, z21, z30, z31) \
- ss << INDENT2 << QUOTEMARK(gp[0*VSTRIDE/4*16] = z00;\n) \
- << INDENT2 << QUOTEMARK(gp[0*VSTRIDE/4*16+1] = z01;\n) \
- << INDENT2 << QUOTEMARK(gp[1*VSTRIDE/4*16] = z10;\n) \
- << INDENT2 << QUOTEMARK(gp[1*VSTRIDE/4*16+1] = z11;\n) \
- << INDENT2 << QUOTEMARK(gp[2*VSTRIDE/4*16] = z20;\n) \
- << INDENT2 << QUOTEMARK(gp[2*VSTRIDE/4*16+1] = z21;\n) \
- << INDENT2 << QUOTEMARK(gp[3*VSTRIDE/4*16] = z30;\n) \
- << INDENT2 << QUOTEMARK(gp[3*VSTRIDE/4*16+1] = z31;\n);
-
-#define WRITE_TO_LDS(lp, jump, z0, z1, z2, z3, part) \
- ss << INDENT2 << QUOTEMARK(lp[0*jump] = z0.part;\n) \
- << INDENT2 << QUOTEMARK(lp[1*jump] = z1.part;\n) \
- << INDENT2 << QUOTEMARK(lp[2*jump] = z2.part;\n) \
- << INDENT2 << QUOTEMARK(lp[3*jump] = z3.part;\n) \
- << INDENT2 << QUOTEMARK(lp += jump*4;\n) \
- << "\n";
-
-typedef enum inputoutputflag_
-{
- PLANNAR_PLANNAR = 1,
- PLANNAR_INTERLEAVED,
- INTERLEAVED_PLANNAR,
- INTERLEAVED_INTERLEAVED,
- ENDTRANSIO
-} transio;
-
-static clfftStatus GenerateTransposeKernel (FFTGeneratedTransposeVLIWAction::Signature & params,
- std::string & kernel)
-{
- kernel.reserve (8000);
- std::stringstream ss (std::stringstream::out);
-
- const char * szIn0 = "";
- const char * szIn1 = "";
- const char * szOut0 = "";
- const char * szOut1 = "";
- const char * typeIn = "";
- const char * typeOut = "";
- const char * INDENT2 = "";
- const char * INDENT = " ";
- const char * datatype="";
- const char * datatype2="";
- bool xyflag = (params.fft_N[0] == params.fft_N[1]) ? false : true;
- transio iotype;
-
-
- if (params.fft_precision == CLFFT_SINGLE)
- {
- datatype = "float";
- datatype2 = "float2";
- }
- else
- {
- datatype = "double";
- datatype2 = "double2";
- ss << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n\n";
- }
-
- size_t hstride = params.fft_N[0];
- size_t vstride = params.fft_N[1];
-
- ss << "#define HSTRIDE " << hstride << "\n";
- ss << "#define VSTRIDE " << vstride << "\n";
- if (xyflag)
- {
- ss << "#define DIMX " << hstride/32 << "\n";
- ss << "#define DIMY " << vstride/32 << "\n";
- ss << "#define DIM ((DIMX > DIMY) ? DIMX : DIMY) \n\n";
- INDENT2 = " ";
- }
- else
- {
- ss << "#define DIM " << vstride/32 << "\n\n";
- INDENT2 = " ";
- }
-
- // Generate the kernel entry point and parameter list
- //
- ss << "__attribute__((reqd_work_group_size(" << unsigned(params.fft_SIMD) << ",1,1)))\n"
- << "__kernel void\n"
- << "fft_trans" << "(";
-
- if (xyflag && params.fft_placeness == CLFFT_INPLACE) return CLFFT_INVALID_ARG_VALUE;
-
- switch (params.fft_inputLayout) {
- case CLFFT_COMPLEX_INTERLEAVED:
- typeIn = datatype2;
- if (params.fft_placeness == CLFFT_INPLACE)
- {
- szIn0 = szOut0 = "gcomplx";
- typeOut = datatype2;
- ss << "__global " << typeIn << " * restrict " << szIn0;
- iotype = INTERLEAVED_INTERLEAVED;
- }
- else
- {
- szIn0 = "gcomplxIn";
- ss << "__global " << typeIn << " * restrict " << szIn0;
-
- if (params.fft_outputLayout == CLFFT_COMPLEX_PLANAR)
- {
- szOut0 = "grealOut";
- szOut1 = "gimagOut";
- typeOut = datatype;
- ss <<", __global " << typeOut <<" * restrict " << szOut0
- <<", __global " << typeOut <<" * restrict " << szOut1;
- iotype = INTERLEAVED_PLANNAR;
- }
- else
- {
- szOut0 = "gcomplxOut";
- typeOut = datatype2;
- ss <<", __global " << typeOut <<" * restrict " << szOut0;
- iotype = INTERLEAVED_INTERLEAVED;
- }
- }
- break;
- case CLFFT_COMPLEX_PLANAR:
- typeIn = datatype;
- if (params.fft_placeness == CLFFT_INPLACE)
- {
- szIn0 = szOut0 = "greal";
- szIn1 = szOut1 = "gimag";
- typeOut = datatype;
- ss << "__global " << typeIn << " * restrict " << szIn0 <<", __global " << typeIn <<" * restrict " << szIn1;
- iotype = PLANNAR_PLANNAR;
- }
- else
- {
- szIn0 = "greadIn";
- szIn1 = "gimagIn";
- ss <<"__global " << typeIn << " * restrict " << szIn0 <<", __global " << typeIn <<" * restrict " << szIn1;
- if (params.fft_outputLayout == CLFFT_COMPLEX_PLANAR)
- {
- szOut0 = "grealOut";
- szOut1 = "gimagOut";
- typeOut = datatype;
- ss << ", __global " << typeOut <<" * restrict " << szOut0 <<", __global " << typeOut <<" * restrict " << szOut1;
- iotype = PLANNAR_PLANNAR;
- }
- else
- {
- szOut0 = "gcomplxOut";
- typeOut = datatype2;
- ss << ", __global " << typeOut <<" * restrict " << szOut0;
- iotype = PLANNAR_INTERLEAVED;
- }
- }
- break;
- default:
- return CLFFT_NOTIMPLEMENTED;
- }
- ss << ")\n{\n";
-
- // Support plannar and interleaved format
- switch (iotype)
- {
- case PLANNAR_INTERLEAVED:
- ss << INDENT << "__local " << typeIn << " ldsa[2048];\n"
- << INDENT << "__local " << typeIn << " ldsb[2048];\n";
- break;
- case INTERLEAVED_PLANNAR:
- case PLANNAR_PLANNAR:
- case INTERLEAVED_INTERLEAVED:
- ss << INDENT << "__local " << typeIn << " ldsa[1024];\n"
- << INDENT << "__local " << typeIn << " ldsb[1024];\n";
- break;
- default:
- return CLFFT_NOTIMPLEMENTED;
- }
-
- ss << INDENT << "uint gid = get_global_id(0);\n"
- << INDENT << "uint me = gid & 0x3fU;\n"
- << INDENT << "uint k = (gid >> 6) % ";
-
- // add batch support
- size_t batchnum = (vstride > hstride) ? ((vstride/32) * (vstride/32 +1) /2)
- : ((hstride/32) * (hstride/32 +1) /2);
- ss << batchnum
- << ";\n"
- << "\n";
-
- ss << INDENT << "// Compute location of blocks\n"
- << INDENT << "int l = DIM+0.5f - native_sqrt((DIM+0.5f)*(DIM+0.5f) - 2.0f * (float)as_int(k));\n"
- << INDENT << "int kl = ((DIM*2+1 - l) * l) >> 1;\n"
- << INDENT << "uint j = k - kl;\n"
- << INDENT << "uint i = l + j;\n"
- << "\n";
-
- ss << INDENT << "uint goa, gob;\n"
- << INDENT << "uint go = ((me & 0x7U) << 2) + ((gid>>6)/" << batchnum << ") * VSTRIDE * HSTRIDE;\n"
- << INDENT << "__global " << datatype << "4 *gp;\n"
- << INDENT << "__local " << datatype << "4 *lp4;\n"
- << INDENT << "uint lo = ((me & 0x7U) << 7) + (me >> 3);\n"
- << INDENT << "uint lot = (me<<2); \n";
-
- switch (iotype)
- {
- case PLANNAR_PLANNAR:
- ss << INDENT << datatype <<"4 z0, z1, z2, z3;\n\n"
- << INDENT << "__local " << typeIn <<" *lp;\n";
- break;
- case PLANNAR_INTERLEAVED:
- ss << INDENT << "__global " << datatype << "4 *gpi;\n"
- << INDENT << "__local " << typeIn <<" *lp;\n"
- << INDENT << datatype << "4 z00, z01, z10, z11, z20, z21, z30, z31;\n\n";
- break;
- case INTERLEAVED_PLANNAR:
- ss << INDENT << "__local " << typeOut <<" *lp;\n"
- << INDENT << datatype << "4 z00, z01, z10, z11, z20, z21, z30, z31;\n\n";
- break;
- case INTERLEAVED_INTERLEAVED:
- ss << INDENT << "__local " << typeIn <<" *lp;\n"
- << INDENT << datatype << "4 z00, z01, z10, z11, z20, z21, z30, z31;\n\n";
- break;
- }
-
- if (xyflag)
- {
- ss << INDENT << "if ( i < DIMX && j < DIMY) \n"
- << INDENT << "{\n";
- }
-
- ss << INDENT2 << "// Array offsets\n"
- << INDENT2 << "goa = go + (i << 5) + j * (HSTRIDE*32) + (me >> 3)*HSTRIDE;\n"
- <<"\n";
-
- ss << INDENT2 << "// Load A block\n"
- << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szIn0 <<" + goa);\n";
-
- switch (iotype)
- {
- case PLANNAR_PLANNAR:
- PLANNAR_READ(z0, z1, z2, z3, gp);
- break;
- case PLANNAR_INTERLEAVED:
- PLANNAR_READ(z00, z10, z20, z30, gp);
- ss << INDENT2 << "gpi = (__global " << datatype << "4 *)(" << szIn1 <<" + goa);\n";
- PLANNAR_READ(z01, z11, z21, z31, gpi);
- break;
- default:
- INTERLEAVED_READ(z00, z01, z10, z11, z20, z21, z30, z31);
- break;
- }
- ss << "\n";
-
- ss << INDENT2 << "// Save into LDS\n";
- switch (iotype)
- {
- case PLANNAR_PLANNAR:
- ss << INDENT2 << "lp = ldsa + lo;\n";
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, x);
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, y);
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, z);
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, w);
- break;
- case PLANNAR_INTERLEAVED:
- ss << INDENT2 << "lp = ldsa + lo*2;\n";
- WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, x);
- WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, y);
- WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, z);
- WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, w);
- ss << INDENT2 << "lp = ldsa + lo*2 + 1;\n";
- WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, x);
- WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, y);
- WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, z);
- WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, w);
- break;
- case INTERLEAVED_INTERLEAVED:
- ss << INDENT2 << "lp = ldsa + lo;\n";
- WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, xy);
- WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, zw);
- WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, xy);
- WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, zw);
- break;
- case INTERLEAVED_PLANNAR:
- ss << INDENT2 << "lp = (__local " << typeOut << "*)ldsa + lo;\n";
- WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, x);
- WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, z);
- WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, x);
- WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, z);
- //next write to lp = ldsa+lo+1024
- ss << INDENT2 << "lp += (1024-32*4);\n";
- WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, y);
- WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, w);
- WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, y);
- WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, w);
- break;
- }
-
- ss << INDENT;
- if (xyflag) ss << "} ";
- ss <<"//End load A block\n\n";
-
- if (xyflag)
- ss << INDENT << "if (i < DIMY && j < DIMX) \n"
- << INDENT << "{\n";
-
- ss << INDENT2 << "// Load B block\n"
- << INDENT2 << "gob = go + (j << 5) + i * (HSTRIDE*32) + (me >> 3)*HSTRIDE;\n"
- << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szIn0 << " + gob);\n";
-
- switch (iotype)
- {
- case PLANNAR_PLANNAR:
- PLANNAR_READ(z0, z1, z2, z3, gp);
- break;
- case PLANNAR_INTERLEAVED:
- PLANNAR_READ(z00, z10, z20, z30, gp);
- ss << INDENT2 << "gpi = (__global " << datatype << "4 *)(" << szIn1 <<" + gob);\n";
- PLANNAR_READ(z01, z11, z21, z31, gpi);
- break;
- default:
- INTERLEAVED_READ(z00, z01, z10, z11, z20, z21, z30, z31);
- break;
- }
- ss << "\n";
-
- ss << INDENT2 << "// Save into LDS\n";
- switch (iotype)
- {
- case PLANNAR_PLANNAR:
- ss << INDENT2 << "lp = ldsb + lo;\n";
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, x);
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, y);
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, z);
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, w);
- break;
- case PLANNAR_INTERLEAVED:
- ss << INDENT2 << "lp = ldsb + lo*2;\n";
- WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, x);
- WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, y);
- WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, z);
- WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, w);
- ss << INDENT2 << "lp = ldsb + lo*2 + 1;\n";
- WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, x);
- WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, y);
- WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, z);
- WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, w);
- break;
- case INTERLEAVED_INTERLEAVED:
- ss << INDENT2 << "lp = ldsb + lo;\n";
- WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, xy);
- WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, zw);
- WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, xy);
- WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, zw);
- break;
- case INTERLEAVED_PLANNAR:
- ss << INDENT2 << "lp = (__local " << typeOut << "*) ldsb + lo;\n";
- WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, x);
- WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, z);
- WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, x);
- WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, z);
- //next write to lp = ldsa+lo+1024
- ss << INDENT2 << "lp += (1024-32*4);\n";
- WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, y);
- WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, w);
- WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, y);
- WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, w);
- break;
- }
-
- ss << INDENT;
- if (xyflag) ss<< "} ";
- ss << "// End load B block\n\n";
-
- ss << INDENT << "barrier(CLK_LOCAL_MEM_FENCE);\n"
- << "\n";
-
- if (xyflag) ss << INDENT << "if (i < DIMY && j < DIMX)\n"
- << INDENT << "{\n";
-
- ss << INDENT2 << "// write A block\n";
-
- ss << INDENT2 << "goa = go + (i << 5) + j * (VSTRIDE*32) + (me >> 3)*VSTRIDE;\n"
- << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut0 << " + goa);\n";
-
- switch (iotype)
- {
- case PLANNAR_PLANNAR:
- ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsb + lot);\n\n";
- ss << INDENT2 << "z0 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z1 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z2 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z3 = lp4[0];\n\n";
-
- PLANNAR_WRITE(z0, z1, z2, z3, gp);
- break;
- case INTERLEAVED_PLANNAR:
- ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)((__local " << typeOut << "*)ldsb + lot);\n\n";
- ss << INDENT2 << "z00 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z10 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z20 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z30 = lp4[0];\n\n";
-
- PLANNAR_WRITE(z00, z10, z20, z30, gp);
-
- ss << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut1 << " + goa);\n";
- ss << INDENT2 << "lp4 += (256 - 32*6);\n";
- ss << INDENT2 << "z01 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z11 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z21 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z31 = lp4[0];\n\n";
-
- PLANNAR_WRITE(z01, z11, z21, z31, gp);
- break;
-
- case PLANNAR_INTERLEAVED:
- case INTERLEAVED_INTERLEAVED:
- if (iotype == PLANNAR_INTERLEAVED)
- ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsb + lot*2);\n";
- else
- ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsb + lot);\n";
-
- ss << INDENT2 << "z00 = lp4[0];\n"
- << INDENT2 << "z01 = lp4[1];\n"
- << INDENT2 << "lp4 += 32*4;\n"
- << "\n";
-
- ss << INDENT2 << "z10 = lp4[0];\n"
- << INDENT2 << "z11 = lp4[1];\n"
- << INDENT2 << "lp4 += 32*4;\n"
- << "\n";
-
- ss << INDENT2 << "z20 = lp4[0];\n"
- << INDENT2 << "z21 = lp4[1];\n"
- << INDENT2 << "lp4 += 32*4;\n"
- << "\n";
-
- ss << INDENT2 << "z30 = lp4[0];\n"
- << INDENT2 << "z31 = lp4[1];\n\n";
-
- INTERLEAVED_WRITE(z00, z01, z10, z11, z20, z21, z30, z31);
- break;
- }
- ss << "\n";
-
- ss << INDENT;
- if (xyflag) ss << "} ";
- ss << "// End write A block;\n\n";
-
- if (xyflag) ss << INDENT << "if (i < DIMX && j < DIMY)\n"
- << INDENT << "{\n";
-
- ss << INDENT2 << "// write B block\n\n";
- ss << INDENT2 << "gob = go + (j << 5) + i * (VSTRIDE*32) + (me >> 3)*VSTRIDE;\n"
- << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut0 << " + gob);\n";
-
- switch (iotype)
- {
- case PLANNAR_PLANNAR:
- ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsa + lot);\n\n";
- ss << INDENT2 << "z0 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z1 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z2 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z3 = lp4[0];\n\n";
-
- PLANNAR_WRITE(z0, z1, z2, z3, gp);
- break;
- case INTERLEAVED_PLANNAR:
- ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)((__local " << typeOut << "*)ldsa + lot);\n\n";
- ss << INDENT2 << "z00 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z10 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z20 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z30 = lp4[0];\n\n";
-
- PLANNAR_WRITE(z00, z10, z20, z30, gp);
-
- ss << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut1 << " + gob);\n";
- ss << INDENT2 << "lp4 += (256 - 32*6);\n";
- ss << INDENT2 << "z01 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z11 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z21 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z31 = lp4[0];\n\n";
-
- PLANNAR_WRITE(z01, z11, z21, z31, gp);
- break;
-
- case PLANNAR_INTERLEAVED:
- case INTERLEAVED_INTERLEAVED:
- if (iotype == PLANNAR_INTERLEAVED)
- ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsa + lot*2);\n\n";
- else
- ss << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsa + lot);\n\n";
-
- ss << INDENT2 << "z00 = lp4[0];\n"
- << INDENT2 << "z01 = lp4[1];\n"
- << INDENT2 << "lp4 += 32*4;\n"
- << "\n";
-
- ss << INDENT2 << "z10 = lp4[0];\n"
- << INDENT2 << "z11 = lp4[1];\n"
- << INDENT2 << "lp4 += 32*4;\n"
- << "\n";
-
- ss << INDENT2 << "z20 = lp4[0];\n"
- << INDENT2 << "z21 = lp4[1];\n"
- << INDENT2 << "lp4 += 32*4;\n"
- << "\n";
-
- ss << INDENT2 << "z30 = lp4[0];\n"
- << INDENT2 << "z31 = lp4[1];\n\n";
-
- INTERLEAVED_WRITE(z00, z01, z10, z11, z20, z21, z30, z31);
- break;
- }
- ss << "\n";
-
- ss << INDENT;
- if(xyflag) ss << "} ";
- ss << "// End write B block;\n\n";
-
- if (iotype == PLANNAR_PLANNAR)
- {
- ss << INDENT << "// Identical handling for imaginary data\n"
- << INDENT << "barrier(CLK_LOCAL_MEM_FENCE);\n"
- << "\n";
-
- if (xyflag) ss << INDENT << "if (i < DIMX && j < DIMY)\n"
- << INDENT << "{\n";
-
- ss << INDENT2 << "//load A block\n"
- << INDENT2 << "goa = go + (i << 5) + j * (HSTRIDE*32) + (me >> 3)*HSTRIDE;\n"
- << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szIn1 << " + goa);\n"
- << "\n";
-
- PLANNAR_READ(z0, z1, z2, z3, gp);
-
- ss << INDENT2 << "lp = ldsa + lo;\n"
- << "\n";
-
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, x);
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, y);
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, z);
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, w);
-
- ss << INDENT;
- if (xyflag) ss << "} ";
- ss << "//end load A block\n\n";
-
- if (xyflag) ss << INDENT << "if (i < DIMY && j < DIMX)\n"
- << INDENT << "{\n";
-
- ss << INDENT2 << "//load B block\n"
- << INDENT2 << "gob = go + (j << 5) + i * (HSTRIDE*32) + (me >> 3)*HSTRIDE;\n"
- << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szIn1 <<" + gob);\n"
- << "\n";
-
- PLANNAR_READ(z0, z1, z2, z3, gp);
-
- ss << INDENT2 << "lp = ldsb + lo;\n"
- << "\n";
-
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, x);
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, y);
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, z);
- WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, w);
-
- ss << INDENT;
- if (xyflag) ss << "} ";
- ss << "// end load B block\n\n";
-
- ss << INDENT << "barrier(CLK_LOCAL_MEM_FENCE);\n"
- << "\n";
-
- if (xyflag) ss << INDENT << "if (i < DIMY && j < DIMX)\n"
- << INDENT << "{\n";
-
- ss << INDENT2 << "//Write A block\n"
- << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsb + lot);\n"
- << "\n";
-
- ss << INDENT2 << "z0 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z1 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z2 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z3 = lp4[0];\n"
- << "\n";
-
- ss << INDENT2 << "goa = go + (i << 5) + j * (VSTRIDE*32) + (me >> 3)*VSTRIDE;\n"
- << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut1 << " + goa);\n"
- << "\n";
-
- PLANNAR_WRITE(z0, z1, z2, z3, gp);
-
- ss << INDENT;
- if (xyflag) ss << "} ";
- ss << "// end write A block\n\n";
-
- if (xyflag) ss << INDENT << "if (i < DIMX && j < DIMY)\n"
- << INDENT << "{\n";
-
- ss << INDENT2 << "//write B block\n"
- << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsa + lot);\n"
- << "\n";
-
- ss << INDENT2 << "z0 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z1 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z2 = lp4[0];\n"
- << INDENT2 << "lp4 += 32*2;\n"
- << "\n";
-
- ss << INDENT2 << "z3 = lp4[0];\n"
- << "\n";
-
- ss << INDENT2 << "gob = go + (j << 5) + i * (VSTRIDE*32) + (me >> 3)*VSTRIDE;\n"
- << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut1 << " + gob);\n";
- PLANNAR_WRITE(z0, z1, z2, z3, gp);
-
- ss << INDENT;
- if (xyflag) ss << "} ";
- ss << "// end write B block\n";
- }
-
- ss << "}\n\n";
- kernel = ss.str();
- return CLFFT_SUCCESS;
-}
-
-clfftStatus FFTGeneratedTransposeVLIWAction::initParams ()
-{
-
- // Query the devices in this context for their local memory sizes
- // How we generate a kernel depends on the *minimum* LDS size for all devices.
- //
- const FFTEnvelope * pEnvelope = NULL;
- OPENCL_V(this->plan->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
- BUG_CHECK (NULL != pEnvelope);
-
-
- this->signature.fft_precision = this->plan->precision;
- this->signature.fft_placeness = this->plan->placeness;
- this->signature.fft_inputLayout = this->plan->inputLayout;
-
- ARG_CHECK (this->plan->inStride.size() == this->plan->outStride.size())
-
- if (CLFFT_INPLACE == this->plan->placeness) {
- // If this is an in-place transform the
- // input and output layout, dimensions and strides
- // *MUST* be the same.
- //
- ARG_CHECK (this->plan->inputLayout == this->plan->outputLayout)
- this->signature.fft_outputLayout = this->plan->inputLayout;
- for (size_t u = this->plan->inStride.size(); u-- > 0; ) {
- ARG_CHECK (this->plan->inStride[u] == this->plan->outStride[u]);
- }
- } else {
- this->signature.fft_outputLayout = this->plan->outputLayout;
- }
-
- //we only support 2D transpose
- switch (this->plan->inStride.size()) {
- // 2-D array is a 3-D data structure
- // 2-D unit is a speical case of 2-D array.
- case 2:
- ARG_CHECK(this->plan->length .size() > 1);
- ARG_CHECK(this->plan->outStride.size() > 1);
- this->signature.fft_DataDim = 3;
- this->signature.fft_N[0] = this->plan->length[0];
- this->signature.fft_N[1] = this->plan->length[1];
- this->signature.fft_inStride[0] = this->plan->inStride[0];
- this->signature.fft_inStride[1] = this->plan->inStride[1];
- this->signature.fft_inStride[2] = this->plan->iDist;
- this->signature.fft_outStride[0] = this->plan->outStride[0];
- this->signature.fft_outStride[1] = this->plan->outStride[1];
- this->signature.fft_outStride[2] = this->plan->oDist;
- break;
- default:
- ARG_CHECK (false);
- }
-
- //ToDO: work group size setup
- this->signature.fft_R = 32; // divide the element into 32x32 blocks
- this->signature.fft_SIMD = 64; //work group size
-
- return CLFFT_SUCCESS;
-}
-
-clfftStatus FFTGeneratedTransposeVLIWAction::getWorkSizes (std::vector<size_t> & globalWS, std::vector<size_t> & localWS)
-{
-
- unsigned long long count, count0, count1;
- count0 = DivRoundingUp<unsigned long long> (this->plan->length[0], this->signature.fft_R);
- count1 = DivRoundingUp<unsigned long long> (this->plan->length[1], this->signature.fft_R);
- count = (count0>count1) ? count0 : count1;
- count = (count * (count+1)) /2;
- count *= this->signature.fft_SIMD;
- count *= this->plan->batchsize;
-
- globalWS.push_back( static_cast< size_t >( count ) );
- localWS.push_back( this->signature.fft_SIMD );
-
- return CLFFT_SUCCESS;
-}
-
-
-// OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
-// Feed this generator the FFTPlan, and it returns the generated program as a string
-clfftStatus FFTGeneratedTransposeVLIWAction::generateKernel ( FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
-{
-
- std::string programCode;
- OPENCL_V( GenerateTransposeKernel( this->signature, programCode ), _T( "GenerateTransposeKernel() failed!" ) );
-
- cl_int status = CL_SUCCESS;
- cl_device_id Device = NULL;
- status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
- OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
-
- cl_context QueueContext = NULL;
- status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
- OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
-
- OPENCL_V( fftRepo.setProgramCode( Transpose_VLIW, this->getSignatureData(), programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
- OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_VLIW, this->getSignatureData(), "fft_trans", "fft_trans", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
-
- return CLFFT_SUCCESS;
-}
diff --git a/src/library/generator.transpose.vliw.h b/src/library/generator.transpose.vliw.h
deleted file mode 100644
index 12ad701..0000000
--- a/src/library/generator.transpose.vliw.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-#pragma once
-#if !defined( AMD_CLFFT_generator_transpose_H )
-#define AMD_CLFFT_generator_transpose_H
-#include "private.h"
-#include "repo.h"
-#include "plan.h"
-
-#endif
-
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index a6736bc..a5c605e 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -240,7 +240,6 @@ std::string getKernelName(const clfftGenerators gen, const clfftPlanHandle plHan
case Stockham: generatorName = "Stockham"; break;
case Transpose_GCN: generatorName = "Transpose"; break;
case Transpose_SQUARE: generatorName = "Transpose"; break;
- case Transpose_VLIW: generatorName = "Transpose"; break;
case Copy: generatorName = "Copy"; break;
}
@@ -276,12 +275,6 @@ clfftStatus selectAction(FFTPlan * fftPlan, FFTAction *& action, cl_command_queu
}
break;
- case Transpose_VLIW:
- {
- action = new FFTGeneratedTransposeVLIWAction(fftPlan->plHandle, fftPlan, *commQueueFFT, err);
- OPENCL_V( err, "FFTGeneratedTransposeVLIWAction() failed");
- }
- break;
case Copy:
{
@@ -1879,8 +1872,9 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
else if(fftPlan->gen == Transpose_SQUARE)
fftPlan->action = new FFTGeneratedTransposeSquareAction(plHandle, fftPlan, *commQueueFFT, err);
else
- fftPlan->action = new FFTGeneratedTransposeVLIWAction(plHandle, fftPlan, *commQueueFFT, err);
- OPENCL_V( err, "FFTGeneratedTransposeVLIWAction failed");
+ fftPlan->action = new FFTGeneratedTransposeGCNAction(plHandle, fftPlan, *commQueueFFT, err);
+
+ OPENCL_V( err, "FFTGeneratedTransposeXXXAction failed");
fftPlan->baked = true;
return CLFFT_SUCCESS;
@@ -1890,28 +1884,6 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
size_t length1 = fftPlan->length[1];
- if (fftPlan->length[0]==256 && fftPlan->length[1]==256)
- {
- length0 += 8;
- length1 += 1;
- }
- else if (fftPlan->length[0]==512 && fftPlan->length[1]==512)
- {
- length0 += 1;
- length1 += 1;//length1 += 0;
- }
- else if (fftPlan->length[0]==1024 && fftPlan->length[1]==512)
- {
- length0 += 2;
- length1 += 2;//length1 += 0;
- }
- else if (fftPlan->length[0]==1024 && fftPlan->length[1]==1024)
- {
- length0 += 1;
- length1 += 1;//length1 += 0;
- }
-
-
if (fftPlan->length[0] > Large1DThreshold ||
fftPlan->length[1] > Large1DThreshold)
fftPlan->large2D = true;
@@ -2003,7 +1975,8 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
clLengths[0] = fftPlan->length[0];
clLengths[1] = fftPlan->length[1];
- bool xyflag = (clLengths[0]==clLengths[1]) ? false : true;
+ // bool xyflag = (clLengths[0]==clLengths[1]) ? false : true;
+ bool xyflag = true;
if (xyflag && fftPlan->tmpBufSize==0 && fftPlan->length.size()<=2)
{
// we need tmp buffer for x!=y case
@@ -2022,7 +1995,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
transPlanX->inputLayout = fftPlan->outputLayout;
transPlanX->precision = fftPlan->precision;
transPlanX->tmpBufSize = 0;
- transPlanX->gen = Transpose_VLIW;
+ transPlanX->gen = Transpose_GCN;
transPlanX->envelope = fftPlan->envelope;
transPlanX->batchsize = fftPlan->batchsize;
transPlanX->inStride[0] = fftPlan->outStride[0];
@@ -2035,7 +2008,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
transPlanX->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
transPlanX->placeness = CLFFT_OUTOFPLACE;
transPlanX->outStride[0] = 1;
- transPlanX->outStride[1] = clLengths[0];
+ transPlanX->outStride[1] = clLengths[1];
transPlanX->oDist = clLengths[0] * clLengths[1];
}
else
@@ -2149,7 +2122,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
transPlanY->oDist = fftPlan->oDist;
transPlanY->precision = fftPlan->precision;
transPlanY->tmpBufSize = 0;
- transPlanY->gen = Transpose_VLIW;
+ transPlanY->gen = Transpose_GCN;
transPlanY->envelope = fftPlan->envelope;
transPlanY->batchsize = fftPlan->batchsize;
transPlanY->transflag = true;
@@ -4158,8 +4131,6 @@ clfftStatus FFTPlan::GetMax1DLength (size_t *longest ) const
switch(gen)
{
case Stockham: return GetMax1DLengthStockham(longest);
- //No restriction for Transpose_VLIW kernel
- case Transpose_VLIW: *longest = 4096; return CLFFT_SUCCESS;
case Transpose_GCN: *longest = 4096; return CLFFT_SUCCESS;
case Transpose_SQUARE: *longest = 4096; return CLFFT_SUCCESS;
case Copy: *longest = 4096; return CLFFT_SUCCESS;
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index d061dad..0efad17 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -671,7 +671,8 @@ clfftStatus clfftEnqueueTransform(
cl_event transXOutEvents = NULL;
cl_event colOutEvents = NULL;
- bool xyflag = (fftPlan->length[0] == fftPlan->length[1]) ? false : true;
+ //bool xyflag = (fftPlan->length[0] == fftPlan->length[1]) ? false : true;
+ bool xyflag = true;
if (xyflag)
{
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list