[clfft] 47/128: added support for planar complex to generator inplace transpose
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:37 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 7f5a6b994a451fdcf1e9855a8427c557b1df852d
Author: Amir Gholami <i.amirgh at gmail.com>
Date: Wed Aug 26 18:06:02 2015 -0500
added support for planar complex to generator inplace transpose
---
src/library/generator.transpose.inplace.cpp | 155 +++++++++++++++++++++++++---
1 file changed, 143 insertions(+), 12 deletions(-)
diff --git a/src/library/generator.transpose.inplace.cpp b/src/library/generator.transpose.inplace.cpp
index 027bfd1..a76f7b7 100644
--- a/src/library/generator.transpose.inplace.cpp
+++ b/src/library/generator.transpose.inplace.cpp
@@ -203,12 +203,11 @@ static clfftStatus genTransposePrototype( const FFTGeneratedTransposeInplaceActi
dtInput = dtComplex;
dtOutput = dtComplex;
clKernWrite( transKernel, 0 ) << "global " << dtInput << "* restrict inputA";
- clKernWrite(transKernel, 0) << ", global " << dtOutput << "* restrict outputA";
break;
case CLFFT_COMPLEX_PLANAR:
dtInput = dtPlanar;
- return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
- // clKernWrite( transKernel, 0 ) << "global " << dtInput << "* restrict " << pmRealIn << ", global " << dtInput << "* restrict " << pmImagIn;
+ dtOutput = dtPlanar;
+ clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA_R" << ", global " << dtInput << "* restrict inputA_I";
break;
case CLFFT_HERMITIAN_INTERLEAVED:
case CLFFT_HERMITIAN_PLANAR:
@@ -218,12 +217,39 @@ static clfftStatus genTransposePrototype( const FFTGeneratedTransposeInplaceActi
dtOutput = dtPlanar;
clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA";
- clKernWrite(transKernel, 0) << ", global " << dtOutput << "* restrict outputA";
break;
default:
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
+
+ switch (params.fft_outputLayout)
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ dtInput = dtComplex;
+ dtOutput = dtComplex;
+ clKernWrite(transKernel, 0) << ", global " << dtOutput << "* restrict outputA";
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+ dtInput = dtPlanar;
+ dtOutput = dtPlanar;
+ clKernWrite(transKernel, 0) << ", global " << dtOutput << "* restrict outputA_R" << ", global " << dtOutput << "* restrict outputA_I";
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ case CLFFT_REAL:
+ dtInput = dtPlanar;
+ dtOutput = dtPlanar;
+ clKernWrite(transKernel, 0) << ", global " << dtOutput << "* restrict outputA";
+ break;
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+
+
+
+
// Close the method signature
clKernWrite( transKernel, 0 ) << " )\n{" << std::endl;
return CLFFT_SUCCESS;
@@ -244,8 +270,8 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeInplaceAction:
// NOTE: Enable only for debug
// clKernWrite( transKernel, 0 ) << "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" << std::endl;
- if (params.fft_inputLayout != params.fft_outputLayout)
- return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ //if (params.fft_inputLayout != params.fft_outputLayout)
+ // return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
switch( params.fft_precision )
{
@@ -308,17 +334,69 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeInplaceAction:
else
funcName = "transpose_Inplace";
+
+ // Generate kernel API
genTransposePrototype(params, lwSize, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput);
+
+
+
+
+
if (mult_of_16)
clKernWrite(transKernel, 3) << "const int grid_dim = " << (params.fft_N[0] / 16 / reShapeFactor)*(params.fft_N[0] / 16 / reShapeFactor + 1) / 2 << ";" << std::endl;
else
clKernWrite(transKernel, 3) << "const int grid_dim = " << (params.fft_N[0] / (16 * reShapeFactor) + 1)*(params.fft_N[0] / (16 * reShapeFactor) + 1 + 1) / 2 << ";" << std::endl;
clKernWrite(transKernel, 3) << "const int z = get_group_id(0) / grid_dim; " << std::endl;
- clKernWrite(transKernel, 3) << "inputA = &inputA[z*" << params.fft_N[0] * params.fft_N[0] << "];" << std::endl; // Set A ptr to the start of each slice " << std::endl;
- clKernWrite(transKernel, 3) << "outputA = &outputA[z*" << params.fft_N[0] * params.fft_N[0] << "];" << std::endl; // Set A ptr to the start of each slice " << std::endl;
+
+
+
+
+ // Handle planar and interleaved right here
+ switch (params.fft_inputLayout)
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ clKernWrite(transKernel, 3) << "inputA = &inputA[z*" << params.fft_N[0] * params.fft_N[0] << "];" << std::endl; // Set A ptr to the start of each slice " << std::endl;
+
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+
+ clKernWrite(transKernel, 3) << "inputA_R = &inputA_R[z*" << params.fft_N[0] * params.fft_N[0] << "];" << std::endl; // Set A ptr to the start of each slice " << std::endl;
+ clKernWrite(transKernel, 3) << "inputA_I = &inputA_I[z*" << params.fft_N[0] * params.fft_N[0] << "];" << std::endl; // Set A ptr to the start of each slice " << std::endl;
+
+
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ case CLFFT_REAL:
+ break;
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+ switch (params.fft_outputLayout)
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ clKernWrite(transKernel, 3) << "outputA = &outputA[z*" << params.fft_N[0] * params.fft_N[0] << "];" << std::endl; // Set A ptr to the start of each slice " << std::endl;
+
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+
+ clKernWrite(transKernel, 3) << "outputA_R = &outputA_R[z*" << params.fft_N[0] * params.fft_N[0] << "];" << std::endl; // Set A ptr to the start of each slice " << std::endl;
+ clKernWrite(transKernel, 3) << "outputA_I = &outputA_I[z*" << params.fft_N[0] * params.fft_N[0] << "];" << std::endl; // Set A ptr to the start of each slice " << std::endl;
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ case CLFFT_REAL:
+ break;
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+
+
clKernWrite(transKernel, 3) << "" << std::endl;
clKernWrite(transKernel, 3) << "const int g_index = get_group_id(0) - z*grid_dim; " << std::endl;
@@ -380,8 +458,34 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeInplaceAction:
clKernWrite(transKernel, 3) << "int index;" << std::endl;
clKernWrite(transKernel, 3) << "for (int loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
clKernWrite(transKernel, 6) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
- clKernWrite(transKernel, 6) << "xy_s[index] = inputA[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
- clKernWrite(transKernel, 6) << "yx_s[index] = inputA[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+
+ // Handle planar and interleaved right here
+ switch (params.fft_inputLayout)
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ clKernWrite(transKernel, 6) << "xy_s[index] = inputA[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
+ clKernWrite(transKernel, 6) << "yx_s[index] = inputA[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+ dtInput = dtPlanar;
+ dtOutput = dtPlanar;
+ clKernWrite(transKernel, 6) << "xy_s[index].x = inputA_R[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
+ clKernWrite(transKernel, 6) << "xy_s[index].y = inputA_I[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
+
+ clKernWrite(transKernel, 6) << "yx_s[index].x = inputA_R[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+ clKernWrite(transKernel, 6) << "yx_s[index].y = inputA_I[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
+
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ case CLFFT_REAL:
+ break;
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+
+
clKernWrite(transKernel, 3) << "}" << std::endl;
clKernWrite(transKernel, 3) << "" << std::endl;
@@ -394,8 +498,35 @@ static clfftStatus genTransposeKernel( const FFTGeneratedTransposeInplaceAction:
// Step2: Write from shared to global
clKernWrite(transKernel, 3) << "for (int loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
clKernWrite(transKernel, 6) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop;" << std::endl;
- clKernWrite(transKernel, 6) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
- clKernWrite(transKernel, 6) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index];" << std::endl;
+
+
+ // Handle planar and interleaved right here
+ switch (params.fft_outputLayout)
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ clKernWrite(transKernel, 6) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
+ clKernWrite(transKernel, 6) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index];" << std::endl;
+
+ break;
+ case CLFFT_COMPLEX_PLANAR:
+
+ clKernWrite(transKernel, 6) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
+ clKernWrite(transKernel, 6) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
+
+ clKernWrite(transKernel, 6) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].x;" << std::endl;
+ clKernWrite(transKernel, 6) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].y;" << std::endl;
+ break;
+ case CLFFT_HERMITIAN_INTERLEAVED:
+ case CLFFT_HERMITIAN_PLANAR:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ case CLFFT_REAL:
+ break;
+ default:
+ return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ }
+
+
+
clKernWrite(transKernel, 3) << "}" << std::endl;
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits mailing list