[clfft] 102/128: fixing many failures caused by deeper recursions, added more visibility of plan info in stattimer
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:44 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 36cd6bad8c778f968301abaaf7bd73a97344021c
Author: bragadeesh <bragadeesh.natarajan at amd.com>
Date: Fri Oct 9 16:16:55 2015 -0700
fixing many failures caused by deeper recursions, added more visibility of plan info in stattimer
---
src/library/action.cpp | 2 +-
src/library/generator.stockham.cpp | 9 +--
src/library/plan.cpp | 99 +++++++++++++++++--------------
src/library/transform.cpp | 56 ++++++++++-------
src/statTimer/statisticalTimer.GPU.cpp | 21 ++++++-
src/statTimer/statisticalTimer.GPU.h | 8 ++-
src/tests/accuracy_test_mixed_radices.cpp | 2 +-
7 files changed, 114 insertions(+), 83 deletions(-)
diff --git a/src/library/action.cpp b/src/library/action.cpp
index f073f5f..60508ff 100644
--- a/src/library/action.cpp
+++ b/src/library/action.cpp
@@ -633,7 +633,7 @@ clfftStatus FFTAction::enqueue(clfftPlanHandle plHandle,
if( fftRepo.pStatTimer )
{
- fftRepo.pStatTimer->AddSample( plHandle, this->plan, kern, numQueuesAndEvents, outEvents, gWorkSize );
+ fftRepo.pStatTimer->AddSample( plHandle, this->plan, kern, numQueuesAndEvents, outEvents, gWorkSize, lWorkSize );
}
return CLFFT_SUCCESS;
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 40f54eb..660189e 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -2738,14 +2738,7 @@ namespace StockhamGenerator
rcSimple = params.fft_RCsimple;
- // Set half lds only for power-of-2 problem sizes & interleaved data
- halfLds = ( (params.fft_inputLayout == CLFFT_COMPLEX_INTERLEAVED) &&
- (params.fft_outputLayout == CLFFT_COMPLEX_INTERLEAVED) ) ? true : false;
- halfLds = halfLds ? ((length & (length-1)) ? false : true) : false;
-
- // Set half lds for real transforms
- halfLds = r2c2r ? true : halfLds;
-
+ halfLds = true;
linearRegs = true;
realSpecial = params.fft_realSpecial;
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index cd4ef28..6139137 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -1285,54 +1285,58 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
}
}
- // copy plan to from hermitian to full complex
- OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0] ),
- _T( "CreateDefaultPlan RC copy failed" ) );
+ if ((fftPlan->inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
+ (fftPlan->inputLayout == CLFFT_HERMITIAN_PLANAR))
+ {
+ // copy plan to from hermitian to full complex
+ OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0]),
+ _T("CreateDefaultPlan RC copy failed"));
- FFTPlan* copyPlan = NULL;
- lockRAII* copyLock = NULL;
- OPENCL_V( fftRepo.getPlan( fftPlan->planRCcopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
+ FFTPlan* copyPlan = NULL;
+ lockRAII* copyLock = NULL;
+ OPENCL_V(fftRepo.getPlan(fftPlan->planRCcopy, copyPlan, copyLock), _T("fftRepo.getPlan failed"));
- // This is second column fft, intermediate buffer is packed and interleaved
- // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
+ // This is second column fft, intermediate buffer is packed and interleaved
+ // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
- // common part for both passes
- copyPlan->placeness = CLFFT_OUTOFPLACE;
- copyPlan->inputLayout = fftPlan->inputLayout;
- copyPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
+ // common part for both passes
+ copyPlan->placeness = CLFFT_OUTOFPLACE;
+ copyPlan->inputLayout = fftPlan->inputLayout;
+ copyPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
- copyPlan->precision = fftPlan->precision;
- copyPlan->forwardScale = 1.0f;
- copyPlan->backwardScale = 1.0f;
- copyPlan->tmpBufSize = 0;
- copyPlan->batchsize = fftPlan->batchsize;
+ copyPlan->precision = fftPlan->precision;
+ copyPlan->forwardScale = 1.0f;
+ copyPlan->backwardScale = 1.0f;
+ copyPlan->tmpBufSize = 0;
+ copyPlan->batchsize = fftPlan->batchsize;
- copyPlan->gen = Copy;
- copyPlan->envelope = fftPlan->envelope;
+ copyPlan->gen = Copy;
+ copyPlan->envelope = fftPlan->envelope;
- copyPlan->inStride[0] = fftPlan->inStride[0];
- copyPlan->iDist = fftPlan->iDist;
+ copyPlan->inStride[0] = fftPlan->inStride[0];
+ copyPlan->iDist = fftPlan->iDist;
- copyPlan->outStride[0] = 1;
- copyPlan->oDist = fftPlan->length[0];
+ copyPlan->outStride[0] = 1;
+ copyPlan->oDist = fftPlan->length[0];
- for (size_t index=1; index < fftPlan->length.size(); index++)
- {
- copyPlan->length.push_back(fftPlan->length[index]);
- copyPlan->outStride.push_back(copyPlan->outStride[index-1] * fftPlan->length[index-1]);
- copyPlan->oDist *= fftPlan->length[index];
- copyPlan->inStride.push_back(fftPlan->inStride[index]);
- }
+ for (size_t index = 1; index < fftPlan->length.size(); index++)
+ {
+ copyPlan->length.push_back(fftPlan->length[index]);
+ copyPlan->outStride.push_back(copyPlan->outStride[index - 1] * fftPlan->length[index - 1]);
+ copyPlan->oDist *= fftPlan->length[index];
+ copyPlan->inStride.push_back(fftPlan->inStride[index]);
+ }
- //Set callback data if set on top level plan
- if (fftPlan->hasPreCallback)
- {
- copyPlan->hasPreCallback = true;
- copyPlan->preCallback = fftPlan->preCallback;
- copyPlan->precallUserData = fftPlan->precallUserData;
- }
+ //Set callback data if set on top level plan
+ if (fftPlan->hasPreCallback)
+ {
+ copyPlan->hasPreCallback = true;
+ copyPlan->preCallback = fftPlan->preCallback;
+ copyPlan->precallUserData = fftPlan->precallUserData;
+ }
- OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d RC copy plan failed" ) );
+ OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL), _T("BakePlan large1d RC copy plan failed"));
+ }
// column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
// transposed output
@@ -1349,7 +1353,6 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
// we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
//this part are common for both passes
- colTPlan->placeness = CLFFT_INPLACE;
colTPlan->precision = fftPlan->precision;
colTPlan->forwardScale = 1.0f;
colTPlan->backwardScale = 1.0f;
@@ -1357,18 +1360,16 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
colTPlan->batchsize = fftPlan->batchsize;
colTPlan->gen = fftPlan->gen;
- colTPlan->envelope = fftPlan->envelope;
+ colTPlan->envelope = fftPlan->envelope;
//Pass large1D flag to confirm we need multiply twiddle factor
colTPlan->large1D = fftPlan->length[0];
colTPlan->length.push_back(clLengths[0]);
- // first Pass
colTPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
-
-
+
colTPlan->inStride[0] = length0;
colTPlan->inStride.push_back(1);
colTPlan->iDist = length0 * length1;
@@ -1386,6 +1387,15 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
colTPlan->oDist *= fftPlan->length[index];
}
+ if ((fftPlan->inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
+ (fftPlan->inputLayout == CLFFT_HERMITIAN_PLANAR))
+ {
+ colTPlan->placeness = CLFFT_INPLACE;
+ }
+ else
+ {
+ colTPlan->placeness = CLFFT_OUTOFPLACE;
+ }
OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
@@ -1653,7 +1663,6 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
}
// column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
- // transposed output
OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
_T( "CreateDefaultPlan Large1d column failed" ) );
@@ -1721,7 +1730,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
- //another column FFT, size clLengths[0], batch clLengths[1], output without transpose
+ //another column FFT, size clLengths[0], batch clLengths[1], output with transpose
OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
_T( "CreateDefaultPlan large1D row failed" ) );
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index 354c1ed..d061dad 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -248,17 +248,29 @@ clfftStatus clfftEnqueueTransform(
cl_event colOutEvents = NULL;
cl_event copyOutEvents = NULL;
- // copy from hermitian to full complex
- OPENCL_V( clfftEnqueueTransform( fftPlan->planRCcopy, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, numWaitEvents,
- waitEvents, &copyOutEvents, clInputBuffers, &(fftPlan->intBufferRC), localIntBuffer ),
- _T("clfftEnqueueTransform large1D RC copy failed"));
-
- // First pass
- // column with twiddle first, INPLACE,
- OPENCL_V( clfftEnqueueTransform( fftPlan->planX, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, 1,
- &copyOutEvents, &colOutEvents, &(fftPlan->intBufferRC), &(fftPlan->intBufferRC), localIntBuffer),
- _T("clfftEnqueueTransform large1D col pass failed"));
- clReleaseEvent(copyOutEvents);
+ if (fftPlan->planRCcopy)
+ {
+ // copy from hermitian to full complex
+ OPENCL_V(clfftEnqueueTransform(fftPlan->planRCcopy, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, numWaitEvents,
+ waitEvents, &copyOutEvents, clInputBuffers, &(fftPlan->intBufferRC), localIntBuffer),
+ _T("clfftEnqueueTransform large1D RC copy failed"));
+
+ // First pass
+ // column with twiddle first, INPLACE,
+ OPENCL_V(clfftEnqueueTransform(fftPlan->planX, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, 1,
+ &copyOutEvents, &colOutEvents, &(fftPlan->intBufferRC), &(fftPlan->intBufferRC), localIntBuffer),
+ _T("clfftEnqueueTransform large1D col pass failed"));
+ clReleaseEvent(copyOutEvents);
+ }
+ else
+ {
+ // First pass
+ // column with twiddle first, INPLACE,
+ OPENCL_V(clfftEnqueueTransform(fftPlan->planX, CLFFT_BACKWARD, numQueuesAndEvents, commQueues, numWaitEvents,
+ waitEvents, &colOutEvents, clInputBuffers, &(fftPlan->intBufferRC), localIntBuffer),
+ _T("clfftEnqueueTransform large1D col pass failed"));
+ clReleaseEvent(copyOutEvents);
+ }
cl_mem *out_local;
out_local = (fftPlan->placeness==CLFFT_INPLACE) ? clInputBuffers : clOutputBuffers;
@@ -499,9 +511,9 @@ clfftStatus clfftEnqueueTransform(
{
cl_event colOutEvents = NULL;
// First pass
- // column with twiddle first, OUTOFPLACE, + transpose
+ // column with twiddle first, OUTOFPLACE
OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
- waitEvents, &colOutEvents, clInputBuffers, &localIntBuffer, localIntBuffer),
+ waitEvents, &colOutEvents, clInputBuffers, &localIntBuffer, NULL),
_T("clfftEnqueueTransform large1D col pass failed"));
#if defined(DEBUGGING)
@@ -536,11 +548,11 @@ clfftStatus clfftEnqueueTransform(
}
else
{
- //another column FFT output, OUTOFPLACE
+ //another column FFT output, OUTOFPLACE + transpose
if (fftPlan->placeness == CLFFT_INPLACE)
{
OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
- outEvents, &localIntBuffer, clInputBuffers, localIntBuffer ),
+ outEvents, &localIntBuffer, clInputBuffers, NULL ),
_T("clfftEnqueueTransform large1D second column failed"));
#if defined(DEBUGGING)
@@ -560,7 +572,7 @@ clfftStatus clfftEnqueueTransform(
_T("Reading the result buffer failed") );
#endif
OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &colOutEvents,
- outEvents, &localIntBuffer, clOutputBuffers, localIntBuffer ),
+ outEvents, &localIntBuffer, clOutputBuffers, NULL ),
_T("clfftEnqueueTransform large1D second column failed"));
#if defined(DEBUGGING)
@@ -603,7 +615,7 @@ clfftStatus clfftEnqueueTransform(
if( fftRepo.pStatTimer )
{
- fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ) );
+ fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ), std::vector< size_t >() );
}
return CLFFT_SUCCESS;
@@ -987,7 +999,7 @@ clfftStatus clfftEnqueueTransform(
{
//deal with row first
OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
- waitEvents, &rowOutEvents, clInputBuffers, &localIntBuffer, localIntBuffer ),
+ waitEvents, &rowOutEvents, clInputBuffers, &localIntBuffer, NULL ),
_T("clfftEnqueueTransform for row failed"));
@@ -995,14 +1007,14 @@ clfftStatus clfftEnqueueTransform(
{
//deal with column
OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
- outEvents, &localIntBuffer, clInputBuffers, localIntBuffer ),
+ outEvents, &localIntBuffer, clInputBuffers, NULL ),
_T("clfftEnqueueTransform for column failed"));
}
else
{
//deal with column
OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1, &rowOutEvents,
- outEvents, &localIntBuffer, clOutputBuffers, localIntBuffer ),
+ outEvents, &localIntBuffer, clOutputBuffers, NULL ),
_T("clfftEnqueueTransform for column failed"));
#if defined(DEBUGGING)
@@ -1021,7 +1033,7 @@ clfftStatus clfftEnqueueTransform(
if( fftRepo.pStatTimer )
{
- fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ) );
+ fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ), std::vector< size_t >() );
}
return CLFFT_SUCCESS;
@@ -1269,7 +1281,7 @@ clfftStatus clfftEnqueueTransform(
if( fftRepo.pStatTimer )
{
- fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ) );
+ fftRepo.pStatTimer->AddSample( plHandle, fftPlan, NULL, 0, NULL, std::vector< size_t >( ), std::vector< size_t >() );
}
return CLFFT_SUCCESS;
diff --git a/src/statTimer/statisticalTimer.GPU.cpp b/src/statTimer/statisticalTimer.GPU.cpp
index c4c94e1..133825d 100644
--- a/src/statTimer/statisticalTimer.GPU.cpp
+++ b/src/statTimer/statisticalTimer.GPU.cpp
@@ -202,7 +202,7 @@ GpuStatTimer::Stop( size_t id )
void
GpuStatTimer::AddSample( clfftPlanHandle plHandle, FFTPlan* plan, cl_kernel kern, cl_uint numEvents, cl_event* ev,
- const std::vector< size_t >& gWorkSize )
+ const std::vector< size_t >& gWorkSize, const std::vector< size_t >& lWorkSize )
{
if( (numEvents != 0) && (ev == NULL) )
return;
@@ -219,12 +219,12 @@ GpuStatTimer::AddSample( clfftPlanHandle plHandle, FFTPlan* plan, cl_kernel kern
{
timerData.at( currID ).push_back( StatDataVec( ) );
timerData.at( currID ).back( ).reserve( nSamples );
- timerData.at( currID ).back( ).push_back( StatData( plHandle, plan, kern, numEvents, ev, gWorkSize ) );
+ timerData.at( currID ).back( ).push_back( StatData( plHandle, plan, kern, numEvents, ev, gWorkSize, lWorkSize) );
}
else
{
timerData.at( currID ).at( currSample )
- .push_back( StatData( plHandle, plan, kern, numEvents, ev, gWorkSize ) );
+ .push_back( StatData( plHandle, plan, kern, numEvents, ev, gWorkSize, lWorkSize ) );
++currSample;
}
}
@@ -582,6 +582,9 @@ GpuStatTimer::Print( )
<< std::setw( tableThird ) << mean[ t ].batchSize << std::endl;
}
+ tout << std::setw(tableFourth) << _T("Placeness:") << std::setw(tableThird)
+ << ( mean[t].placeness == CLFFT_INPLACE ? "InPlace": "OutOfPlace" ) << std::endl;
+
tout << std::setw(tableFourth) << _T("Input Dist:") << std::setw(tableThird) << mean[t].iDist << std::endl;
tout << std::setw(tableFourth) << _T("Output Dist:") << std::setw(tableThird) << mean[t].oDist << std::endl;
@@ -624,6 +627,18 @@ GpuStatTimer::Print( )
}
catLengths << _T( ")" );
tout << std::setw( tableThird ) << catLengths.str( ) << std::endl;
+
+ tout << std::setw(tableFourth) << _T("Local Work:");
+ catLengths.str(_T(""));
+ catLengths << _T("(");
+ for (size_t i = 0; i < mean[t].enqueueLocalWorkSize.size(); ++i)
+ {
+ catLengths << mean[t].enqueueLocalWorkSize.at(i);
+ if (i < (mean[t].enqueueLocalWorkSize.size() - 1))
+ catLengths << _T(",");
+ }
+ catLengths << _T(")");
+ tout << std::setw(tableThird) << catLengths.str() << std::endl;
}
tout << std::setw( tableFourth ) << _T( "Gflops:" )
diff --git a/src/statTimer/statisticalTimer.GPU.h b/src/statTimer/statisticalTimer.GPU.h
index 9dcb161..2a9781e 100644
--- a/src/statTimer/statisticalTimer.GPU.h
+++ b/src/statTimer/statisticalTimer.GPU.h
@@ -56,6 +56,8 @@ struct StatData
std::vector< size_t > outStride;
size_t iDist;
size_t oDist;
+ clfftResultLocation placeness;
+ std::vector< size_t > enqueueLocalWorkSize;
std::vector< size_t > enqueueWorkSize;
std::vector< cl_event > outEvents;
@@ -63,13 +65,13 @@ struct StatData
{}
StatData( clfftPlanHandle id, FFTPlan* plan, cl_kernel kern, cl_uint nEv, cl_event* Ev,
- const std::vector< size_t >& gWorkSize ):
+ const std::vector< size_t >& gWorkSize, const std::vector< size_t >& lWorkSize):
deltaNanoSec( 0 ), kernel( kern ), batchSize( plan->batchsize ), dim( plan->dim ),
plHandle( id ), planX( plan->planX ), planY( plan->planY ), planZ( plan->planZ ),
planTX( plan->planTX ), planTY( plan->planTY ), planTZ( plan->planTZ ),
planRCcopy( plan->planRCcopy ), planCopy( plan->planCopy ),
inStride( plan->inStride ), outStride( plan->outStride ), iDist( plan->iDist ), oDist( plan->oDist ),
- lengths( plan->length ), enqueueWorkSize( gWorkSize )
+ lengths( plan->length ), enqueueWorkSize( gWorkSize ), enqueueLocalWorkSize( lWorkSize ), placeness( plan->placeness )
{
for( cl_uint e = 0; e < nEv; ++e )
{
@@ -217,7 +219,7 @@ public:
* \brief Explicitely add a timing sample into the class
*/
virtual void AddSample( clfftPlanHandle plHandle, FFTPlan* plan, cl_kernel kern, cl_uint numQueuesAndEvents, cl_event* ev,
- const std::vector< size_t >& gWorkSize );
+ const std::vector< size_t >& gWorkSize, const std::vector< size_t >& lWorkSize );
/**
* \fn void Reset(void)
diff --git a/src/tests/accuracy_test_mixed_radices.cpp b/src/tests/accuracy_test_mixed_radices.cpp
index d0e36b4..81ec183 100644
--- a/src/tests/accuracy_test_mixed_radices.cpp
+++ b/src/tests/accuracy_test_mixed_radices.cpp
@@ -42,7 +42,7 @@ void mixed_radix_complex_to_complex( size_t problem_size )
std::vector<size_t> lengths;
lengths.push_back( problem_size );
- size_t batch = 1;
+ size_t batch = 500;
std::vector<size_t> input_strides;
std::vector<size_t> output_strides;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list