[vspline] 41/72: some changes to the remapping code
Kay F. Jahnke
kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:41 UTC 2017
This is an automated email from the git hooks/post-receive script.
kfj-guest pushed a commit to branch master
in repository vspline.
commit 7f335e98beb15c92d7c22e5ff26dcb0ee6c7a809
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date: Mon Apr 3 11:05:47 2017 +0200
some changes to the remapping code
I had a bug (which did not manifest) in the handling of the recursive descent
of warp_generator; it is fixed now, together with a bit of tweaking for speed.
I now always recurse all the way to 1D, which resolves some type issues.
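
For illustration, here is a minimal, self-contained sketch of the recursive descent
the message describes: slice along the outermost dimension until the data is 1D, and
only do the actual work in the 1D specialization. This is not vspline's code; fill_nd,
the nested std::vector target and the plain callable generator are stand-ins for
vspline's _fill, vigra::MultiArrayView and generator objects.

#include <vector>
#include <iostream>

// recurse over the outermost dimension until we reach 1D
template < int dim >
struct fill_nd
{
  template < typename array_type , typename generator_type >
  void operator() ( array_type & output , generator_type & gen )
  {
    // slice along the highest axis and delegate each slice one level down
    for ( auto & slice : output )
      fill_nd < dim - 1 >() ( slice , gen ) ;
  }
} ;

// specialization for 1D: this is where values are actually produced
template <>
struct fill_nd < 1 >
{
  template < typename array_type , typename generator_type >
  void operator() ( array_type & output , generator_type & gen )
  {
    for ( auto & value : output )
      value = gen() ;
  }
} ;

int main()
{
  // a 2D 'target': 3 rows of 4 values, filled by descending to 1D rows
  std::vector < std::vector < float > > target
    ( 3 , std::vector < float > ( 4 ) ) ;
  float x = 0.0f ;
  auto gen = [ &x ] () { return x += 1.0f ; } ;
  fill_nd < 2 >() ( target , gen ) ;
  for ( auto & row : target )
  {
    for ( auto v : row )
      std::cout << v << " " ;
    std::cout << std::endl ;
  }
}
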
---
remap.h | 382 +++++++++++++++++++++++++++++++++++++++-----------------
unary_functor.h | 6 +-
2 files changed, 268 insertions(+), 120 deletions(-)
diff --git a/remap.h b/remap.h
index 3045eb6..8b6e7de 100644
--- a/remap.h
+++ b/remap.h
@@ -322,15 +322,13 @@ public:
/// of the 'whole' output
/// _fill is used by st_fill. It's an object implementing an hierarchical fill
-/// of the target array if this makes sense: If the whole target array is unstrided,
-/// the best-case loop is executed straight away. Otherwise, we check if the target
-/// array is unstrided in dimension 0, in which case we iterate over subdimensional
-/// slices. Only if even dimension 0 is unstrided, we don't recurse and use the less
-/// efficient buffering loop, but without the recursion overhead. All in all this
-/// seems like a good compromise.
+/// of the target array. This is done recursively: while the generator's and the
+/// target's dimension is greater than 1, they are sliced and the slices are processed
+/// with the routine one level down.
template < typename generator_type , // functor object yielding values
int dim_out > // number of dimensions of output array
+// int floor_dim = 1 > // intended lowest dimensionality
struct _fill
{
void operator() ( generator_type & gen ,
@@ -339,77 +337,87 @@ struct _fill
bool use_vc = true
)
{
- typedef typename generator_type::value_type value_type ;
-
- auto target_it = output.begin() ;
- int leftover = output.elementCount() ;
-
- #ifdef USE_VC
+#ifdef USE_VC
- const int vsize = generator_type::vsize ;
+// const int vsize = generator_type::vsize ;
if ( use_vc )
{
- int aggregates = leftover / vsize ; // number of full vectors
- leftover -= aggregates * vsize ; // remaining leftover single values
- value_type * destination = output.data() ; // pointer to target's memory
-
- if ( output.isUnstrided() )
- {
-// std::cout << "case fill:1" << std::endl ;
- // best case: output array has consecutive memory. if this test comes
- // out true, we know that dimension 0 is also unstrided, so for warp_generator,
- // we have strided_warp false and the efficient iteration is used
- for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
- {
- gen ( destination ) ; // pass pointer to destination to generator
- }
- target_it += aggregates * vsize ; // advance target_it to remaining single values
- }
-
- else if ( output.isUnstrided(0) )
- {
-// std::cout << "case fill:2" << std::endl ;
- // at least dimension 0 is unstrided, so recursion makes sense. for a warp_generator,
- // the recursion is even necessary, because with dimension 0 unstrided we have picked
- // the variant with strided_warp false, so we have to recurse until this is actually
- // the case.
+// if ( dim_out > floor_dim )
+// {
+ // if we're not yet at the intended lowest level of recursion,
+ // we slice output and generator and feed the slices to the
+ // next lower recursion level
for ( int c = 0 ; c < output.shape ( dim_out - 1 ) ; c++ )
{
// recursively call _fill for each slice along the highest axis
auto sub_output = output.bindOuter ( c ) ;
-// std::cout << "recursing" << std::endl ;
auto sub_gen = gen.bindOuter ( c ) ;
- _fill < decltype ( sub_gen ) , dim_out - 1 >() ( sub_gen , sub_output ) ;
+ _fill < decltype ( sub_gen ) , dim_out - 1 >()
+ ( sub_gen , sub_output ) ;
}
- leftover = 0 ; // no leftovers in this case
- }
-
- else
- {
-// std::cout << "case fill:3" << std::endl ;
- // strided even in dimension 0, no point in recursing. With a warp_generator,
- // we have strided_warp true, so we have the right generator type object.
- // fall back to buffering and storing individual result values
- value_type target_buffer [ vsize ] ;
- for ( int a = 0 ; a < aggregates ; a++ )
- {
- gen ( target_buffer ) ; // pass pointer to buffer to generator
- for ( int e = 0 ; e < vsize ; e++ )
- {
- *target_it = target_buffer[e] ; // and deposit in target
- ++target_it ;
- }
- }
- }
+ // after processing all slices, we're done and can return straight away
+ return ;
+// }
}
+
+// // we are at the intended lowest level of recursion already, so we
+// // do the work here and now. This is the same code as the specialization
+// // for dim_out==1 uses, so this code could be factored out. But we
+// // want the option to do the processing without recursing all the way
+// // down if that isn't necessary. The calling code can figure this out
+// // and set floor_dim accordingly.
+//
+// int aggregates = leftover / vsize ; // number of full vectors
+// leftover -= aggregates * vsize ; // remaining leftover single values
+//
+// if ( output.isUnstrided() )
+// {
+// // best case: output array has consecutive memory
+// // get a pointer to target memory
+// value_type * destination = output.data() ;
+//
+// for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
+// {
+// // pass pointer to target memory to the generator
+// gen ( destination ) ;
+// }
+// // if there aren't any leftovers, we can return straight away.
+// if ( ! leftover )
+// return ;
+//
+// // otherwise, advance target_it to remaining single values
+// target_it += aggregates * vsize ;
+// }
+// else
+// {
+// // fall back to buffering and storing individual result values.
+// // have a buffer ready
+// value_type target_buffer [ vsize ] ;
+//
+// for ( int a = 0 ; a < aggregates ; a++ )
+// {
+// gen ( target_buffer ) ; // pass buffer to the generator
+// for ( int e = 0 ; e < vsize ; e++ )
+// {
+// *target_it = target_buffer[e] ; // and deposit in target
+// ++target_it ;
+// }
+// }
+// }
+// }
#endif // USE_VC
- // process leftovers, if any - if vc isn't used, this loop does all the processing
+ typedef typename generator_type::value_type value_type ;
+
+ auto target_it = output.begin() ;
+ int leftover = output.elementCount() ;
+
+ // process leftovers. If vc isn't used, this loop does all the processing
+
while ( leftover-- )
{
-// std::cout << "fill1:leftovers" << std::endl ;
// process leftovers with single-value evaluation
gen ( *target_it ) ;
++target_it ;
@@ -417,10 +425,6 @@ struct _fill
}
} ;
-/// partial specialization od struct _fill for dimension 1. Here we only have
-/// to distinguish two cases: either the array is unstrided, so we can use the efficient
-/// loop, or it is not, in which case we resort to buffering.
-
template < typename generator_type > // functor object yielding values
struct _fill < generator_type , 1 >
{
@@ -437,33 +441,51 @@ struct _fill < generator_type , 1 >
#ifdef USE_VC
- const int vsize = generator_type::vsize ;
-
if ( use_vc )
{
+ const int vsize = generator_type::vsize ;
int aggregates = leftover / vsize ; // number of full vectors
leftover -= aggregates * vsize ; // remaining leftover single values
- value_type * destination = output.data() ; // pointer to target's memory
- value_type target_buffer [ vsize ] ;
if ( output.isUnstrided() )
{
// best case: output array has consecutive memory
+ // get a pointer to target memory
+ value_type * destination = output.data() ;
+
for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
{
- gen ( destination ) ; // pass pointer to destination to generator
+ // pass pointer to target memory to the generator
+ gen ( destination ) ;
}
- target_it += aggregates * vsize ; // advance target_it to remaining single values
+ // if there aren't any leftovers, we can return straight away.
+ if ( ! leftover )
+ return ;
+
+ // otherwise, advance target_it to remaining single values
+ target_it += aggregates * vsize ;
}
else
{
- // fall back to buffering and storing individual result values. coming from
- // nD via hierarchical descent will never land here, because the case that
- // dimension 0 is strided is dealt with straight away, but the 1D strided
- // case needs this bit:
+ // TODO: we're level 0 now, so we can scatter
+// generator_type::nd_rc_v target_buffer ;
+// for ( int a = 0 ; a < aggregates ; a++ )
+// {
+// gen ( target_buffer ) ; // pass buffer to the generator
+//
+// for ( int d = 0 ; d < value_type::size() ; d++ )
+// {
+// target_buffer[d].scatter ( destination , rc_v::IndexesFromZero * d + d ) ;
+// }
+
+ // fall back to buffering and storing individual result values.
+ // have a buffer ready
+
+ value_type target_buffer [ vsize ] ;
for ( int a = 0 ; a < aggregates ; a++ )
{
- gen ( target_buffer ) ; // pass pointer to buffer to generator
+ gen ( target_buffer ) ; // pass buffer to the generator
+
for ( int e = 0 ; e < vsize ; e++ )
{
*target_it = target_buffer[e] ; // and deposit in target
@@ -471,25 +493,11 @@ struct _fill < generator_type , 1 >
}
}
}
- // deal with leftovers by generating a full simdized datum, then copy
- // only leftover values to target. Comment out this bit to fall back to
- // single-value processing of leftovers. Potentially unsafe, may read-access
- // out-of-bounds memory (not write-access, though). Currently unused.
-// if ( leftover )
-// {
-// gen ( target_buffer ) ;
-// for ( int e = 0 ; e < leftover ; e++ )
-// {
-// *target_it = target_buffer[e] ; // and deposit in target
-// ++target_it ;
-// }
-// leftover = 0 ;
-// }
}
#endif // USE_VC
- // process leftovers, if any - if vc isn't used, this loop does all the processing
+ // process leftovers. If vc isn't used, this loop does all the processing
while ( leftover-- )
{
// process leftovers with single-value evaluation
@@ -499,6 +507,8 @@ struct _fill < generator_type , 1 >
}
} ;
+
+
/// single-threaded fill. This routine receives the range to process and the generator
/// object capable of providing result values. The generator object is set up to provide
/// values for the desired subrange and then passed to _fill, which handles the calls to
@@ -515,7 +525,12 @@ void st_fill ( shape_range_type < dim_out > range ,
auto output = p_output->subarray ( range[0] , range[1] ) ;
- // limit the generator to cover the same range
+ // get a new generator to cover the same range. We need an instance here:
+ // the generator carries state, and we're in a single thread, processing one
+ // chunk of the partitioning, so the generator we have here won't be shared
+ // with other threads (which would be wrong, since it carries state).
+ // But it may be subdivided into yet more generators if fill decides to
+ // slice it and process the slices.
auto gen = p_gen->subrange ( range ) ;
@@ -564,10 +579,84 @@ void fill ( generator_type & gen ,
use_vc ) ; // flag to switch use of Vc on/off
} ;
-/// struct warp_generator combines a warp array and an interpolator into a functor
-/// producing values to store in the output. This functor handles the whole chain of
-/// operations from picking the coordinates from the warp array to delivering target
-/// values. It can be fed to fill() to produce a remap function, see below.
+// /// struct warp_generator combines a warp array and an interpolator into a functor
+// /// producing values to store in the output. This functor handles the whole chain of
+// /// operations from picking the coordinates from the warp array to delivering target
+// /// values. It can be fed to fill() to produce a remap function, see below.
+//
+// template < int dimension ,
+// typename unary_functor_type ,
+// bool strided_warp >
+// struct _warp_generator
+// {
+// typedef typename unary_functor_type::in_type nd_rc_type ;
+// typedef typename unary_functor_type::out_type value_type ;
+//
+// typedef MultiArrayView < dimension , nd_rc_type > warp_array_type ;
+//
+// const warp_array_type warp ; // must not use reference here!
+// typename warp_array_type::const_iterator witer ;
+//
+// const unary_functor_type & itp ;
+//
+// _warp_generator
+// ( const warp_array_type & _warp ,
+// const unary_functor_type & _itp )
+// : warp ( _warp ) ,
+// itp ( _itp ) ,
+// witer ( _warp.begin() )
+// { } ;
+//
+// void operator() ( value_type & target )
+// {
+// itp.eval ( *witer , target ) ;
+// ++witer ;
+// }
+//
+// #ifdef USE_VC
+//
+// enum { vsize = unary_functor_type :: vsize } ;
+//
+// // operator() incorporates two variants, which depend on a template argument,
+// // so the conditional has no run-time effect. We use witer to stay consistent
+// // with the unvectorized version, which may be used to mop up leftovers.
+//
+// void operator() ( value_type * target )
+// {
+// if ( strided_warp )
+// {
+// nd_rc_type buffer [ vsize ] ;
+// for ( int e = 0 ; e < vsize ; e++ , witer++ )
+// buffer[e] = *witer ;
+// itp.eval ( buffer , target ) ;
+// }
+// else
+// {
+// itp.eval ( &(*witer) , target ) ;
+// witer += vsize ;
+// }
+// }
+//
+// #endif
+//
+// _warp_generator < dimension , unary_functor_type , strided_warp >
+// subrange ( const shape_range_type < dimension > & range ) const
+// {
+// return _warp_generator < dimension , unary_functor_type , strided_warp >
+// ( warp.subarray ( range[0] , range[1] ) , itp ) ;
+// }
+//
+// _warp_generator < dimension - 1 , unary_functor_type , strided_warp >
+// bindOuter ( const int & c ) const
+// {
+// return _warp_generator < dimension - 1 , unary_functor_type , strided_warp >
+// ( warp.bindOuter ( c ) , itp ) ;
+// }
+// } ;
+
+/// warp_generator for dimensions > 1. Here we provide 'subrange' and
+/// 'bindOuter' to be used for the hierarchical descent in _fill. The current
+/// implementation relies on the hierarchical descent going all the way to 1D!
template < int dimension ,
typename unary_functor_type ,
@@ -580,18 +669,87 @@ struct warp_generator
typedef MultiArrayView < dimension , nd_rc_type > warp_array_type ;
const warp_array_type warp ; // must not use reference here!
+ const nd_rc_type * data ;
typename warp_array_type::const_iterator witer ;
const unary_functor_type & itp ;
-
+
warp_generator
( const warp_array_type & _warp ,
const unary_functor_type & _itp )
: warp ( _warp ) ,
itp ( _itp ) ,
- witer ( _warp.begin() )
+ witer ( _warp.begin() ) ,
+ data ( _warp.data() )
{ } ;
+ // with Vc present, we only need operator() here for the code to compile;
+ // it's not used. But the unvectorized version may use it.
+
+ void operator() ( value_type & target )
+ {
+ itp.eval ( *witer , target ) ;
+ ++witer ;
+ }
+
+ warp_generator < dimension , unary_functor_type , strided_warp >
+ subrange ( const shape_range_type < dimension > & range ) const
+ {
+ return warp_generator < dimension , unary_functor_type , strided_warp >
+ ( warp.subarray ( range[0] , range[1] ) , itp ) ;
+ }
+
+ warp_generator < dimension - 1 , unary_functor_type , strided_warp >
+ bindOuter ( const int & c ) const
+ {
+ return warp_generator < dimension - 1 , unary_functor_type , strided_warp >
+ ( warp.bindOuter ( c ) , itp ) ;
+ }
+} ;
+
+/// specialization of warp_generator for dimension 1. Here we have the
+/// (vectorized) evaluation code, but no subrange and bindOuter methods.
+
+template < typename unary_functor_type ,
+ bool strided_warp >
+struct warp_generator < 1 , unary_functor_type , strided_warp >
+{
+ typedef typename unary_functor_type::in_type nd_rc_type ;
+ typedef typename unary_functor_type::out_type value_type ;
+
+ typedef MultiArrayView < 1 , nd_rc_type > warp_array_type ;
+
+ const warp_array_type warp ; // must not use reference here!
+ const nd_rc_type * data ;
+ typename warp_array_type::const_iterator witer ;
+
+ const unary_functor_type & itp ;
+
+ warp_generator
+ ( const warp_array_type & _warp ,
+ const unary_functor_type & _itp )
+ : warp ( _warp ) ,
+ itp ( _itp ) ,
+ witer ( _warp.begin() ) ,
+ data ( _warp.data() )
+ {
+#ifdef USE_VC
+ if ( ! strided_warp )
+ {
+ // if the warp array is unstrided, 'witer' will only be used
+ // by the 'mop-up' action processing the leftover single values,
+ // so we advance witer to this location.
+
+ int aggregates = warp.size() / vsize ;
+ witer += aggregates * vsize ;
+ }
+#endif
+ } ;
+
+ // in the context of this class, single value evaluation is only ever
+ // used for mop-up action after all full vectors have been processed.
+ // If Vc isn't used, this routine does all the work.
+
void operator() ( value_type & target )
{
itp.eval ( *witer , target ) ;
@@ -603,13 +761,13 @@ struct warp_generator
enum { vsize = unary_functor_type :: vsize } ;
// operator() incorporates two variants, which depend on a template argument,
- // so the conditional has no run-time effect. We use witer to stay consistent
- // with the unvectorized version, which may be used to mop up leftovers.
+ // so the conditional has no run-time effect.
void operator() ( value_type * target )
{
if ( strided_warp )
{
+ // if the warp array is strided, we collect the values one by one
nd_rc_type buffer [ vsize ] ;
for ( int e = 0 ; e < vsize ; e++ , witer++ )
buffer[e] = *witer ;
@@ -617,26 +775,15 @@ struct warp_generator
}
else
{
- itp.eval ( &(*witer) , target ) ;
- witer += vsize ;
+ // otherwise we can collect them with an efficient load operation,
+ // which is effected by passing a pointer (data) to the evaluation
+ // routine
+ itp.eval ( data , target ) ;
+ data += vsize ;
}
}
#endif
-
- warp_generator < dimension , unary_functor_type , strided_warp >
- subrange ( const shape_range_type < dimension > & range ) const
- {
- return warp_generator < dimension , unary_functor_type , strided_warp >
- ( warp.subarray ( range[0] , range[1] ) , itp ) ;
- }
-
- warp_generator < dimension - 1 , unary_functor_type , strided_warp >
- bindOuter ( const int & c ) const
- {
- return warp_generator < dimension - 1 , unary_functor_type , strided_warp >
- ( warp.bindOuter ( c ) , itp ) ;
- }
} ;
/// implementation of remap() by delegation to the more general fill() routine, passing
@@ -693,7 +840,8 @@ void remap ( const unary_functor_type & ev ,
// if it is strided in higher dimensions, via the hierarchical access we will
// eventually arrive in dimension 0 and iterate over an unstrided array.
// This only matters if Vc is used, because if the warp array is unstrided,
- // the coordinates can be loaded more effectively.
+ // the coordinates can be loaded more effectively. Note that this method
+ // requires that the hierarchical access goes down all the way to 1D.
if ( warp.isUnstrided ( 0 ) )
{
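
As an aside, the core of the new 1D warp_generator logic is the distinction between a
strided and an unstrided warp array: with a stride, the coordinates have to be gathered
one by one into a small buffer before vectorized evaluation, whereas contiguous memory
can simply be handed to the evaluation routine as a pointer, allowing an efficient load.
The following stand-alone sketch shows only that distinction; simd_eval, eval_strided
and eval_unstrided are illustrative names, and vspline's actual code delegates the
vector load to the unary_functor's eval.

#include <cstddef>
#include <vector>
#include <iostream>

const std::size_t vsize = 4 ; // stand-in for the simd vector width

// stand-in for a vectorized evaluation of vsize consecutive coordinates
void simd_eval ( const float * coordinates , float * result )
{
  for ( std::size_t e = 0 ; e < vsize ; e++ )
    result[e] = coordinates[e] * 2.0f ; // dummy 'interpolation'
}

// strided warp: collect the coordinates one by one into a contiguous buffer
void eval_strided ( const float * warp , std::ptrdiff_t stride , float * result )
{
  float buffer [ vsize ] ;
  for ( std::size_t e = 0 ; e < vsize ; e++ )
    buffer[e] = warp [ e * stride ] ;
  simd_eval ( buffer , result ) ;
}

// unstrided warp: the coordinates are already contiguous, so the pointer can
// be passed straight through and the evaluation can use an efficient load
void eval_unstrided ( const float * warp , float * result )
{
  simd_eval ( warp , result ) ;
}

int main()
{
  std::vector < float > warp { 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 } ;
  float result [ vsize ] ;

  eval_strided ( warp.data() , 2 , result ) ;   // uses coordinates 1 3 5 7
  for ( std::size_t e = 0 ; e < vsize ; e++ )
    std::cout << result[e] << " " ;
  std::cout << std::endl ;

  eval_unstrided ( warp.data() , result ) ;     // uses coordinates 1 2 3 4
  for ( std::size_t e = 0 ; e < vsize ; e++ )
    std::cout << result[e] << " " ;
  std::cout << std::endl ;
}
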
diff --git a/unary_functor.h b/unary_functor.h
index b2f4a89..6f6ccde 100644
--- a/unary_functor.h
+++ b/unary_functor.h
@@ -180,8 +180,8 @@ struct unary_functor
/// it's slow compared to proper vector code.
/// But for 'quick shots' one can use this idiom in a derived class:
///
- /// virtual void eval ( const in_v & c ,
- /// out_v & result ) const
+ /// void eval ( const in_v & c ,
+ /// out_v & result ) const
/// {
/// broadcast_eval ( c , result ) ;
/// }
@@ -198,7 +198,7 @@ struct unary_functor
{
for ( int d = 0 ; d < dim_in ; d++ )
pc[d] = c[d][e] ;
- eval ( c , v ) ;
+ eval ( cc , v ) ;
for ( int d = 0 ; d < dim_out ; d++ )
result[d][e] = pv[d] ;
}
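
The unary_functor.h hunk above fixes the 'broadcasting' fallback: the vectorized input
has to be taken apart lane by lane, the scalar eval applied to each scalar datum (cc,
not the vectorized c), and the scalar results written back into the vectorized output.
A minimal stand-alone sketch of that idiom follows; the types and names (simd_t,
scalar_eval, broadcast_eval) are illustrative assumptions, not vspline's actual
signatures.

#include <array>
#include <iostream>

const int vsize = 4 ;    // number of simd lanes
const int dim_in = 2 ;   // input dimension
const int dim_out = 1 ;  // output dimension

// one 'simdized' datum: dim components, each holding vsize lanes
template < int dim >
using simd_t = std::array < std::array < float , vsize > , dim > ;

// the scalar operation: here, simply the sum of the input components
void scalar_eval ( const std::array < float , dim_in > & c ,
                   std::array < float , dim_out > & result )
{
  result[0] = c[0] + c[1] ;
}

// broadcast: apply the scalar operation to every lane of the simdized data
void broadcast_eval ( const simd_t < dim_in > & c ,
                      simd_t < dim_out > & result )
{
  std::array < float , dim_in > cc ;
  std::array < float , dim_out > v ;
  for ( int e = 0 ; e < vsize ; e++ )
  {
    for ( int d = 0 ; d < dim_in ; d++ )
      cc[d] = c[d][e] ;        // de-interleave lane e into a scalar datum
    scalar_eval ( cc , v ) ;   // evaluate the scalar datum, not the vector
    for ( int d = 0 ; d < dim_out ; d++ )
      result[d][e] = v[d] ;    // deposit lane e of the result
  }
}

int main()
{
  simd_t < dim_in > c ;
  for ( int e = 0 ; e < vsize ; e++ )
  {
    c[0][e] = e + 1 ;            // 1 2 3 4
    c[1][e] = 10 * ( e + 1 ) ;   // 10 20 30 40
  }
  simd_t < dim_out > result ;
  broadcast_eval ( c , result ) ;
  for ( int e = 0 ; e < vsize ; e++ )
    std::cout << result[0][e] << " " ;   // prints 11 22 33 44
  std::cout << std::endl ;
}
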
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git