[vspline] 35/72: small changes
Kay F. Jahnke
kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:40 UTC 2017
kfj-guest pushed a commit to branch master
in repository vspline.
commit 8b78ec88881fe6574c6473d9469cbb79efa529d0
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date: Sat Feb 18 12:35:22 2017 +0100
small changes
---
eval.h | 8 +--
interpolator.h | 6 +-
multithread.h | 95 +++++++++++++++++++++++------
remap.h | 185 ++++++++++++++++++++++++++++++++++++++++++++-------------
4 files changed, 228 insertions(+), 66 deletions(-)
diff --git a/eval.h b/eval.h
index 63a27d8..d5fdc63 100644
--- a/eval.h
+++ b/eval.h
@@ -610,7 +610,7 @@ public:
private:
nd_weight_functor fweight ; ///< set of pointers to weight functors, one per dimension
- const array_type& coefficients ; ///< b-spline coefficient array
+ const array_type coefficients ; ///< b-spline coefficient array
const shape_type expanded_stride ; ///< strides in terms of expanded value_type
MultiArray < 1 , ic_type > offsets ; ///< offsets in terms of value_type
MultiArray < 1 , ic_type > component_offsets ; ///< offsets in terms of ele_type, for vc op
@@ -888,9 +888,9 @@ public:
const nd_rc_type& tune
) const
{
- dtype result = ( 1.0 - tune[level] ) * pdata [ *ofs ] ;
+ dtype result = ( 1.0 - tune[0] ) * pdata [ *ofs ] ;
++ofs ;
- result += tune[level] * pdata [ *ofs ] ;
+ result += tune[0] * pdata [ *ofs ] ;
++ofs ;
return result ;
}
@@ -1005,7 +1005,7 @@ public:
if ( specialize == 0 )
{
// nearest neighbour. simply pick the coefficient
- result = coefficients [ sum ( select * coefficients.stride() ) ] ;
+ result = coefficients [ select ] ;
}
else if ( specialize == 1 )
{
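
As an aside on the nearest-neighbour change above: the code now lets the vigra view resolve the nD coordinate itself instead of computing the flat offset from the strides. A minimal standalone sketch of the two indexing styles (not vspline code; the array and coordinate are made up for illustration):

// standalone sketch: index a vigra::MultiArray with an nD coordinate directly,
// versus spelling out the flat offset by hand, as the old code did
#include <vigra/multi_array.hxx>
#include <cassert>

int main()
{
  vigra::MultiArray < 2 , float > a ( vigra::Shape2 ( 4 , 5 ) ) ;
  for ( int i = 0 ; i < 20 ; i++ )
    a [ i ] = float ( i ) ;                  // scan-order access via flat index

  vigra::Shape2 select ( 2 , 3 ) ;           // an nD coordinate into the array

  // variant 1: the view does the address arithmetic
  float direct = a [ select ] ;

  // variant 2: compute the flat offset from the strides by hand
  int offset = select[0] * a.stride()[0] + select[1] * a.stride()[1] ;
  float by_offset = a.data() [ offset ] ;

  assert ( direct == by_offset ) ;           // both pick the same coefficient
  return 0 ;
}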
diff --git a/interpolator.h b/interpolator.h
index 34ec708..b819a3e 100644
--- a/interpolator.h
+++ b/interpolator.h
@@ -186,9 +186,9 @@ struct interpolator
}
} ;
-#else
-
- enum { vsize = 1 } ;
+// #else
+//
+// enum { vsize = 1 } ;
#endif
} ;
diff --git a/multithread.h b/multithread.h
index 9717d71..4bc1915 100644
--- a/multithread.h
+++ b/multithread.h
@@ -272,27 +272,91 @@ struct shape_splitter
/// the second variant of multithread(), which is called with a range and a number
/// of desired threads instead of a ready-made partitioning.
+// template < int d >
+// partition_type < shape_range_type<d> >
+// _partition ( shape_range_type<d> range , int nparts )
+// {
+// if ( range[0].any() )
+// {
+// // the lower limit of the range is not at the origin, so get the shape
+// // of the region between range[0] and range[1], call shape_splitter with
+// // this shape, and add the offset to the lower limit of the original range
+// // to the partial ranges in the result
+// auto shape = range[1] - range[0] ;
+// auto res = shape_splitter < d > :: part ( shape , nparts ) ;
+// for ( auto & r : res )
+// {
+// r[0] += range[0] ;
+// r[1] += range[0] ;
+// }
+// return res ;
+// }
+// // if range[0] is at the origin, we don't have to use an offset
+// return shape_splitter < d > :: part ( range[1] , nparts ) ;
+// }
+
+// alternative partitioning into tiles. For the optimal situation, where
+// the view isn't rotated or pitched much, the partitioning into bunches
+// of lines (above) seems to perform slightly better, but with more difficult
+// transformations (like 90 degree rotation), performance suffers (like, -20%),
+// whereas with this tiled partitioning it is roughly the same, supposedly due
+// to identical locality in both cases. So currently I am using this partitioning.
+// TODO code is a bit clumsy...
+
template < int d >
partition_type < shape_range_type<d> >
partition ( shape_range_type<d> range , int nparts )
{
- if ( range[0].any() )
+ auto shape = range[1] - range[0] ;
+ int nelements = prod ( shape ) ;
+ int ntile = nelements / nparts ;
+ int nedge = 160 ; // heuristic, fixed size // pow ( ntile , ( 1.0 / d ) ) ;
+ auto tiled_shape = shape / nedge ;
+
+ typedef std::vector < int > stopv ;
+ stopv stops [ d ] ;
+ for ( int a = 0 ; a < d ; a++ )
+ {
+ stops[a].push_back ( 0 ) ;
+ for ( int k = 1 ; k < tiled_shape[a] ; k++ )
+ stops[a].push_back ( k * nedge ) ;
+ stops[a].push_back ( shape[a] ) ;
+ }
+
+ for ( int a = 0 ; a < d ; a++ )
+ tiled_shape[a] = stops[a].size() - 1 ;
+
+ nparts = prod ( tiled_shape ) ;
+ partition_type < shape_range_type<d> > res ( nparts ) ;
+
+ for ( int a = 0 ; a < d ; a++ )
{
- // the lower limit of the range is not at the origin, so get the shape
- // of the region between range[0] and range[1], call shape_splitter with
- // this shape, and add the offset to the lower limit of the original range
- // to the partial ranges in the result
- auto shape = range[1] - range[0] ;
- auto res = shape_splitter < d > :: part ( shape , nparts ) ;
- for ( auto & r : res )
+ int j0 = 1 ;
+ for ( int h = 0 ; h < a ; h++ )
+ j0 *= tiled_shape[h] ;
+ int i = 0 ;
+ int j = 0 ;
+ for ( int k = 0 ; k < nparts ; k++ )
{
- r[0] += range[0] ;
- r[1] += range[0] ;
+ res[k][0][a] = stops[a][i] ;
+ res[k][1][a] = stops[a][i+1] ;
+ ++j ;
+ if ( j == j0 )
+ {
+ j = 0 ;
+ ++i ;
+ if ( i >= tiled_shape[a] )
+ i = 0 ;
+ }
}
- return res ;
}
- // if range[0] is at the origin, we don't have use an offset
- return shape_splitter < d > :: part ( range[1] , nparts ) ;
+ for ( auto & e : res )
+ {
+ e[0] += range[0] ;
+ e[1] += range[0] ;
+// std::cout << "tile: " << e[0] << e[1] << std::endl ;
+ }
+ return res ;
}
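
To illustrate what the stop computation above produces per axis, here is a standalone sketch following the same scheme; the helper tile_stops is hypothetical and not part of vspline:

// per-axis tile boundaries: for an axis of extent 'extent' and a fixed tile
// edge of 'nedge', the stops run 0, nedge, 2*nedge, ..., extent, with the
// last tile absorbing the remainder
#include <iostream>
#include <vector>

std::vector < int > tile_stops ( int extent , int nedge )
{
  std::vector < int > stops ;
  stops.push_back ( 0 ) ;
  for ( int k = 1 ; k < extent / nedge ; k++ )
    stops.push_back ( k * nedge ) ;
  stops.push_back ( extent ) ;          // last tile absorbs the remainder
  return stops ;
}

int main()
{
  // an axis of 500 units with nedge == 160 yields tiles [0,160) [160,320) [320,500)
  for ( int s : tile_stops ( 500 , 160 ) )
    std::cout << s << " " ;
  std::cout << std::endl ;              // prints: 0 160 320 500
}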
/// action_wrapper wraps a functional into an outer function which
@@ -405,7 +469,7 @@ int multithread ( void (*pfunc) ( range_type , Types... ) ,
std::unique_lock<std::mutex> lk ( pool_mutex ) ;
- // the predicate done==crew rejects spurious wakes
+ // the predicate done == nparts rejects spurious wakes
pool_cv.wait ( lk , [&] { return done == nparts ; } ) ;
}
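
For context, a minimal self-contained sketch (not vspline code) of the completion-counting pattern this predicate belongs to: each worker increments a counter under the mutex and notifies, and the predicate makes spurious wakeups harmless:

#include <thread>
#include <mutex>
#include <condition_variable>
#include <vector>

int main()
{
  const int nparts = 4 ;
  int done = 0 ;
  std::mutex pool_mutex ;
  std::condition_variable pool_cv ;

  std::vector < std::thread > crew ;
  for ( int i = 0 ; i < nparts ; i++ )
    crew.emplace_back ( [&] ()
      {
        // ... do one share of the work here ...
        std::lock_guard < std::mutex > lk ( pool_mutex ) ;
        ++done ;                       // count this task as finished
        pool_cv.notify_one() ;
      } ) ;

  {
    std::unique_lock < std::mutex > lk ( pool_mutex ) ;
    // the predicate done == nparts rejects spurious wakes
    pool_cv.wait ( lk , [&] { return done == nparts ; } ) ;
  }

  for ( auto & t : crew )
    t.join() ;
}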
@@ -419,9 +483,6 @@ int multithread ( void (*pfunc) ( range_type , Types... ) ,
/// range covering the 'whole' data. partition() is called with the range,
/// resulting in a partitioning which above overload of multithread() can use.
-// TODO we still have to update all calls to multithread() to pass in the
-// number of partial tasks (so we can omit the * 8)
-
template < class range_type , class ...Types >
int multithread ( void (*pfunc) ( range_type , Types... ) ,
int nparts ,
diff --git a/remap.h b/remap.h
index 65ed1de..fdefecd 100644
--- a/remap.h
+++ b/remap.h
@@ -348,7 +348,10 @@ struct _fill
if ( output.isUnstrided() )
{
- // best case: output array has consecutive memory
+// std::cout << "case fill:1" << std::endl ;
+ // best case: output array has consecutive memory. if this test comes
+ // out true, we know that dimension 0 is also unstrided, so for warp_generator,
+ // we have strided_warp false and the efficient iteration is used
for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
{
gen ( destination ) ; // pass pointer to destination to generator
@@ -358,11 +361,16 @@ struct _fill
else if ( output.isUnstrided(0) )
{
- // at least dimension 0 is unstrided, so recursion makes sense
+// std::cout << "case fill:2" << std::endl ;
+ // at least dimension 0 is unstrided, so recursion makes sense. for a warp_generator,
+ // the recursion is even necessary, because with dimension 0 unstrided we have picked
+ // the variant with strided_warp false, so we have to recurse until this is actually
+ // the case.
for ( int c = 0 ; c < output.shape ( dim_out - 1 ) ; c++ )
{
// recursively call _fill for each slice along the highest axis
auto sub_output = output.bindOuter ( c ) ;
+// std::cout << "recursing" << std::endl ;
auto sub_gen = gen.bindOuter ( c ) ;
_fill < decltype ( sub_gen ) , dim_out - 1 >() ( sub_gen , sub_output ) ;
}
@@ -371,7 +379,9 @@ struct _fill
else
{
- // unstrided even in dimension 0, no point in recursing
+// std::cout << "case fill:3" << std::endl ;
+ // strided even in dimension 0, no point in recursing. With a warp_generator,
+ // we have strided_warp true, so we have the right generator type object.
// fall back to buffering and storing individual result values
value_type target_buffer [ vsize ] ;
for ( int a = 0 ; a < aggregates ; a++ )
@@ -391,6 +401,7 @@ struct _fill
// process leftovers, if any - if vc isn't used, this loop does all the processing
while ( leftover-- )
{
+// std::cout << "fill1:leftovers" << std::endl ;
// process leftovers with single-value evaluation
gen ( *target_it ) ;
++target_it ;
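
For context, a minimal sketch of the aggregates/leftover split used throughout _fill; the process function and its callables are hypothetical stand-ins for the generator's vectorized and single-value calls, not vspline code:

#include <cstddef>
#include <functional>
#include <iostream>

// handle as many full vectors of vsize values as fit, then mop up the rest
// one value at a time
void process ( float * data ,
               std::size_t n ,
               std::size_t vsize ,
               std::function < void ( float * ) > v_eval ,
               std::function < void ( float & ) > eval )
{
  std::size_t aggregates = n / vsize ;       // full vectors of vsize values
  std::size_t leftover   = n % vsize ;       // values which don't fill a vector

  for ( std::size_t a = 0 ; a < aggregates ; a++ , data += vsize )
    v_eval ( data ) ;                        // store vsize results at once

  while ( leftover-- )
  {
    eval ( *data ) ;                         // single-value evaluation
    ++data ;
  }
}

int main()
{
  float buf [ 10 ] ;
  process ( buf , 10 , 4 ,
            [] ( float * p ) { for ( int e = 0 ; e < 4 ; e++ ) p[e] = 1.0f ; } ,
            [] ( float & v ) { v = 2.0f ; } ) ;
  // buf now holds 1 1 1 1 1 1 1 1 2 2 : two full vectors, two leftovers
  for ( float v : buf )
    std::cout << v << " " ;
  std::cout << std::endl ;
}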
@@ -428,6 +439,7 @@ struct _fill < generator_type , 1 >
if ( output.isUnstrided() )
{
+// std::cout << "case fill1:1" << std::endl ;
// best case: output array has consecutive memory
for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
{
@@ -438,7 +450,11 @@ struct _fill < generator_type , 1 >
else
{
- // fall back to buffering and storing individual result values
+// std::cout << "case fill1:2" << std::endl ;
+    // fall back to buffering and storing individual result values. calls coming
+    // from nD via hierarchical descent will never land here, because the case where
+    // dimension 0 is strided is dealt with straight away, but the 1D strided
+    // case needs this bit:
value_type target_buffer [ vsize ] ;
for ( int a = 0 ; a < aggregates ; a++ )
{
@@ -457,6 +473,7 @@ struct _fill < generator_type , 1 >
// process leftovers, if any - if vc isn't used, this loop does all the processing
while ( leftover-- )
{
+// std::cout << "fill1:leftovers" << std::endl ;
// process leftovers with single-value evaluation
gen ( *target_it ) ;
++target_it ;
@@ -510,9 +527,9 @@ void fill ( generator_type & gen ,
// heuristic. desired number of partitions. When working on images,
// we try setting up the jobs to do 4 lines each:
- int njobs = output.shape ( dim_target - 1 ) / 4 ;
- if ( njobs < 16 ) // small thing, try at least 16
- njobs = 16 ;
+ int njobs = 64 ; // = output.shape ( dim_target - 1 ) / 4 ;
+// if ( njobs < 16 ) // small thing, try at least 16
+// njobs = 16 ;
// call multithread(), specifying the single-threaded fill routine as the functor
// to invoke the threads, passing in 'range', which will be partitioned by multithread(),
@@ -535,7 +552,7 @@ void fill ( generator_type & gen ,
template < int dimension ,
typename interpolator_type ,
- bool strided_warp = true >
+ bool strided_warp >
struct warp_generator
{
typedef typename interpolator_type::nd_rc_type nd_rc_type ;
@@ -545,7 +562,6 @@ struct warp_generator
const warp_array_type warp ; // must not use reference here!
typename warp_array_type::const_iterator witer ;
- const nd_rc_type * warp_data ;
const interpolator_type & itp ;
@@ -554,19 +570,13 @@ struct warp_generator
const interpolator_type & _itp )
: warp ( _warp ) ,
itp ( _itp ) ,
- warp_data ( warp.data() ) ,
witer ( _warp.begin() )
{ } ;
void operator() ( value_type & target )
{
- if ( strided_warp )
- itp.eval ( *witer , target ) ;
- else
- {
- itp.eval ( *warp_data , target ) ;
- ++warp_data ;
- }
+ itp.eval ( *witer , target ) ;
+ ++witer ;
}
#ifdef USE_VC
@@ -574,13 +584,10 @@ struct warp_generator
typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
enum { vsize = vector_traits < ele_type > :: vsize } ;
-#else
+ // operator() incorporates two variants, which depend on a template argument,
+ // so the conditional has no run-time effect. We use witer to stay consistent
+ // with the unvectorized version, which may be used to mop up leftovers.
- enum { vsize = 1 } ;
-
-#endif
-
-
void operator() ( value_type * target )
{
if ( strided_warp )
@@ -592,11 +599,13 @@ struct warp_generator
}
else
{
- itp.v_eval ( warp_data , target ) ;
- warp_data += vsize ;
+ itp.v_eval ( &(*witer) , target ) ;
+ witer += vsize ;
}
}
+#endif
+
warp_generator < dimension , interpolator_type , strided_warp >
subrange ( const shape_range_type < dimension > & range ) const
{
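
For context, a minimal standalone sketch (not vspline code) of dispatching on a bool template argument as warp_generator does with strided_warp; since the flag is a compile-time constant, the plain if costs nothing at run time and the dead branch can be discarded:

#include <iostream>
#include <cstddef>

// 'strided' mirrors the role of strided_warp
template < bool strided >
struct reader
{
  void operator() ( std::ptrdiff_t stride ) const
  {
    if ( strided )
      std::cout << "per-element gather, stride " << stride << std::endl ;
    else
      std::cout << "contiguous vector load" << std::endl ;
  }
} ;

int main()
{
  reader < true > () ( 3 ) ;    // prints: per-element gather, stride 3
  reader < false > () ( 1 ) ;   // prints: contiguous vector load
}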
@@ -628,14 +637,22 @@ void remap ( interpolator_type & ev ,
bool use_vc = true
)
{
- if ( warp.isUnstrided() )
+ // we test if the warp array is unstrided in dimension 0. If that is so, even
+ // if it is strided in higher dimensions, via the hierarchical access we will
+ // eventually arrive in dimension 0 and iterate over an unstrided array.
+ // This only matters if Vc is used, because if the warp array is unstrided,
+ // the coordinates can be loaded more effectively.
+
+ if ( warp.isUnstrided ( 0 ) )
{
+ // set strided_warp to false vvv
typedef warp_generator < dim_target , interpolator_type , false > gen_t ;
gen_t gen ( warp , ev ) ;
fill < gen_t , dim_target > ( gen , output , use_vc ) ;
}
else
{
+ // warp array is strided even in dimension 0
typedef warp_generator < dim_target , interpolator_type , true > gen_t ;
gen_t gen ( warp , ev ) ;
fill < gen_t , dim_target > ( gen , output , use_vc ) ;
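
For context, a small standalone sketch (not vspline code) of what the isUnstrided test distinguishes on a vigra view: a view can be strided as a whole yet still unstrided in dimension 0, and only a view strided in dimension 0 itself takes the fallback path above:

#include <vigra/multi_array.hxx>
#include <iostream>

int main()
{
  vigra::MultiArray < 2 , float > a ( vigra::Shape2 ( 6 , 4 ) ) ;

  // the full array occupies consecutive memory
  std::cout << a.isUnstrided() << std::endl ;               // 1

  // a subarray covering part of dimension 0 is strided as a whole, but still
  // unstrided in dimension 0, so the efficient warp_generator variant applies
  auto sub = a.subarray ( vigra::Shape2 ( 0 , 0 ) , vigra::Shape2 ( 3 , 4 ) ) ;
  std::cout << sub.isUnstrided() << " "
            << sub.isUnstrided ( 0 ) << std::endl ;         // 0 1

  // a transposed view is strided even in dimension 0
  std::cout << a.transpose().isUnstrided ( 0 ) << std::endl ; // 0
}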
@@ -728,43 +745,44 @@ struct index_generator
typedef typename interpolator_type::nd_rc_type nd_rc_type ;
const interpolator_type & itp ;
- const shape_range_type < dim_target > & range ;
-
+ const shape_range_type < dim_target > range ;
vigra::MultiCoordinateIterator < dim_source > mci ;
index_generator
( const interpolator_type & _itp ,
- const shape_range_type < dim_target > & _range
+ const shape_range_type < dim_target > _range
)
: itp ( _itp ) ,
range ( _range ) ,
mci ( _range[1] - _range[0] )
-#ifdef USE_VC
- , civ ( _range[0] , _range[1] )
-#endif
- {
- } ;
+ { } ;
void operator() ( value_type & target )
{
- itp.eval ( * ( mci + range[0] ) , target ) ;
+ itp.eval ( *mci + range[0] , target ) ;
++mci ;
}
#ifdef USE_VC
- coordinate_iterator_v < rc_type , ic_type , dim_source , vsize > civ ;
-
void operator() ( value_type * target )
{
- itp.v_eval ( *civ , target ) ;
- ++civ ;
+ typename interpolator_type::nd_ic_v index ;
+ for ( int e = 0 ; e < vsize ; e++ )
+ {
+ for ( int d = 0 ; d < dim_source ; d++ )
+ {
+ index[d][e] = (*mci)[d] + range[0][d] ;
+ }
+ ++mci ;
+ }
+ itp.v_eval ( index , target ) ;
}
#endif
index_generator < dim_target , interpolator_type >
- subrange ( const shape_range_type < dim_target > & range ) const
+ subrange ( const shape_range_type < dim_target > range ) const
{
return index_generator < dim_target , interpolator_type >
( itp , range ) ;
@@ -775,14 +793,97 @@ struct index_generator
{
nd_ic_type slice_start = range[0] , slice_end = range[1] ;
- slice_start [ dim_target - 1 ] = c ;
- slice_end [ dim_target - 1 ] = c ;
+ slice_start [ dim_target - 1 ] += c ;
+ slice_end [ dim_target - 1 ] = slice_start [ dim_target - 1 ] + 1 ;
return index_generator < dim_target , interpolator_type >
( itp , shape_range_type < dim_target > ( slice_start , slice_end ) ) ;
}
} ;
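
For context, a minimal standalone sketch (not vspline code; plain arrays stand in for the Vc types) of the gather the vectorized operator() above performs: scalar nD coordinates are copied lane by lane into structure-of-arrays form so a SIMD evaluator can consume each component row whole:

#include <iostream>

const int dim = 2 ;      // dimension of the coordinates
const int vsize = 4 ;    // lanes per SIMD vector

int main()
{
  // stand-in for MultiCoordinateIterator: coordinates of a 4x4 grid in scan order
  int grid [ 16 ] [ dim ] ;
  for ( int y = 0 , k = 0 ; y < 4 ; y++ )
    for ( int x = 0 ; x < 4 ; x++ , k++ )
    {
      grid[k][0] = x ;
      grid[k][1] = y ;
    }

  // gather the first vsize coordinates into structure-of-arrays form:
  // index[d][e] holds component d of the coordinate in lane e
  int index [ dim ] [ vsize ] ;
  for ( int e = 0 ; e < vsize ; e++ )
    for ( int d = 0 ; d < dim ; d++ )
      index[d][e] = grid[e][d] ;

  // index[0] now holds the x components of four successive grid points,
  // index[1] the y components
  for ( int d = 0 ; d < dim ; d++ )
  {
    for ( int e = 0 ; e < vsize ; e++ )
      std::cout << index[d][e] << " " ;
    std::cout << std::endl ;
  }
  // prints: 0 1 2 3
  //         0 0 0 0
}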
+// template < int dim_target , typename interpolator_type >
+// struct index_generator
+// {
+// typedef typename interpolator_type::value_type value_type ;
+//
+// enum { dim_source = interpolator_type::dimension } ;
+//
+// #ifdef USE_VC
+//
+// enum { vsize = interpolator_type :: vsize } ;
+//
+// #else
+//
+// enum { vsize = 1 } ;
+//
+// #endif
+//
+// typedef typename interpolator_type::ic_type ic_type ;
+// typedef typename interpolator_type::rc_type rc_type ;
+// typedef typename interpolator_type::nd_ic_type nd_ic_type ;
+// typedef typename interpolator_type::nd_rc_type nd_rc_type ;
+//
+// const interpolator_type & itp ;
+// const shape_range_type < dim_target > range ;
+//
+// vigra::MultiCoordinateIterator < dim_source > mci ;
+//
+// #ifdef USE_VC
+//
+// coordinate_iterator_v < rc_type , ic_type , dim_source , vsize > civ ;
+//
+// #endif
+//
+// index_generator
+// ( const interpolator_type & _itp ,
+// const shape_range_type < dim_target > _range
+// )
+// : itp ( _itp ) ,
+// range ( _range ) ,
+// mci ( _range[1] - _range[0] )
+// #ifdef USE_VC
+// , civ ( _range[0] , _range[1] )
+// #endif
+// {
+// } ;
+//
+// void operator() ( value_type & target )
+// {
+// itp.eval ( * ( mci + range[0] ) , target ) ;
+// ++mci ;
+// }
+//
+// #ifdef USE_VC
+//
+// void operator() ( value_type * target )
+// {
+// itp.v_eval ( *civ , target ) ;
+// ++civ ;
+// mci += vsize ; // this sucks...
+// }
+//
+// #endif
+//
+// index_generator < dim_target , interpolator_type >
+// subrange ( const shape_range_type < dim_target > range ) const
+// {
+// return index_generator < dim_target , interpolator_type >
+// ( itp , range ) ;
+// }
+//
+// index_generator < dim_target , interpolator_type >
+// bindOuter ( const int & c ) const
+// {
+// nd_ic_type slice_start = range[0] , slice_end = range[1] ;
+//
+// slice_start [ dim_target - 1 ] += c ;
+// slice_end [ dim_target - 1 ] = slice_start [ dim_target - 1 ] + 1 ;
+//
+// return index_generator < dim_target , interpolator_type >
+// ( itp , shape_range_type < dim_target > ( slice_start , slice_end ) ) ;
+// }
+// } ;
+
template < class interpolator_type , // type satisfying the interface in class interpolator
int dim_target > // dimension of target array
void index_remap ( const interpolator_type & ev ,
--