[vspline] 33/72: added specializations for splines of degree 0 and 1 to have specialized nearest-neighbour and linear interpolators in vspline, which makes it as fast as hand-coded code for the purpose. Also introduced 'evenness' for raw-mode remaps without using mmap.
Kay F. Jahnke
kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:40 UTC 2017
This is an automated email from the git hooks/post-receive script.
kfj-guest pushed a commit to branch master
in repository vspline.
commit af741ec1ce101f02996bb7b1d0ac0ab2a5ca9e5e
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date: Wed Feb 8 12:07:02 2017 +0100
added specializations for splines of degree 0 and 1 to have specialized
nearest-neighbour and linear interpolators in vspline, which makes it as fast
as hand-coded code for the purpose.
Also introduced 'evenness' for raw-mode remaps without using mmap.
---
eval.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++--------
mapping.h | 6 ++++-
remap.h | 37 ++++++++++++++++++-------------
thread_pool.h | 2 +-
4 files changed, 89 insertions(+), 26 deletions(-)
diff --git a/eval.h b/eval.h
index c41c7e6..770a84e 100644
--- a/eval.h
+++ b/eval.h
@@ -465,7 +465,8 @@ template < int _dimension , ///< dimension of the spline
class _rc_type , ///< singular real coordinate, float or double
class _ic_type , ///< singular integral coordinate, currently only int
bool use_tag = false , ///< flag switching tag checking on/off
- int specialize = -1 >
+ int specialize = -1 ,
+ int evenness = -1 >
class evaluator
// we could inherit from class interpolator, since class evaluator satisfies the
// interpolator interface, but there's no advantage in doing so.
@@ -1038,18 +1039,35 @@ public:
{
nd_ic_type select ;
nd_rc_type tune ;
- mmap ( c , select , tune ) ; /// apply the mappings
+
+ if ( evenness == 1 )
+ {
+ // use raw mapping for even splines
+ for ( int d = 0 ; d < dimension ; d++ )
+ even_mapping_raw < ic_type , rc_type > ()
+ ( c[d] , select[d] , tune[d] ) ;
+ }
+ else if ( evenness == 0 )
+ {
+ // use raw mapping for odd splines
+ for ( int d = 0 ; d < dimension ; d++ )
+ odd_mapping_raw < ic_type , rc_type > ()
+ ( c[d] , select[d] , tune[d] ) ;
+ }
+ else
+ {
+ // no specialization; fall back to calling mmap
+ mmap ( c , select , tune ) ;
+ }
+
eval ( select , tune , result ) ;
}
value_type eval ( const nd_rc_type& c ) const // unsplit coordinate
{
- nd_ic_type select ;
- nd_rc_type tune ;
value_type result ;
- mmap ( c , select , tune ) ; /// apply the mappings
- eval ( select , tune , result ) ;
+ eval ( c , result ) ;
return result ;
}
@@ -1617,10 +1635,44 @@ public:
nd_rc_v tune ;
// map the incoming coordinates to the spline's range
+ // here we have another specialization: 'evenness' is an
+ // int template argument, which can be set to 0 (odd spline),
+ // 1 (even spline) or some other value (not determined).
+ // With the flag set to 0 or 1, translation of incoming
+ // coordinates to split coordinates is performed in RAW
+ // mode, relying on the calling code to have mapped the
+ // coordinate to the allowed range. This is an optimization,
+ // because in these cases the mapping code can be inlined,
+ // whereas in the indeterminate case 'mmap' is used which
+ // realizes the mapping by a function call via a function
+ // pointer, which is constant from the creation of the
+ // evaluator, but nevertheless may not be optimized like
+ // inlined code.
+ // since we are very much inner loop here, I have chosen to
+ // implement this optimization even though this increases
+ // complexity.
+
+ if ( evenness == 1 )
+ {
+ // use raw mapping for even splines
+ for ( int d = 0 ; d < dimension ; d++ )
+ even_mapping_raw < ic_type , rc_type , vsize > ()
+ ( input[d] , select[d] , tune[d] ) ;
+ }
+ else if ( evenness == 0 )
+ {
+ // use raw mapping for odd splines
+ for ( int d = 0 ; d < dimension ; d++ )
+ odd_mapping_raw < ic_type , rc_type , vsize > ()
+ ( input[d] , select[d] , tune[d] ) ;
+ }
+ else
+ {
+ // no specialization; fall back to calling mmap
+ mmap ( input , select , tune ) ;
+ }
- mmap ( input , select , tune ) ;
-
- // delegate
+ // delegate to v_eval with split coordinates
v_eval ( select , tune , result ) ;
}
diff --git a/mapping.h b/mapping.h
index fafe8c5..723c423 100644
--- a/mapping.h
+++ b/mapping.h
@@ -153,9 +153,13 @@ rc_v v_fmod ( const rc_v& lhs ,
const typename rc_v::EntryType rhs )
{
typedef typename vector_traits < int , rc_v::Size > :: type ic_v ;
-
+
ic_v iv ( lhs / rhs ) ;
return lhs - rhs * rc_v ( iv ) ;
+// or, to always get a positive result:
+// auto result = lhs - rhs * rc_v ( iv ) ;
+// result ( result < 0 ) = result + rhs ;
+// return result ;
}
#endif // USE_VC
diff --git a/remap.h b/remap.h
index 55fc9da..90aca7d 100644
--- a/remap.h
+++ b/remap.h
@@ -666,14 +666,20 @@ template < typename generator_type , // functor object yielding values
void fill ( generator_type & gen ,
MultiArrayView < dim_target , typename generator_type::value_type >
& output ,
- bool use_vc = true ,
- int nthreads = ncores
+ bool use_vc = true
)
{
// set up 'range' to cover a complete array of output's size
- shape_range_type < dim_target > range ( shape_type < dim_target > () , output.shape() ) ;
-
+ shape_range_type < dim_target > range ( shape_type < dim_target > () ,
+ output.shape() ) ;
+
+ // heuristic. desired number of partitions. When working on images,
+ // we try setting up the jobs to do 4 lines each:
+ int njobs = output.shape ( dim_target - 1 ) / 4 ;
+ if ( njobs < 16 ) // small thing, try at least 16
+ njobs = 16 ;
+
// call multithread(), specifying the single-threaded fill routine as the functor
// to invoke the threads, passing in 'range', which will be partitioned by multithread(),
// followed by all the other parameters the single-threaded fill needs, which is
@@ -681,7 +687,7 @@ void fill ( generator_type & gen ,
// that we can't pass anything on by reference and use pointers instead.
multithread ( & st_fill < generator_type , dim_target > ,
- nthreads * 8 , // heuristic. desired number of partitions
+ njobs , // desired number of partitions
range , // 'full' range which is to be partitioned
&gen , // generator_type object
&output , // target array
@@ -785,21 +791,20 @@ void remap ( interpolator_type & ev ,
typename interpolator_type::nd_rc_type > & warp ,
MultiArrayView < dim_target ,
typename interpolator_type::value_type > & output ,
- bool use_vc = true ,
- int nthreads = ncores
+ bool use_vc = true
)
{
if ( warp.isUnstrided() )
{
typedef warp_generator < dim_target , interpolator_type , false > gen_t ;
gen_t gen ( warp , ev ) ;
- fill < gen_t , dim_target > ( gen , output , use_vc , nthreads ) ;
+ fill < gen_t , dim_target > ( gen , output , use_vc ) ;
}
else
{
typedef warp_generator < dim_target , interpolator_type , true > gen_t ;
gen_t gen ( warp , ev ) ;
- fill < gen_t , dim_target > ( gen , output , use_vc , nthreads ) ;
+ fill < gen_t , dim_target > ( gen , output , use_vc ) ;
}
}
@@ -993,8 +998,7 @@ void tf_remap ( const interpolator_type & ev ,
interpolator_type::vsize > tf ,
MultiArrayView < dim_target ,
typename interpolator_type::value_type > & output ,
- bool use_vc = true ,
- int nthreads = ncores
+ bool use_vc = true
)
{
typedef typename interpolator_type::value_type value_type ;
@@ -1003,7 +1007,7 @@ void tf_remap ( const interpolator_type & ev ,
shape_range_type < dim_target > range ( nd_ic_type() , output.shape() ) ;
gen_t gen ( tf , ev , range ) ;
- fill < gen_t , dim_target > ( gen , output , use_vc , nthreads ) ;
+ fill < gen_t , dim_target > ( gen , output , use_vc ) ;
}
/// this highest-level transform-based remap takes an input array and creates
@@ -1147,14 +1151,17 @@ void make_warp_array ( transformation < rc_type ,
MultiArrayView < dim_out ,
vigra::TinyVector < rc_type , dim_in > >
& warp_array ,
- bool use_vc = true ,
- int nthreads = ncores
+ bool use_vc = true
)
{
+ int njobs = warp_array.shape ( dim_out - 1 ) / 4 ;
+ if ( njobs < 16 )
+ njobs = 16 ;
+
shape_range_type < dim_out > range ( shape_type < dim_out > () , warp_array.shape() ) ;
multithread ( st_make_warp_array < value_type , rc_type , dim_in , dim_out > ,
- nthreads * 8 , range , tf , warp_array , use_vc ) ;
+ njobs , range , tf , warp_array , use_vc ) ;
}
diff --git a/thread_pool.h b/thread_pool.h
index bf03f93..0214b44 100644
--- a/thread_pool.h
+++ b/thread_pool.h
@@ -119,7 +119,7 @@ private:
public:
- thread_pool ( int nthreads = 2 * std::thread::hardware_concurrency() )
+ thread_pool ( int nthreads = 4 * std::thread::hardware_concurrency() )
{
// to launch a thread with a method, we need to bind it to the object:
std::function < void() > wf = std::bind ( &thread_pool::worker_thread , this ) ;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git
More information about the debian-science-commits
mailing list