[vspline] 33/72: added specializations for splines of degree 0 and 1 to have specialized nearest-neighbour and linear interpolators in vspline, which makes it as fast as hand-coded code for the purpose. Also introduced 'evenness' for raw-mode remaps without using mmap.

Sun Jul 2 09:02:40 UTC 2017

This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit af741ec1ce101f02996bb7b1d0ac0ab2a5ca9e5e
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Wed Feb 8 12:07:02 2017 +0100

    added specializations for splines of degree 0 and 1 to have specialized
    nearest-neighbour and linear interpolators in vspline, which makes it as fast
    as hand-coded code for the purpose.
    Also introduced 'evenness' for raw-mode remaps without using mmap.
---
 eval.h        | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++--------
 mapping.h     |  6 ++++-
 remap.h       | 37 ++++++++++++++++++-------------
 thread_pool.h |  2 +-
 4 files changed, 89 insertions(+), 26 deletions(-)

diff --git a/eval.h b/eval.h
index c41c7e6..770a84e 100644
--- a/eval.h
+++ b/eval.h
@@ -465,7 +465,8 @@ template < int _dimension ,         ///< dimension of the spline
            class _rc_type ,         ///< singular real coordinate, float or double
            class _ic_type ,         ///< singular integral coordinate, currently only int
            bool use_tag = false ,   ///< flag switching tag checking on/off
-           int specialize = -1 >
+           int specialize = -1 ,
+           int evenness = -1 >
 class evaluator
 // we could inherit from class interpolator, since class evaluator satisfies the
 // interpolator interface, but there's no advantage in doing so.
@@ -1038,18 +1039,35 @@ public:
   {
     nd_ic_type select ;
     nd_rc_type tune ;
-    mmap ( c , select , tune ) ;  /// apply the mappings
+    
+    if ( evenness == 1 )
+    {
+      // use raw mapping for even splines
+      for ( int d = 0 ; d < dimension ; d++ )
+        even_mapping_raw < ic_type , rc_type > ()
+          ( c[d] , select[d] , tune[d] ) ;
+    }
+    else if ( evenness == 0 )
+    {
+      // use raw mapping for odd splines
+      for ( int d = 0 ; d < dimension ; d++ )
+        odd_mapping_raw < ic_type , rc_type > ()
+          ( c[d] , select[d] , tune[d] ) ;
+    }
+    else
+    {
+      // no specialization; fall back to calling mmap
+      mmap ( c , select , tune ) ;
+    }
+
     eval ( select , tune , result ) ;
   }
   
   value_type eval ( const nd_rc_type& c ) const // unsplit coordinate
   {
-    nd_ic_type select ;
-    nd_rc_type tune ;
     value_type result ;
     
-    mmap ( c , select , tune ) ;  /// apply the mappings
-    eval ( select , tune , result ) ;
+    eval ( c , result ) ;
     return result ;
   }
   
@@ -1617,10 +1635,44 @@ public:
     nd_rc_v tune ;
 
     // map the incoming coordinates to the spline's range
+    // here we have another specialization: 'evenness' is an
+    // int template argument, which can be set to 0 (odd spline),
+    // 1 (even spline) or some other value (not determined).
+    // With the flag set to 0 or 1, translation of incoming
+    // coordinates to split coordinates is performed in RAW
+    // mode, relying on the calling code to have mapped the
+    // coordinate to the allowed range. This is an optimization,
+    // because in these cases the mapping code can be inlined,
+    // whereas in the indeterminate case 'mmap' is used which
+    // realizes the mapping by a function call via a function
+    // pointer, which is constant from the creation of the
+    // evaluator, but nevertheless may not be optimized like
+    // inlined code.
+    // since we are very much inner loop here, I have chosen to
+    // implement this optimization even though this increases
+    // complexity.
+  
+    if ( evenness == 1 )
+    {
+      // use raw mapping for even splines
+      for ( int d = 0 ; d < dimension ; d++ )
+        even_mapping_raw < ic_type , rc_type , vsize > ()
+          ( input[d] , select[d] , tune[d] ) ;
+    }
+    else if ( evenness == 0 )
+    {
+      // use raw mapping for odd splines
+      for ( int d = 0 ; d < dimension ; d++ )
+        odd_mapping_raw < ic_type , rc_type , vsize > ()
+          ( input[d] , select[d] , tune[d] ) ;
+    }
+    else
+    {
+      // no specialization; fall back to calling mmap
+      mmap ( input , select , tune ) ;
+    }
 
-    mmap ( input , select , tune ) ;
-
-    // delegate
+    // delegate to v_eval with split coordinates
 
     v_eval ( select , tune , result ) ;
   }
diff --git a/mapping.h b/mapping.h
index fafe8c5..723c423 100644
--- a/mapping.h
+++ b/mapping.h
@@ -153,9 +153,13 @@ rc_v v_fmod ( const rc_v& lhs ,
               const typename rc_v::EntryType rhs )
 {
   typedef typename vector_traits < int , rc_v::Size > :: type ic_v ;
-  
+
   ic_v iv ( lhs / rhs ) ;
   return lhs - rhs * rc_v ( iv ) ;
+// or, to always get a positive result:
+//   auto result = lhs - rhs * rc_v ( iv ) ;
+//   result ( result < 0 ) = result + rhs ;
+//   return result ;
 }
 
 #endif // USE_VC
diff --git a/remap.h b/remap.h
index 55fc9da..90aca7d 100644
--- a/remap.h
+++ b/remap.h
@@ -666,14 +666,20 @@ template < typename generator_type  , // functor object yielding values
 void fill ( generator_type & gen ,
             MultiArrayView < dim_target , typename generator_type::value_type >
               & output ,
-            bool use_vc = true ,
-            int nthreads = ncores
+            bool use_vc = true
           )
 {
   // set up 'range' to cover a complete array of output's size
   
-  shape_range_type < dim_target > range ( shape_type < dim_target > () , output.shape() ) ;
-  
+  shape_range_type < dim_target > range ( shape_type < dim_target > () ,
+                                          output.shape() ) ;
+
+  // heuristic. desired number of partitions. When working on images,
+  // we try setting up the jobs to do 4 lines each: 
+  int njobs = output.shape ( dim_target - 1 ) / 4 ;
+  if ( njobs < 16 ) // small thing, try at least 16
+    njobs = 16 ;
+
   // call multithread(), specifying the single-threaded fill routine as the functor
   // to invoke the threads, passing in 'range', which will be partitioned by multithread(),
   // followed by all the other parameters the single-threaded fill needs, which is
@@ -681,7 +687,7 @@ void fill ( generator_type & gen ,
   // that we can't pass anything on by reference and use pointers instead.
 
   multithread ( & st_fill < generator_type , dim_target > ,
-                nthreads * 8 , // heuristic. desired number of partitions
+                njobs ,        // desired number of partitions
                 range ,        // 'full' range which is to be partitioned
                 &gen ,         // generator_type object
                 &output ,      // target array
@@ -785,21 +791,20 @@ void remap ( interpolator_type & ev ,
                                     typename interpolator_type::nd_rc_type > & warp ,
              MultiArrayView < dim_target ,
                               typename interpolator_type::value_type > & output ,
-             bool use_vc = true ,
-             int nthreads = ncores
+             bool use_vc = true
            )
 {
   if ( warp.isUnstrided() )
   {
     typedef warp_generator < dim_target , interpolator_type , false > gen_t ;  
     gen_t gen ( warp , ev ) ;  
-    fill < gen_t , dim_target > ( gen , output , use_vc , nthreads ) ;
+    fill < gen_t , dim_target > ( gen , output , use_vc ) ;
   }
   else
   {
     typedef warp_generator < dim_target , interpolator_type , true > gen_t ;  
     gen_t gen ( warp , ev ) ;  
-    fill < gen_t , dim_target > ( gen , output , use_vc , nthreads ) ;
+    fill < gen_t , dim_target > ( gen , output , use_vc ) ;
   }
 }
 
@@ -993,8 +998,7 @@ void tf_remap ( const interpolator_type & ev ,
                                  interpolator_type::vsize > tf ,
                 MultiArrayView < dim_target ,
                                  typename interpolator_type::value_type > & output ,
-                bool use_vc = true ,
-                int nthreads = ncores            
+                bool use_vc = true           
               )
 {
   typedef typename interpolator_type::value_type value_type ;
@@ -1003,7 +1007,7 @@ void tf_remap ( const interpolator_type & ev ,
 
   shape_range_type < dim_target > range ( nd_ic_type() , output.shape() ) ;  
   gen_t gen ( tf , ev , range ) ;  
-  fill < gen_t , dim_target > ( gen , output , use_vc , nthreads ) ;
+  fill < gen_t , dim_target > ( gen , output , use_vc ) ;
 }
 
 /// this highest-level transform-based remap takes an input array and creates
@@ -1147,14 +1151,17 @@ void make_warp_array ( transformation < rc_type ,
                       MultiArrayView < dim_out ,
                                        vigra::TinyVector < rc_type , dim_in > >
                         & warp_array ,
-                      bool use_vc = true ,
-                      int nthreads = ncores
+                      bool use_vc = true
                     )
 {
+  int njobs = warp_array.shape ( dim_out - 1 ) / 4 ;
+  if ( njobs < 16 )
+    njobs = 16 ;
+  
   shape_range_type < dim_out > range ( shape_type < dim_out > () , warp_array.shape() ) ;
   
   multithread ( st_make_warp_array < value_type , rc_type , dim_in , dim_out > ,
-                nthreads * 8 , range , tf , warp_array , use_vc ) ;
+                njobs , range , tf , warp_array , use_vc ) ;
   
 }
 
diff --git a/thread_pool.h b/thread_pool.h
index bf03f93..0214b44 100644
--- a/thread_pool.h
+++ b/thread_pool.h
@@ -119,7 +119,7 @@ private:
 
 public:
   
-  thread_pool ( int nthreads = 2 * std::thread::hardware_concurrency() )
+  thread_pool ( int nthreads = 4 * std::thread::hardware_concurrency() )
   {
     // to launch a thread with a method, we need to bind it to the object:
     std::function < void() > wf = std::bind ( &thread_pool::worker_thread , this ) ;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git