[vspline] 35/72: small changes
Kay F. Jahnke
kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:40 UTC 2017
kfj-guest pushed a commit to branch master
in repository vspline.
commit 8b78ec88881fe6574c6473d9469cbb79efa529d0
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date: Sat Feb 18 12:35:22 2017 +0100
small changes
---
eval.h | 8 +--
interpolator.h | 6 +-
multithread.h | 95 +++++++++++++++++++++++------
remap.h | 185 ++++++++++++++++++++++++++++++++++++++++++++-------------
4 files changed, 228 insertions(+), 66 deletions(-)
diff --git a/eval.h b/eval.h
index 63a27d8..d5fdc63 100644
--- a/eval.h
+++ b/eval.h
@@ -610,7 +610,7 @@ public:
private:
nd_weight_functor fweight ; ///< set of pointers to weight functors, one per dimension
- const array_type& coefficients ; ///< b-spline coefficient array
+ const array_type coefficients ; ///< b-spline coefficient array
const shape_type expanded_stride ; ///< strides in terms of expanded value_type
MultiArray < 1 , ic_type > offsets ; ///< offsets in terms of value_type
MultiArray < 1 , ic_type > component_offsets ; ///< offsets in terms of ele_type, for vc op
@@ -888,9 +888,9 @@ public:
const nd_rc_type& tune
) const
{
- dtype result = ( 1.0 - tune[level] ) * pdata [ *ofs ] ;
+ dtype result = ( 1.0 - tune[0] ) * pdata [ *ofs ] ;
++ofs ;
- result += tune[level] * pdata [ *ofs ] ;
+ result += tune[0] * pdata [ *ofs ] ;
++ofs ;
return result ;
}
@@ -1005,7 +1005,7 @@ public:
if ( specialize == 0 )
{
// nearest neighbour. simply pick the coefficient
- result = coefficients [ sum ( select * coefficients.stride() ) ] ;
+ result = coefficients [ select ] ;
}
else if ( specialize == 1 )
{
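
As an aside on the nearest-neighbour change above: the code now lets the vigra view resolve the nD coordinate itself instead of computing the flat offset from the strides. A minimal standalone sketch of the two indexing styles (not vspline code; the array and coordinate are made up for illustration):

// standalone sketch: index a vigra::MultiArray with an nD coordinate directly,
// versus spelling out the flat offset by hand, as the old code did
#include <vigra/multi_array.hxx>
#include <cassert>

int main()
{
  vigra::MultiArray < 2 , float > a ( vigra::Shape2 ( 4 , 5 ) ) ;
  for ( int i = 0 ; i < 20 ; i++ )
    a [ i ] = float ( i ) ;                  // scan-order access via flat index

  vigra::Shape2 select ( 2 , 3 ) ;           // an nD coordinate into the array

  // variant 1: the view does the address arithmetic
  float direct = a [ select ] ;

  // variant 2: compute the flat offset from the strides by hand
  int offset = select[0] * a.stride()[0] + select[1] * a.stride()[1] ;
  float by_offset = a.data() [ offset ] ;

  assert ( direct == by_offset ) ;           // both pick the same coefficient
  return 0 ;
}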
diff --git a/interpolator.h b/interpolator.h
index 34ec708..b819a3e 100644
--- a/interpolator.h
+++ b/interpolator.h
@@ -186,9 +186,9 @@ struct interpolator
}
} ;
-#else
-
- enum { vsize = 1 } ;
+// #else
+//
+// enum { vsize = 1 } ;
#endif
} ;
diff --git a/multithread.h b/multithread.h
index 9717d71..4bc1915 100644
--- a/multithread.h
+++ b/multithread.h
@@ -272,27 +272,91 @@ struct shape_splitter
/// the second variant of multithread(), which is called with a range and a number
/// of desired threads instead of a ready-made partitioning.
+// template < int d >
+// partition_type < shape_range_type<d> >
+// _partition ( shape_range_type<d> range , int nparts )
+// {
+// if ( range[0].any() )
+// {
+// // the lower limit of the range is not at the origin, so get the shape
+// // of the region between range[0] and range[1], call shape_splitter with
+// // this shape, and add the offset to the lower limit of the original range
+// // to the partial ranges in the result
+// auto shape = range[1] - range[0] ;
+// auto res = shape_splitter < d > :: part ( shape , nparts ) ;
+// for ( auto & r : res )
+// {
+// r[0] += range[0] ;
+// r[1] += range[0] ;
+// }
+// return res ;
+// }
+// // if range[0] is at the origin, we don't have to use an offset
+// return shape_splitter < d > :: part ( range[1] , nparts ) ;
+// }
+
+// alternative partitioning into tiles. For the optimal situation, where
+// the view isn't rotated or pitched much, the partitioning into bunches
+// of lines (above) seems to perform slightly better, but with more difficult
+// transformations (like 90 degree rotation), performance suffers (like, -20%),
+// whereas with this tiled partitioning it is roughly the same, supposedly due
+// to identical locality in both cases. So currently I am using this partitioning.
+// TODO code is a bit clumsy...
+
template < int d >
partition_type < shape_range_type<d> >
partition ( shape_range_type<d> range , int nparts )
{
- if ( range[0].any() )
+ auto shape = range[1] - range[0] ;
+ int nelements = prod ( shape ) ;
+ int ntile = nelements / nparts ;
+ int nedge = 160 ; // heuristic, fixed size // pow ( ntile , ( 1.0 / d ) ) ;
+ auto tiled_shape = shape / nedge ;
+
+ typedef std::vector < int > stopv ;
+ stopv stops [ d ] ;
+ for ( int a = 0 ; a < d ; a++ )
+ {
+ stops[a].push_back ( 0 ) ;
+ for ( int k = 1 ; k < tiled_shape[a] ; k++ )
+ stops[a].push_back ( k * nedge ) ;
+ stops[a].push_back ( shape[a] ) ;
+ }
+
+ for ( int a = 0 ; a < d ; a++ )
+ tiled_shape[a] = stops[a].size() - 1 ;
+
+ nparts = prod ( tiled_shape ) ;
+ partition_type < shape_range_type<d> > res ( nparts ) ;
+
+ for ( int a = 0 ; a < d ; a++ )
{
- // the lower limit of the range is not at the origin, so get the shape
- // of the region between range[0] and range[1], call shape_splitter with
- // this shape, and add the offset to the lower limit of the original range
- // to the partial ranges in the result
- auto shape = range[1] - range[0] ;
- auto res = shape_splitter < d > :: part ( shape , nparts ) ;
- for ( auto & r : res )
+ int j0 = 1 ;
+ for ( int h = 0 ; h < a ; h++ )
+ j0 *= tiled_shape[h] ;
+ int i = 0 ;
+ int j = 0 ;
+ for ( int k = 0 ; k < nparts ; k++ )
{
- r[0] += range[0] ;
- r[1] += range[0] ;
+ res[k][0][a] = stops[a][i] ;
+ res[k][1][a] = stops[a][i+1] ;
+ ++j ;
+ if ( j == j0 )
+ {
+ j = 0 ;
+ ++i ;
+ if ( i >= tiled_shape[a] )
+ i = 0 ;
+ }
}
- return res ;
}
- // if range[0] is at the origin, we don't have use an offset
- return shape_splitter < d > :: part ( range[1] , nparts ) ;
+ for ( auto & e : res )
+ {
+ e[0] += range[0] ;
+ e[1] += range[0] ;
+// std::cout << "tile: " << e[0] << e[1] << std::endl ;
+ }
+ return res ;
}
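
To illustrate what the stop computation above produces per axis, here is a standalone sketch following the same scheme; the helper tile_stops is hypothetical and not part of vspline:

// per-axis tile boundaries: for an axis of extent 'extent' and a fixed tile
// edge of 'nedge', the stops run 0, nedge, 2*nedge, ..., extent, with the
// last tile absorbing the remainder
#include <iostream>
#include <vector>

std::vector < int > tile_stops ( int extent , int nedge )
{
  std::vector < int > stops ;
  stops.push_back ( 0 ) ;
  for ( int k = 1 ; k < extent / nedge ; k++ )
    stops.push_back ( k * nedge ) ;
  stops.push_back ( extent ) ;          // last tile absorbs the remainder
  return stops ;
}

int main()
{
  // an axis of 500 units with nedge == 160 yields tiles [0,160) [160,320) [320,500)
  for ( int s : tile_stops ( 500 , 160 ) )
    std::cout << s << " " ;
  std::cout << std::endl ;              // prints: 0 160 320 500
}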
/// action_wrapper wraps a functional into an outer function which
@@ -405,7 +469,7 @@ int multithread ( void (*pfunc) ( range_type , Types... ) ,
std::unique_lock<std::mutex> lk ( pool_mutex ) ;
- // the predicate done==crew rejects spurious wakes
+ // the predicate done == nparts rejects spurious wakes
pool_cv.wait ( lk , [&] { return done == nparts ; } ) ;
}
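
For context, a minimal self-contained sketch (not vspline code) of the completion-counting pattern this predicate belongs to: each worker increments a counter under the mutex and notifies, and the predicate makes spurious wakeups harmless:

#include <thread>
#include <mutex>
#include <condition_variable>
#include <vector>

int main()
{
  const int nparts = 4 ;
  int done = 0 ;
  std::mutex pool_mutex ;
  std::condition_variable pool_cv ;

  std::vector < std::thread > crew ;
  for ( int i = 0 ; i < nparts ; i++ )
    crew.emplace_back ( [&] ()
      {
        // ... do one share of the work here ...
        std::lock_guard < std::mutex > lk ( pool_mutex ) ;
        ++done ;                       // count this task as finished
        pool_cv.notify_one() ;
      } ) ;

  {
    std::unique_lock < std::mutex > lk ( pool_mutex ) ;
    // the predicate done == nparts rejects spurious wakes
    pool_cv.wait ( lk , [&] { return done == nparts ; } ) ;
  }

  for ( auto & t : crew )
    t.join() ;
}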
@@ -419,9 +483,6 @@ int multithread ( void (*pfunc) ( range_type , Types... ) ,
/// range covering the 'whole' data. partition() is called with the range,
/// resulting in a partitioning which above overload of multithread() can use.
-// TODO we still have to update all calls to multithread() to pass in the
-// number of partial tasks (so we can omit the * 8)
-
template < class range_type , class ...Types >
int multithread ( void (*pfunc) ( range_type , Types... ) ,
int nparts ,
diff --git a/remap.h b/remap.h
index 65ed1de..fdefecd 100644
--- a/remap.h
+++ b/remap.h
@@ -348,7 +348,10 @@ struct _fill
if ( output.isUnstrided() )
{
- // best case: output array has consecutive memory
+// std::cout << "case fill:1" << std::endl ;
+ // best case: output array has consecutive memory. if this test comes
+ // out true, we know that dimension 0 is also unstrided, so for warp_generator,
+ // we have strided_warp false and the efficient iteration is used
for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
{
gen ( destination ) ; // pass pointer to destination to generator
@@ -358,11 +361,16 @@ struct _fill
else if ( output.isUnstrided(0) )
{
- // at least dimension 0 is unstrided, so recursion makes sense
+// std::cout << "case fill:2" << std::endl ;
+ // at least dimension 0 is unstrided, so recursion makes sense. for a warp_generator,
+ // the recursion is even necessary, because with dimension 0 unstrided we have picked
+ // the variant with strided_warp false, so we have to recurse until this is actually
+ // the case.
for ( int c = 0 ; c < output.shape ( dim_out - 1 ) ; c++ )
{
// recursively call _fill for each slice along the highest axis
auto sub_output = output.bindOuter ( c ) ;
+// std::cout << "recursing" << std::endl ;
auto sub_gen = gen.bindOuter ( c ) ;
_fill < decltype ( sub_gen ) , dim_out - 1 >() ( sub_gen , sub_output ) ;
}
@@ -371,7 +379,9 @@ struct _fill
else
{
- // unstrided even in dimension 0, no point in recursing
+// std::cout << "case fill:3" << std::endl ;
+ // strided even in dimension 0, no point in recursing. With a warp_generator,
+ // we have strided_warp true, so we have the right generator type object.
// fall back to buffering and storing individual result values
value_type target_buffer [ vsize ] ;
for ( int a = 0 ; a < aggregates ; a++ )
@@ -391,6 +401,7 @@ struct _fill
// process leftovers, if any - if vc isn't used, this loop does all the processing
while ( leftover-- )
{
+// std::cout << "fill1:leftovers" << std::endl ;
// process leftovers with single-value evaluation
gen ( *target_it ) ;
++target_it ;
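
For context, a minimal sketch of the aggregates/leftover split used throughout _fill; the process function and its callables are hypothetical stand-ins for the generator's vectorized and single-value calls, not vspline code:

#include <cstddef>
#include <functional>
#include <iostream>

// handle as many full vectors of vsize values as fit, then mop up the rest
// one value at a time
void process ( float * data ,
               std::size_t n ,
               std::size_t vsize ,
               std::function < void ( float * ) > v_eval ,
               std::function < void ( float & ) > eval )
{
  std::size_t aggregates = n / vsize ;       // full vectors of vsize values
  std::size_t leftover   = n % vsize ;       // values which don't fill a vector

  for ( std::size_t a = 0 ; a < aggregates ; a++ , data += vsize )
    v_eval ( data ) ;                        // store vsize results at once

  while ( leftover-- )
  {
    eval ( *data ) ;                         // single-value evaluation
    ++data ;
  }
}

int main()
{
  float buf [ 10 ] ;
  process ( buf , 10 , 4 ,
            [] ( float * p ) { for ( int e = 0 ; e < 4 ; e++ ) p[e] = 1.0f ; } ,
            [] ( float & v ) { v = 2.0f ; } ) ;
  // buf now holds 1 1 1 1 1 1 1 1 2 2 : two full vectors, two leftovers
  for ( float v : buf )
    std::cout << v << " " ;
  std::cout << std::endl ;
}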
@@ -428,6 +439,7 @@ struct _fill < generator_type , 1 >
if ( output.isUnstrided() )
{
+// std::cout << "case fill1:1" << std::endl ;
// best case: output array has consecutive memory
for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
{
@@ -438,7 +450,11 @@ struct _fill < generator_type , 1 >
else
{
- // fall back to buffering and storing individual result values
+// std::cout << "case fill1:2" << std::endl ;
+    // fall back to buffering and storing individual result values. calls coming
+    // from nD via hierarchical descent will never land here, because the case where
+    // dimension 0 is strided is dealt with straight away, but the 1D strided
+    // case needs this bit:
value_type target_buffer [ vsize ] ;
for ( int a = 0 ; a < aggregates ; a++ )
{
@@ -457,6 +473,7 @@ struct _fill < generator_type , 1 >
// process leftovers, if any - if vc isn't used, this loop does all the processing
while ( leftover-- )
{
+// std::cout << "fill1:leftovers" << std::endl ;
// process leftovers with single-value evaluation
gen ( *target_it ) ;
++target_it ;
@@ -510,9 +527,9 @@ void fill ( generator_type & gen ,
// heuristic. desired number of partitions. When working on images,
// we try setting up the jobs to do 4 lines each:
- int njobs = output.shape ( dim_target - 1 ) / 4 ;
- if ( njobs < 16 ) // small thing, try at least 16
- njobs = 16 ;
+ int njobs = 64 ; // = output.shape ( dim_target - 1 ) / 4 ;
+// if ( njobs < 16 ) // small thing, try at least 16
+// njobs = 16 ;
// call multithread(), specifying the single-threaded fill routine as the functor
// to invoke the threads, passing in 'range', which will be partitioned by multithread(),
@@ -535,7 +552,7 @@ void fill ( generator_type & gen ,
template < int dimension ,
typename interpolator_type ,
- bool strided_warp = true >
+ bool strided_warp >
struct warp_generator
{
typedef typename interpolator_type::nd_rc_type nd_rc_type ;
@@ -545,7 +562,6 @@ struct warp_generator
const warp_array_type warp ; // must not use reference here!
typename warp_array_type::const_iterator witer ;
- const nd_rc_type * warp_data ;
const interpolator_type & itp ;
@@ -554,19 +570,13 @@ struct warp_generator
const interpolator_type & _itp )
: warp ( _warp ) ,
itp ( _itp ) ,
- warp_data ( warp.data() ) ,
witer ( _warp.begin() )
{ } ;
void operator() ( value_type & target )
{
- if ( strided_warp )
- itp.eval ( *witer , target ) ;
- else
- {
- itp.eval ( *warp_data , target ) ;
- ++warp_data ;
- }
+ itp.eval ( *witer , target ) ;
+ ++witer ;
}
#ifdef USE_VC
@@ -574,13 +584,10 @@ struct warp_generator
typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
enum { vsize = vector_traits < ele_type > :: vsize } ;
-#else
+ // operator() incorporates two variants, which depend on a template argument,
+ // so the conditional has no run-time effect. We use witer to stay consistent
+ // with the unvectorized version, which may be used to mop up leftovers.
- enum { vsize = 1 } ;
-
-#endif
-
-
void operator() ( value_type * target )
{
if ( strided_warp )
@@ -592,11 +599,13 @@ struct warp_generator
}
else
{
- itp.v_eval ( warp_data , target ) ;
- warp_data += vsize ;
+ itp.v_eval ( &(*witer) , target ) ;
+ witer += vsize ;
}
}
+#endif
+
warp_generator < dimension , interpolator_type , strided_warp >
subrange ( const shape_range_type < dimension > & range ) const
{
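
For context, a minimal standalone sketch (not vspline code) of dispatching on a bool template argument as warp_generator does with strided_warp; since the flag is a compile-time constant, the plain if costs nothing at run time and the dead branch can be discarded:

#include <iostream>
#include <cstddef>

// 'strided' mirrors the role of strided_warp
template < bool strided >
struct reader
{
  void operator() ( std::ptrdiff_t stride ) const
  {
    if ( strided )
      std::cout << "per-element gather, stride " << stride << std::endl ;
    else
      std::cout << "contiguous vector load" << std::endl ;
  }
} ;

int main()
{
  reader < true > () ( 3 ) ;    // prints: per-element gather, stride 3
  reader < false > () ( 1 ) ;   // prints: contiguous vector load
}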
@@ -628,14 +637,22 @@ void remap ( interpolator_type & ev ,
bool use_vc = true
)
{
- if ( warp.isUnstrided() )
+ // we test if the warp array is unstrided in dimension 0. If that is so, even
+ // if it is strided in higher dimensions, via the hierarchical access we will
+ // eventually arrive in dimension 0 and iterate over an unstrided array.
+ // This only matters if Vc is used, because if the warp array is unstrided,
+ // the coordinates can be loaded more effectively.
+
+ if ( warp.isUnstrided ( 0 ) )
{
+ // set strided_warp to false vvv
typedef warp_generator < dim_target , interpolator_type , false > gen_t ;
gen_t gen ( warp , ev ) ;
fill < gen_t , dim_target > ( gen , output , use_vc ) ;
}
else
{
+ // warp array is strided even in dimension 0
typedef warp_generator < dim_target , interpolator_type , true > gen_t ;
gen_t gen ( warp , ev ) ;
fill < gen_t , dim_target > ( gen , output , use_vc ) ;
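
For context, a small standalone sketch (not vspline code) of what the isUnstrided test distinguishes on a vigra view: a view can be strided as a whole yet still unstrided in dimension 0, and only a view strided in dimension 0 itself takes the fallback path above:

#include <vigra/multi_array.hxx>
#include <iostream>

int main()
{
  vigra::MultiArray < 2 , float > a ( vigra::Shape2 ( 6 , 4 ) ) ;

  // the full array occupies consecutive memory
  std::cout << a.isUnstrided() << std::endl ;               // 1

  // a subarray covering part of dimension 0 is strided as a whole, but still
  // unstrided in dimension 0, so the efficient warp_generator variant applies
  auto sub = a.subarray ( vigra::Shape2 ( 0 , 0 ) , vigra::Shape2 ( 3 , 4 ) ) ;
  std::cout << sub.isUnstrided() << " "
            << sub.isUnstrided ( 0 ) << std::endl ;         // 0 1

  // a transposed view is strided even in dimension 0
  std::cout << a.transpose().isUnstrided ( 0 ) << std::endl ; // 0
}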
@@ -728,43 +745,44 @@ struct index_generator
typedef typename interpolator_type::nd_rc_type nd_rc_type ;
const interpolator_type & itp ;
- const shape_range_type < dim_target > & range ;
-
+ const shape_range_type < dim_target > range ;
vigra::MultiCoordinateIterator < dim_source > mci ;
index_generator
( const interpolator_type & _itp ,
- const shape_range_type < dim_target > & _range
+ const shape_range_type < dim_target > _range
)
: itp ( _itp ) ,
range ( _range ) ,
mci ( _range[1] - _range[0] )
-#ifdef USE_VC
- , civ ( _range[0] , _range[1] )
-#endif
- {
- } ;
+ { } ;
void operator() ( value_type & target )
{
- itp.eval ( * ( mci + range[0] ) , target ) ;
+ itp.eval ( *mci + range[0] , target ) ;
++mci ;
}
#ifdef USE_VC
- coordinate_iterator_v < rc_type , ic_type , dim_source , vsize > civ ;
-
void operator() ( value_type * target )
{
- itp.v_eval ( *civ , target ) ;
- ++civ ;
+ typename interpolator_type::nd_ic_v index ;
+ for ( int e = 0 ; e < vsize ; e++ )
+ {
+ for ( int d = 0 ; d < dim_source ; d++ )
+ {
+ index[d][e] = (*mci)[d] + range[0][d] ;
+ }
+ ++mci ;
+ }
+ itp.v_eval ( index , target ) ;
}
#endif
index_generator < dim_target , interpolator_type >
- subrange ( const shape_range_type < dim_target > & range ) const
+ subrange ( const shape_range_type < dim_target > range ) const
{
return index_generator < dim_target , interpolator_type >
( itp , range ) ;
@@ -775,14 +793,97 @@ struct index_generator
{
nd_ic_type slice_start = range[0] , slice_end = range[1] ;
- slice_start [ dim_target - 1 ] = c ;
- slice_end [ dim_target - 1 ] = c ;
+ slice_start [ dim_target - 1 ] += c ;
+ slice_end [ dim_target - 1 ] = slice_start [ dim_target - 1 ] + 1 ;
return index_generator < dim_target , interpolator_type >
( itp , shape_range_type < dim_target > ( slice_start , slice_end ) ) ;
}
} ;
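
For context, a minimal standalone sketch (not vspline code; plain arrays stand in for the Vc types) of the gather the vectorized operator() above performs: scalar nD coordinates are copied lane by lane into structure-of-arrays form so a SIMD evaluator can consume each component row whole:

#include <iostream>

const int dim = 2 ;      // dimension of the coordinates
const int vsize = 4 ;    // lanes per SIMD vector

int main()
{
  // stand-in for MultiCoordinateIterator: coordinates of a 4x4 grid in scan order
  int grid [ 16 ] [ dim ] ;
  for ( int y = 0 , k = 0 ; y < 4 ; y++ )
    for ( int x = 0 ; x < 4 ; x++ , k++ )
    {
      grid[k][0] = x ;
      grid[k][1] = y ;
    }

  // gather the first vsize coordinates into structure-of-arrays form:
  // index[d][e] holds component d of the coordinate in lane e
  int index [ dim ] [ vsize ] ;
  for ( int e = 0 ; e < vsize ; e++ )
    for ( int d = 0 ; d < dim ; d++ )
      index[d][e] = grid[e][d] ;

  // index[0] now holds the x components of four successive grid points,
  // index[1] the y components
  for ( int d = 0 ; d < dim ; d++ )
  {
    for ( int e = 0 ; e < vsize ; e++ )
      std::cout << index[d][e] << " " ;
    std::cout << std::endl ;
  }
  // prints: 0 1 2 3
  //         0 0 0 0
}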
+// template < int dim_target , typename interpolator_type >
+// struct index_generator
+// {
+// typedef typename interpolator_type::value_type value_type ;
+//
+// enum { dim_source = interpolator_type::dimension } ;
+//
+// #ifdef USE_VC
+//
+// enum { vsize = interpolator_type :: vsize } ;
+//
+// #else
+//
+// enum { vsize = 1 } ;
+//
+// #endif
+//
+// typedef typename interpolator_type::ic_type ic_type ;
+// typedef typename interpolator_type::rc_type rc_type ;
+// typedef typename interpolator_type::nd_ic_type nd_ic_type ;
+// typedef typename interpolator_type::nd_rc_type nd_rc_type ;
+//
+// const interpolator_type & itp ;
+// const shape_range_type < dim_target > range ;
+//
+// vigra::MultiCoordinateIterator < dim_source > mci ;
+//
+// #ifdef USE_VC
+//
+// coordinate_iterator_v < rc_type , ic_type , dim_source , vsize > civ ;
+//
+// #endif
+//
+// index_generator
+// ( const interpolator_type & _itp ,
+// const shape_range_type < dim_target > _range
+// )
+// : itp ( _itp ) ,
+// range ( _range ) ,
+// mci ( _range[1] - _range[0] )
+// #ifdef USE_VC
+// , civ ( _range[0] , _range[1] )
+// #endif
+// {
+// } ;
+//
+// void operator() ( value_type & target )
+// {
+// itp.eval ( * ( mci + range[0] ) , target ) ;
+// ++mci ;
+// }
+//
+// #ifdef USE_VC
+//
+// void operator() ( value_type * target )
+// {
+// itp.v_eval ( *civ , target ) ;
+// ++civ ;
+// mci += vsize ; // this sucks...
+// }
+//
+// #endif
+//
+// index_generator < dim_target , interpolator_type >
+// subrange ( const shape_range_type < dim_target > range ) const
+// {
+// return index_generator < dim_target , interpolator_type >
+// ( itp , range ) ;
+// }
+//
+// index_generator < dim_target , interpolator_type >
+// bindOuter ( const int & c ) const
+// {
+// nd_ic_type slice_start = range[0] , slice_end = range[1] ;
+//
+// slice_start [ dim_target - 1 ] += c ;
+// slice_end [ dim_target - 1 ] = slice_start [ dim_target - 1 ] + 1 ;
+//
+// return index_generator < dim_target , interpolator_type >
+// ( itp , shape_range_type < dim_target > ( slice_start , slice_end ) ) ;
+// }
+// } ;
+
template < class interpolator_type , // type satisfying the interface in class interpolator
int dim_target > // dimension of target array
void index_remap ( const interpolator_type & ev ,
--