[vspline] 41/72: some changes to the remapping code
Kay F. Jahnke
kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:41 UTC 2017
This is an automated email from the git hooks/post-receive script.
kfj-guest pushed a commit to branch master
in repository vspline.
commit 7f335e98beb15c92d7c22e5ff26dcb0ee6c7a809
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date: Mon Apr 3 11:05:47 2017 +0200
some changes to the remapping code
I had a bug (which did not manifest) in the handling of the recursive descent
of warp_generator; it is fixed now, together with a bit of tweaking for speed.
I now always recurse all the way to 1D, which resolves some type issues.
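
For illustration, here is a minimal, self-contained sketch of the recursive descent
the message describes: slice along the outermost dimension until the data is 1D, and
only do the actual work in the 1D specialization. This is not vspline's code; fill_nd,
the nested std::vector target and the plain callable generator are stand-ins for
vspline's _fill, vigra::MultiArrayView and generator objects.

#include <vector>
#include <iostream>

// recurse over the outermost dimension until we reach 1D
template < int dim >
struct fill_nd
{
  template < typename array_type , typename generator_type >
  void operator() ( array_type & output , generator_type & gen )
  {
    // slice along the highest axis and delegate each slice one level down
    for ( auto & slice : output )
      fill_nd < dim - 1 >() ( slice , gen ) ;
  }
} ;

// specialization for 1D: this is where values are actually produced
template <>
struct fill_nd < 1 >
{
  template < typename array_type , typename generator_type >
  void operator() ( array_type & output , generator_type & gen )
  {
    for ( auto & value : output )
      value = gen() ;
  }
} ;

int main()
{
  // a 2D 'target': 3 rows of 4 values, filled by descending to 1D rows
  std::vector < std::vector < float > > target
    ( 3 , std::vector < float > ( 4 ) ) ;
  float x = 0.0f ;
  auto gen = [ &x ] () { return x += 1.0f ; } ;
  fill_nd < 2 >() ( target , gen ) ;
  for ( auto & row : target )
  {
    for ( auto v : row )
      std::cout << v << " " ;
    std::cout << std::endl ;
  }
}
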
---
remap.h | 382 +++++++++++++++++++++++++++++++++++++++-----------------
unary_functor.h | 6 +-
2 files changed, 268 insertions(+), 120 deletions(-)
diff --git a/remap.h b/remap.h
index 3045eb6..8b6e7de 100644
--- a/remap.h
+++ b/remap.h
@@ -322,15 +322,13 @@ public:
/// of the 'whole' output
/// _fill is used by st_fill. It's an object implementing an hierarchical fill
-/// of the target array if this makes sense: If the whole target array is unstrided,
-/// the best-case loop is executed straight away. Otherwise, we check if the target
-/// array is unstrided in dimension 0, in which case we iterate over subdimensional
-/// slices. Only if even dimension 0 is unstrided, we don't recurse and use the less
-/// efficient buffering loop, but without the recursion overhead. All in all this
-/// seems like a good compromise.
+/// of the target array. This is done recursively: while the generator's and the
+/// target's dimension is greater than 1, they are sliced and the slices are processed
+/// with the routine one level down.
template < typename generator_type , // functor object yielding values
int dim_out > // number of dimensions of output array
+// int floor_dim = 1 > // intended lowest dimensionality
struct _fill
{
void operator() ( generator_type & gen ,
@@ -339,77 +337,87 @@ struct _fill
bool use_vc = true
)
{
- typedef typename generator_type::value_type value_type ;
-
- auto target_it = output.begin() ;
- int leftover = output.elementCount() ;
-
- #ifdef USE_VC
+#ifdef USE_VC
- const int vsize = generator_type::vsize ;
+// const int vsize = generator_type::vsize ;
if ( use_vc )
{
- int aggregates = leftover / vsize ; // number of full vectors
- leftover -= aggregates * vsize ; // remaining leftover single values
- value_type * destination = output.data() ; // pointer to target's memory
-
- if ( output.isUnstrided() )
- {
-// std::cout << "case fill:1" << std::endl ;
- // best case: output array has consecutive memory. if this test comes
- // out true, we know that dimension 0 is also unstrided, so for warp_generator,
- // we have strided_warp false and the efficient iteration is used
- for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
- {
- gen ( destination ) ; // pass pointer to destination to generator
- }
- target_it += aggregates * vsize ; // advance target_it to remaining single values
- }
-
- else if ( output.isUnstrided(0) )
- {
-// std::cout << "case fill:2" << std::endl ;
- // at least dimension 0 is unstrided, so recursion makes sense. for a warp_generator,
- // the recursion is even necessary, because with dimension 0 unstrided we have picked
- // the variant with strided_warp false, so we have to recurse until this is actually
- // the case.
+// if ( dim_out > floor_dim )
+// {
+ // if we're not yet at the intended lowest level of recursion,
+ // we slice output and generator and feed the slices to the
+ // next lower recursion level
for ( int c = 0 ; c < output.shape ( dim_out - 1 ) ; c++ )
{
// recursively call _fill for each slice along the highest axis
auto sub_output = output.bindOuter ( c ) ;
-// std::cout << "recursing" << std::endl ;
auto sub_gen = gen.bindOuter ( c ) ;
- _fill < decltype ( sub_gen ) , dim_out - 1 >() ( sub_gen , sub_output ) ;
+ _fill < decltype ( sub_gen ) , dim_out - 1 >()
+ ( sub_gen , sub_output ) ;
}
- leftover = 0 ; // no leftovers in this case
- }
-
- else
- {
-// std::cout << "case fill:3" << std::endl ;
- // strided even in dimension 0, no point in recursing. With a warp_generator,
- // we have strided_warp true, so we have the right generator type object.
- // fall back to buffering and storing individual result values
- value_type target_buffer [ vsize ] ;
- for ( int a = 0 ; a < aggregates ; a++ )
- {
- gen ( target_buffer ) ; // pass pointer to buffer to generator
- for ( int e = 0 ; e < vsize ; e++ )
- {
- *target_it = target_buffer[e] ; // and deposit in target
- ++target_it ;
- }
- }
- }
+ // after processing all slices, we're done and can return straight away
+ return ;
+// }
}
+
+// // we are at the intended lowest level of recursion already, so we
+// // do the work here and now. This is the same code as the specialization
+// // for dim_out==1 uses, so this code could be factored out. But we
+// // want the option to do the processing without recursing all the way
+// // down if that isn't necessary. The calling code can figure this out
+// // and set floor_dim accordingly.
+//
+// int aggregates = leftover / vsize ; // number of full vectors
+// leftover -= aggregates * vsize ; // remaining leftover single values
+//
+// if ( output.isUnstrided() )
+// {
+// // best case: output array has consecutive memory
+// // get a pointer to target memory
+// value_type * destination = output.data() ;
+//
+// for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
+// {
+// // pass pointer to target memory to the generator
+// gen ( destination ) ;
+// }
+// // if there aren't any leftovers, we can return straight away.
+// if ( ! leftover )
+// return ;
+//
+// // otherwise, advance target_it to remaining single values
+// target_it += aggregates * vsize ;
+// }
+// else
+// {
+// // fall back to buffering and storing individual result values.
+// // have a buffer ready
+// value_type target_buffer [ vsize ] ;
+//
+// for ( int a = 0 ; a < aggregates ; a++ )
+// {
+// gen ( target_buffer ) ; // pass buffer to the generator
+// for ( int e = 0 ; e < vsize ; e++ )
+// {
+// *target_it = target_buffer[e] ; // and deposit in target
+// ++target_it ;
+// }
+// }
+// }
+// }
#endif // USE_VC
- // process leftovers, if any - if vc isn't used, this loop does all the processing
+ typedef typename generator_type::value_type value_type ;
+
+ auto target_it = output.begin() ;
+ int leftover = output.elementCount() ;
+
+ // process leftovers. If vc isn't used, this loop does all the processing
+
while ( leftover-- )
{
-// std::cout << "fill1:leftovers" << std::endl ;
// process leftovers with single-value evaluation
gen ( *target_it ) ;
++target_it ;
@@ -417,10 +425,6 @@ struct _fill
}
} ;
-/// partial specialization od struct _fill for dimension 1. Here we only have
-/// to distinguish two cases: either the array is unstrided, so we can use the efficient
-/// loop, or it is not, in which case we resort to buffering.
-
template < typename generator_type > // functor object yielding values
struct _fill < generator_type , 1 >
{
@@ -437,33 +441,51 @@ struct _fill < generator_type , 1 >
#ifdef USE_VC
- const int vsize = generator_type::vsize ;
-
if ( use_vc )
{
+ const int vsize = generator_type::vsize ;
int aggregates = leftover / vsize ; // number of full vectors
leftover -= aggregates * vsize ; // remaining leftover single values
- value_type * destination = output.data() ; // pointer to target's memory
- value_type target_buffer [ vsize ] ;
if ( output.isUnstrided() )
{
// best case: output array has consecutive memory
+ // get a pointer to target memory
+ value_type * destination = output.data() ;
+
for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
{
- gen ( destination ) ; // pass pointer to destination to generator
+ // pass pointer to target memory to the generator
+ gen ( destination ) ;
}
- target_it += aggregates * vsize ; // advance target_it to remaining single values
+ // if there aren't any leftovers, we can return straight away.
+ if ( ! leftover )
+ return ;
+
+ // otherwise, advance target_it to remaining single values
+ target_it += aggregates * vsize ;
}
else
{
- // fall back to buffering and storing individual result values. coming from
- // nD via hierarchical descent will never land here, because the case that
- // dimension 0 is strided is dealt with straight away, but the 1D strided
- // case needs this bit:
+ // TODO: we're level 0 now, so we can scatter
+// generator_type::nd_rc_v target_buffer ;
+// for ( int a = 0 ; a < aggregates ; a++ )
+// {
+// gen ( target_buffer ) ; // pass buffer to the generator
+//
+// for ( int d = 0 ; d < value_type::size() ; d++ )
+// {
+// target_buffer[d].scatter ( destination , rc_v::IndexesFromZero * d + d ) ;
+// }
+
+ // fall back to buffering and storing individual result values.
+ // have a buffer ready
+
+ value_type target_buffer [ vsize ] ;
for ( int a = 0 ; a < aggregates ; a++ )
{
- gen ( target_buffer ) ; // pass pointer to buffer to generator
+ gen ( target_buffer ) ; // pass buffer to the generator
+
for ( int e = 0 ; e < vsize ; e++ )
{
*target_it = target_buffer[e] ; // and deposit in target
@@ -471,25 +493,11 @@ struct _fill < generator_type , 1 >
}
}
}
- // deal with leftovers by generating a full simdized datum, then copy
- // only leftover values to target. Comment out this bit to fall back to
- // single-value processing of leftovers. Potentially unsafe, may read-access
- // out-of-bounds memory (not write-access, though). Currently unused.
-// if ( leftover )
-// {
-// gen ( target_buffer ) ;
-// for ( int e = 0 ; e < leftover ; e++ )
-// {
-// *target_it = target_buffer[e] ; // and deposit in target
-// ++target_it ;
-// }
-// leftover = 0 ;
-// }
}
#endif // USE_VC
- // process leftovers, if any - if vc isn't used, this loop does all the processing
+ // process leftovers. If vc isn't used, this loop does all the processing
while ( leftover-- )
{
// process leftovers with single-value evaluation
@@ -499,6 +507,8 @@ struct _fill < generator_type , 1 >
}
} ;
+
+
/// single-threaded fill. This routine receives the range to process and the generator
/// object capable of providing result values. The generator object is set up to provide
/// values for the desired subrange and then passed to _fill, which handles the calls to
@@ -515,7 +525,12 @@ void st_fill ( shape_range_type < dim_out > range ,
auto output = p_output->subarray ( range[0] , range[1] ) ;
- // limit the generator to cover the same range
+ // get a new generator to cover the same range. We need an instance here:
+ // the generator carries state, and we're in a single thread, processing one
+ // chunk of the partitioning, so the generator we have here won't be shared
+ // with other threads (which would be wrong, since it carries state).
+ // But it may be subdivided into yet more generators if fill decides to
+ // slice it and process the slices.
auto gen = p_gen->subrange ( range ) ;
@@ -564,10 +579,84 @@ void fill ( generator_type & gen ,
use_vc ) ; // flag to switch use of Vc on/off
} ;
-/// struct warp_generator combines a warp array and an interpolator into a functor
-/// producing values to store in the output. This functor handles the whole chain of
-/// operations from picking the coordinates from the warp array to delivering target
-/// values. It can be fed to fill() to produce a remap function, see below.
+// /// struct warp_generator combines a warp array and an interpolator into a functor
+// /// producing values to store in the output. This functor handles the whole chain of
+// /// operations from picking the coordinates from the warp array to delivering target
+// /// values. It can be fed to fill() to produce a remap function, see below.
+//
+// template < int dimension ,
+// typename unary_functor_type ,
+// bool strided_warp >
+// struct _warp_generator
+// {
+// typedef typename unary_functor_type::in_type nd_rc_type ;
+// typedef typename unary_functor_type::out_type value_type ;
+//
+// typedef MultiArrayView < dimension , nd_rc_type > warp_array_type ;
+//
+// const warp_array_type warp ; // must not use reference here!
+// typename warp_array_type::const_iterator witer ;
+//
+// const unary_functor_type & itp ;
+//
+// _warp_generator
+// ( const warp_array_type & _warp ,
+// const unary_functor_type & _itp )
+// : warp ( _warp ) ,
+// itp ( _itp ) ,
+// witer ( _warp.begin() )
+// { } ;
+//
+// void operator() ( value_type & target )
+// {
+// itp.eval ( *witer , target ) ;
+// ++witer ;
+// }
+//
+// #ifdef USE_VC
+//
+// enum { vsize = unary_functor_type :: vsize } ;
+//
+// // operator() incorporates two variants, which depend on a template argument,
+// // so the conditional has no run-time effect. We use witer to stay consistent
+// // with the unvectorized version, which may be used to mop up leftovers.
+//
+// void operator() ( value_type * target )
+// {
+// if ( strided_warp )
+// {
+// nd_rc_type buffer [ vsize ] ;
+// for ( int e = 0 ; e < vsize ; e++ , witer++ )
+// buffer[e] = *witer ;
+// itp.eval ( buffer , target ) ;
+// }
+// else
+// {
+// itp.eval ( &(*witer) , target ) ;
+// witer += vsize ;
+// }
+// }
+//
+// #endif
+//
+// _warp_generator < dimension , unary_functor_type , strided_warp >
+// subrange ( const shape_range_type < dimension > & range ) const
+// {
+// return _warp_generator < dimension , unary_functor_type , strided_warp >
+// ( warp.subarray ( range[0] , range[1] ) , itp ) ;
+// }
+//
+// _warp_generator < dimension - 1 , unary_functor_type , strided_warp >
+// bindOuter ( const int & c ) const
+// {
+// return _warp_generator < dimension - 1 , unary_functor_type , strided_warp >
+// ( warp.bindOuter ( c ) , itp ) ;
+// }
+// } ;
+
+/// warp_generator for dimensions > 1. Here we provide 'subrange' and
+/// 'bindOuter' to be used for the hierarchical descent in _fill. The current
+/// implementation relies on the hierarchical descent going all the way to 1D!
template < int dimension ,
typename unary_functor_type ,
@@ -580,18 +669,87 @@ struct warp_generator
typedef MultiArrayView < dimension , nd_rc_type > warp_array_type ;
const warp_array_type warp ; // must not use reference here!
+ const nd_rc_type * data ;
typename warp_array_type::const_iterator witer ;
const unary_functor_type & itp ;
-
+
warp_generator
( const warp_array_type & _warp ,
const unary_functor_type & _itp )
: warp ( _warp ) ,
itp ( _itp ) ,
- witer ( _warp.begin() )
+ witer ( _warp.begin() ) ,
+ data ( _warp.data() )
{ } ;
+ // with Vc present, we only need operator() here for the code to compile;
+ // it's not used. But the unvectorized version may use it.
+
+ void operator() ( value_type & target )
+ {
+ itp.eval ( *witer , target ) ;
+ ++witer ;
+ }
+
+ warp_generator < dimension , unary_functor_type , strided_warp >
+ subrange ( const shape_range_type < dimension > & range ) const
+ {
+ return warp_generator < dimension , unary_functor_type , strided_warp >
+ ( warp.subarray ( range[0] , range[1] ) , itp ) ;
+ }
+
+ warp_generator < dimension - 1 , unary_functor_type , strided_warp >
+ bindOuter ( const int & c ) const
+ {
+ return warp_generator < dimension - 1 , unary_functor_type , strided_warp >
+ ( warp.bindOuter ( c ) , itp ) ;
+ }
+} ;
+
+/// specialization of warp_generator for dimension 1. Here we have the
+/// (vectorized) evaluation code, but no subrange and bindOuter methods.
+
+template < typename unary_functor_type ,
+ bool strided_warp >
+struct warp_generator < 1 , unary_functor_type , strided_warp >
+{
+ typedef typename unary_functor_type::in_type nd_rc_type ;
+ typedef typename unary_functor_type::out_type value_type ;
+
+ typedef MultiArrayView < 1 , nd_rc_type > warp_array_type ;
+
+ const warp_array_type warp ; // must not use reference here!
+ const nd_rc_type * data ;
+ typename warp_array_type::const_iterator witer ;
+
+ const unary_functor_type & itp ;
+
+ warp_generator
+ ( const warp_array_type & _warp ,
+ const unary_functor_type & _itp )
+ : warp ( _warp ) ,
+ itp ( _itp ) ,
+ witer ( _warp.begin() ) ,
+ data ( _warp.data() )
+ {
+#ifdef USE_VC
+ if ( ! strided_warp )
+ {
+ // if the warp array is unstrided, 'witer' will only be used
+ // by the 'mop-up' action processing the leftover single values,
+ // so we advance witer to this location.
+
+ int aggregates = warp.size() / vsize ;
+ witer += aggregates * vsize ;
+ }
+#endif
+ } ;
+
+ // in the context of this class, single value evaluation is only ever
+ // used for mop-up action after all full vectors have been processed.
+ // If Vc isn't used, this routine does all the work.
+
void operator() ( value_type & target )
{
itp.eval ( *witer , target ) ;
@@ -603,13 +761,13 @@ struct warp_generator
enum { vsize = unary_functor_type :: vsize } ;
// operator() incorporates two variants, which depend on a template argument,
- // so the conditional has no run-time effect. We use witer to stay consistent
- // with the unvectorized version, which may be used to mop up leftovers.
+ // so the conditional has no run-time effect.
void operator() ( value_type * target )
{
if ( strided_warp )
{
+ // if the warp array is strided, we collect the values one by one
nd_rc_type buffer [ vsize ] ;
for ( int e = 0 ; e < vsize ; e++ , witer++ )
buffer[e] = *witer ;
@@ -617,26 +775,15 @@ struct warp_generator
}
else
{
- itp.eval ( &(*witer) , target ) ;
- witer += vsize ;
+ // otherwise we can collect them with an efficient load operation,
+ // which is effected by passing a pointer (data) to the evaluation
+ // routine
+ itp.eval ( data , target ) ;
+ data += vsize ;
}
}
#endif
-
- warp_generator < dimension , unary_functor_type , strided_warp >
- subrange ( const shape_range_type < dimension > & range ) const
- {
- return warp_generator < dimension , unary_functor_type , strided_warp >
- ( warp.subarray ( range[0] , range[1] ) , itp ) ;
- }
-
- warp_generator < dimension - 1 , unary_functor_type , strided_warp >
- bindOuter ( const int & c ) const
- {
- return warp_generator < dimension - 1 , unary_functor_type , strided_warp >
- ( warp.bindOuter ( c ) , itp ) ;
- }
} ;
/// implementation of remap() by delegation to the more general fill() routine, passing
@@ -693,7 +840,8 @@ void remap ( const unary_functor_type & ev ,
// if it is strided in higher dimensions, via the hierarchical access we will
// eventually arrive in dimension 0 and iterate over an unstrided array.
// This only matters if Vc is used, because if the warp array is unstrided,
- // the coordinates can be loaded more effectively.
+ // the coordinates can be loaded more effectively. Note that this method
+ // requires that the hierarchical access goes down all the way to 1D.
if ( warp.isUnstrided ( 0 ) )
{
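
As an aside, the core of the new 1D warp_generator logic is the distinction between a
strided and an unstrided warp array: with a stride, the coordinates have to be gathered
one by one into a small buffer before vectorized evaluation, whereas contiguous memory
can simply be handed to the evaluation routine as a pointer, allowing an efficient load.
The following stand-alone sketch shows only that distinction; simd_eval, eval_strided
and eval_unstrided are illustrative names, and vspline's actual code delegates the
vector load to the unary_functor's eval.

#include <cstddef>
#include <vector>
#include <iostream>

const std::size_t vsize = 4 ; // stand-in for the simd vector width

// stand-in for a vectorized evaluation of vsize consecutive coordinates
void simd_eval ( const float * coordinates , float * result )
{
  for ( std::size_t e = 0 ; e < vsize ; e++ )
    result[e] = coordinates[e] * 2.0f ; // dummy 'interpolation'
}

// strided warp: collect the coordinates one by one into a contiguous buffer
void eval_strided ( const float * warp , std::ptrdiff_t stride , float * result )
{
  float buffer [ vsize ] ;
  for ( std::size_t e = 0 ; e < vsize ; e++ )
    buffer[e] = warp [ e * stride ] ;
  simd_eval ( buffer , result ) ;
}

// unstrided warp: the coordinates are already contiguous, so the pointer can
// be passed straight through and the evaluation can use an efficient load
void eval_unstrided ( const float * warp , float * result )
{
  simd_eval ( warp , result ) ;
}

int main()
{
  std::vector < float > warp { 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 } ;
  float result [ vsize ] ;

  eval_strided ( warp.data() , 2 , result ) ;   // uses coordinates 1 3 5 7
  for ( std::size_t e = 0 ; e < vsize ; e++ )
    std::cout << result[e] << " " ;
  std::cout << std::endl ;

  eval_unstrided ( warp.data() , result ) ;     // uses coordinates 1 2 3 4
  for ( std::size_t e = 0 ; e < vsize ; e++ )
    std::cout << result[e] << " " ;
  std::cout << std::endl ;
}
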
diff --git a/unary_functor.h b/unary_functor.h
index b2f4a89..6f6ccde 100644
--- a/unary_functor.h
+++ b/unary_functor.h
@@ -180,8 +180,8 @@ struct unary_functor
/// it's slow compared to proper vector code.
/// But for 'quick shots' one can use this idiom in a derived class:
///
- /// virtual void eval ( const in_v & c ,
- /// out_v & result ) const
+ /// void eval ( const in_v & c ,
+ /// out_v & result ) const
/// {
/// broadcast_eval ( c , result ) ;
/// }
@@ -198,7 +198,7 @@ struct unary_functor
{
for ( int d = 0 ; d < dim_in ; d++ )
pc[d] = c[d][e] ;
- eval ( c , v ) ;
+ eval ( cc , v ) ;
for ( int d = 0 ; d < dim_out ; d++ )
result[d][e] = pv[d] ;
}
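
The unary_functor.h hunk above fixes the 'broadcasting' fallback: the vectorized input
has to be taken apart lane by lane, the scalar eval applied to each scalar datum (cc,
not the vectorized c), and the scalar results written back into the vectorized output.
A minimal stand-alone sketch of that idiom follows; the types and names (simd_t,
scalar_eval, broadcast_eval) are illustrative assumptions, not vspline's actual
signatures.

#include <array>
#include <iostream>

const int vsize = 4 ;    // number of simd lanes
const int dim_in = 2 ;   // input dimension
const int dim_out = 1 ;  // output dimension

// one 'simdized' datum: dim components, each holding vsize lanes
template < int dim >
using simd_t = std::array < std::array < float , vsize > , dim > ;

// the scalar operation: here, simply the sum of the input components
void scalar_eval ( const std::array < float , dim_in > & c ,
                   std::array < float , dim_out > & result )
{
  result[0] = c[0] + c[1] ;
}

// broadcast: apply the scalar operation to every lane of the simdized data
void broadcast_eval ( const simd_t < dim_in > & c ,
                      simd_t < dim_out > & result )
{
  std::array < float , dim_in > cc ;
  std::array < float , dim_out > v ;
  for ( int e = 0 ; e < vsize ; e++ )
  {
    for ( int d = 0 ; d < dim_in ; d++ )
      cc[d] = c[d][e] ;        // de-interleave lane e into a scalar datum
    scalar_eval ( cc , v ) ;   // evaluate the scalar datum, not the vector
    for ( int d = 0 ; d < dim_out ; d++ )
      result[d][e] = v[d] ;    // deposit lane e of the result
  }
}

int main()
{
  simd_t < dim_in > c ;
  for ( int e = 0 ; e < vsize ; e++ )
  {
    c[0][e] = e + 1 ;            // 1 2 3 4
    c[1][e] = 10 * ( e + 1 ) ;   // 10 20 30 40
  }
  simd_t < dim_out > result ;
  broadcast_eval ( c , result ) ;
  for ( int e = 0 ; e < vsize ; e++ )
    std::cout << result[0][e] << " " ;   // prints 11 22 33 44
  std::cout << std::endl ;
}
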
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git