[vspline] 42/72: mainly work on remap.h. This was consolidation work and performance tuning, along with streamlining of the code to make it more compact. Added more comments as well.

Kay F. Jahnke kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:41 UTC 2017


This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit 9f272ab1be20e80287052c23451faee9bb11ecd8
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Mon Apr 10 11:42:38 2017 +0200

    mainly work on remap.h
    This was consolidation work and performance tuning, along with streamlining of the code
    to make it more compact. Added more comments as well.
---
 multithread.h   |  31 +++
 remap.h         | 759 ++++++++++++++++++++------------------------------------
 unary_functor.h |  64 ++---
 3 files changed, 327 insertions(+), 527 deletions(-)

diff --git a/multithread.h b/multithread.h
index d32d151..ffa1c19 100644
--- a/multithread.h
+++ b/multithread.h
@@ -367,6 +367,37 @@ partition ( shape_range_type<d> range ,
   return res ;
 }
 
+/// specialization for 1D shape range
+
+template<>
+partition_type < shape_range_type<1> >
+partition ( shape_range_type<1> range ,
+            int nparts )
+{
+  auto size = range[1][0] - range[0][0] ;
+  auto part_size = size / nparts ;
+  if ( part_size < 1 )
+    part_size = size ;
+  
+  nparts = int ( size / part_size ) ;
+  if ( nparts * part_size < size )
+    nparts++ ;
+
+  partition_type < shape_range_type<1> > res ( nparts ) ;
+  
+  auto start = range[0] ;
+  auto stop = start + part_size ;
+  for ( auto & e : res )
+  {
+    e[0] = start ;
+    e[1] = stop ;
+    start = stop ;
+    stop = start + part_size ;
+  }
+  res[nparts-1][1] = size ;
+  return res ;
+}
+
 /// action_wrapper wraps a functional into an outer function which
 /// first calls the functional and then checks if this was the last
 /// of a bunch of actions to complete, by incrementing the counter
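
The new specialization above splits a 1D range into contiguous chunks of near-equal size, with the part size rounded down and an extra part absorbing any remainder. As an illustration only (the name partition_1d and the std::pair representation are placeholders, not vspline API), a minimal standalone sketch of that partitioning behaviour:

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // split [lo,hi) into contiguous sub-ranges of near-equal size; if the range
    // holds fewer elements than nparts, a single part is produced
    std::vector < std::pair < std::ptrdiff_t , std::ptrdiff_t > >
    partition_1d ( std::ptrdiff_t lo , std::ptrdiff_t hi , int nparts )
    {
      std::vector < std::pair < std::ptrdiff_t , std::ptrdiff_t > > res ;
      std::ptrdiff_t size = hi - lo ;
      if ( size <= 0 )
        return res ;
      std::ptrdiff_t part_size = size / nparts ;
      if ( part_size < 1 )
        part_size = size ;                       // degenerate case: one part only
      for ( std::ptrdiff_t start = lo ; start < hi ; start += part_size )
        res.push_back ( { start , std::min ( start + part_size , hi ) } ) ;
      return res ;
    }

For example, partition_1d ( 0 , 10 , 4 ) yields {0,2}, {2,4}, {4,6}, {6,8}, {8,10}: five parts of size two, mirroring how the specialization rounds the part size down and adds one more part for the remainder.
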
diff --git a/remap.h b/remap.h
index 8b6e7de..762c693 100644
--- a/remap.h
+++ b/remap.h
@@ -3,7 +3,7 @@
 /*    vspline - a set of generic tools for creation and evaluation      */
 /*              of uniform b-splines                                    */
 /*                                                                      */
-/*            Copyright 2015, 2016 by Kay F. Jahnke                     */
+/*            Copyright 2015 - 2017 by Kay F. Jahnke                    */
 /*                                                                      */
 /*    Permission is hereby granted, free of charge, to any person       */
 /*    obtaining a copy of this software and associated documentation    */
@@ -50,12 +50,12 @@
 /// t[j] = i(a,c[j]) for all j
 ///
 /// st_remap is the single_threaded implementation; remap itself partitions its work
-/// and creates several threads, each running one instance of st_remap.
+/// and feeds several threads, each running one instance of st_remap.
 ///
-/// all remap routines take two template arguments:
+/// remap takes two template arguments:
 ///
 /// - unary_functor_type: functor object yielding values for coordinates
-/// - dim_target:        number of dimensions of output array
+/// - dim_target:         number of dimensions of output array
 ///
 /// remaps to other-dimensional objects are supported. This makes it possible to,
 /// for example, remap from a volume to a 2D image, using a 2D warp array containing
@@ -63,24 +63,22 @@
 ///
 /// In these routines, we can switch the use of vectorization on or off. When using
 /// vectorization, this is done in a straightforward fashion by aggregating the input.
-/// If the source/target array lend themselves to it, we can pass it's memory directly
-/// to the vectorized eval code. To keep the complexity down,  if the memory is not
-/// suited, I 'manually' aggregate to a small buffer and pass the buffer to and fro.
-/// There is possibly room for improvement here - one might use gather/scatter operations
-/// where the data aren't strictly in sequence - but the bulk of the processing time
-/// needed is in the access to (possibly widely scattered) memory and the complex
-/// calculations needed to provide the result, so I don't include code for more
-/// data access modes here.
+/// If the source/target array lends itself to it, we can pass its memory directly
+/// to the vectorized eval code.
+/// If the memory is not suited, I 'manually' aggregate to a simdized type.
 ///
 /// There is also a second set of remap functions in this file, which don't take a
 /// 'warp' array. Instead, for every target location, the location's discrete coordinates
 /// are passed to the unary_functor_type object. This way, transformation-based remaps
-/// can be implemented easily: the user code just has to provide a suitable 'interpolator'
+/// can be implemented easily: the user code just has to provide a suitable functor
 /// to yield values for coordinates. This interpolator will internally take the discrete
 /// incoming coordinates (into the target array) and transform them as required, internally
 /// producing coordinates suitable for the 'actual' interpolation using a b-spline or some
 /// other object capable of producing values for real coordinates. The routine offering
-/// this service is called index_remap.
+/// this service is called index_remap, and only takes one template argument, which is
+/// enough to derive all other types involved:
+///
+/// - unary_functor_type: functor object yielding values for coordinates
 ///
 /// This file also has code to evaluate a b-spline at positions in a mesh grid, which can be used
 /// for scaling, and for separable geometric transformations.
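
As a usage illustration of the remap described above, here is a hedged sketch. The remap and evaluator signatures follow the code in this commit; the spline setup, 'bsp', 'shape' and the vspline:: qualification are assumptions:

    // assuming 'bsp' is an already prefiltered 2D b-spline over float data,
    // and 'shape' is the desired target shape
    typedef vigra::TinyVector < float , 2 > crd_type ;        // 2D real coordinates
    typedef vspline::evaluator < crd_type , float > ev_type ; // interpolator over bsp
    ev_type ev ( bsp ) ;

    vigra::MultiArray < 2 , crd_type > warp ( shape ) ;   // one source coordinate per target cell
    vigra::MultiArray < 2 , float > target ( shape ) ;
    // ... fill 'warp' with the coordinates to look up ...
    vspline::remap < ev_type , 2 > ( ev , warp , target ) ; // target[j] = ev ( warp[j] )
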
@@ -111,181 +109,6 @@ using namespace vigra ;
 template < int dimension >
 using bcv_type = vigra::TinyVector < bc_code , dimension > ;
 
-/// struct coordinate_iterator_v provides the vectorized equivalent of vigra's
-/// MultiCoordinateIterator. The iterator itself is not very elaborate, it only
-/// supports dereferencing and preincrement, but for the intended purpose that's
-/// enough. The iterator is cyclical, so the number of times it's used has to be
-/// controlled by the calling code.
-///
-/// dereferencing this object will yield a vectorized coordinate which is guaranteed
-/// to deinterleave to vsize valid ordinary coordinates, but it is only guaranteed that
-/// these ordinary coordinates are unique if several preconditions are met:
-///
-/// - the shape (end - start) must have more than vsize entries
-///
-/// - the iteration doesn't proceed beyond the number of full vectors
-///
-/// The vectorized coordinate delivered at the 'wrap-around-point', so one iteration
-/// after the last full set has been taken, will be partly wrapped around to the start
-/// of the iteration. This makes sure the contained ordinary coordinates are valid.
-/// but accessing data through it will again touch elements which have already been
-/// seen. As long as the indices aren't used for an in-place operation, recalculating
-/// a few values does no harm.
-
-// TODO this is utility code and might be placed into another header, probably next to
-// the routine producing successive gather indices for vectorized array traversal
-
-// TODO when experimenting with coordinate iterators, I found that vectorized code for
-// creating gather/scatter indexes performed worse than simply filling the index vector
-// with successive single indices generated by vigra, and using vigra's facilities
-// also simplifies matters, see the use in filter.h, ele_aggregating_filter.
-
-#ifdef USE_VC
-
-template < class rc_type , class ic_type , int _dimension , int vsize >
-struct coordinate_iterator_v
-{
-  enum { dimension = _dimension } ;
-
-  typedef vigra::TinyVector < rc_type , dimension > nd_rc_type ;
-  typedef vigra::TinyVector < ic_type , dimension > nd_ic_type ;
-
-  typedef typename vector_traits < rc_type , vsize > :: type rc_v ;
-  typedef typename vector_traits < ic_type , vsize > :: type ic_v ;
-  
-  typedef vigra::TinyVector < ic_v , dimension > nd_ic_v ;
-  typedef vigra::TinyVector < rc_v , dimension > nd_rc_v ;
-
-  const nd_ic_type start ;
-  const nd_ic_type end ;
-  const nd_ic_type shape ;
-  
-  nd_ic_v c ;
-
-public:
-
-  coordinate_iterator_v ( nd_ic_type _start , nd_ic_type _end )
-  : start ( _start ) ,
-    end ( _end ) ,
-    shape ( _end - _start )
-  {
-    vigra::MultiCoordinateIterator<dimension> mci_start ( shape ) ;
-    auto mci_end = mci_start.getEndIterator() ;
-    auto mci = mci_start ;
-
-    for ( int i = 0 ; i < vsize ; i++ )
-    {
-      nd_ic_type index = *mci + start ;
-      ++mci ;
-      if ( mci >= mci_end )
-        mci = mci_start ;
-      for ( int d = 0 ; d < dimension ; d++ )
-        c[d][i] = index[d] ;
-    }
-  } ;
-
-  const nd_ic_v & operator*()
-  {
-    return c ;
-  }
-
-  coordinate_iterator_v & operator++()
-  {
-    c[0] += vsize ;        // increase level 0 values
-    auto mask = ( c[0] >= end[0] ) ; // mask overflow
-    while ( any_of ( mask ) )
-    {
-      // we need to do more only if c[0] overflows
-      c[0] ( mask ) -= shape[0] ; // fold back to range
-      for ( int d = 1 ; d < dimension ; d++ )
-      {
-        c[d] ( mask ) += 1 ; // increase next-higher dimension's index
-        mask = ( c[d] >= end[d] ) ; // check for range overflow
-        // resultant mask is either empty or the same as before
-        if ( none_of ( mask ) )         // if no overflow, we're done
-          break ;
-        c[d] ( mask ) = start[d] ; // set back to lower bound
-        // with the next loop iteration, the next dimension's index is increased
-      }
-      // having increased c[0] by vsize can require several iterations
-      // if shape[0] < vsize, so we need to mask, test, etc. again until
-      // all of c[0] are back in range 
-      mask = ( c[0] >= end[0] ) ; // mask overflow
-    }
-    // otherwise, increasing input[0] landed inside the range,
-    return *this ;
-  } ;
-} ;
-
-#endif
-
-// currently unused variant, using vigra's MultiCoordinateIterator. seems to perform the same.
-
-// template < class _rc_v , int _dimension >
-// struct coordinate_iterator_v
-// {
-//   enum { dimension = _dimension } ;
-//   typedef vigra::TinyVector < int , dimension > shape_type ;
-// 
-//   typedef _rc_v rc_v ;
-//   enum { vsize = rc_v::Size } ;
-//   typedef vigra::TinyVector < rc_v , dimension > nd_rc_v ;
-// 
-//   const shape_type start ;
-//   const shape_type end ;
-//   const shape_type shape ;
-//   
-//   nd_rc_v c ;
-// 
-//   typedef vigra::MultiCoordinateIterator<dimension> iter_type ;
-//   
-//   const iter_type mci_start ;
-//   const iter_type mci_end ;
-//   iter_type mci ;
-//   
-// public:
-// 
-//   coordinate_iterator_v ( const shape_type & _start ,
-//                           const shape_type & _end )
-//   : mci_start ( _end - _start ) ,
-//     mci_end ( iter_type ( _end - start ) . getEndIterator() ) ,
-//     start ( _start ) ,
-//     end ( _end ) ,
-//     shape ( _end - _start )
-//   {
-//     mci = mci_start ;
-// 
-//     for ( int i = 0 ; i < vsize ; i++ )
-//     {
-//       shape_type index = *mci + start ;
-//       ++mci ;
-//       if ( mci >= mci_end )
-//         mci = mci_start ;
-//       for ( int d = 0 ; d < dimension ; d++ )
-//         c[d][i] = index[d] ;
-//     }
-//   } ;
-// 
-//   const nd_rc_v& operator*() const
-//   {
-//     return c ;
-//   }
-// 
-//   coordinate_iterator_v & operator++()
-//   {
-//     for ( int i = 0 ; i < vsize ; i++ )
-//     {
-//       shape_type index = *mci + start ;
-//       ++mci ;
-//       if ( mci >= mci_end )
-//         mci = mci_start ;
-//       for ( int d = 0 ; d < dimension ; d++ )
-//         c[d][i] = index[d] ;
-//     }
-//     return *this ;
-//   } ;
-// } ;
-
 /// struct _fill contains the implementation of the 'engine' used for remap-like
 /// functions. The design logic is this: a remap will ultimately produce an array
 /// of results. This array is filled in standard scan order sequence by repeated
@@ -302,7 +125,7 @@ public:
 /// to the caller, since the data are deposited in normal interleaved fashion. The
 /// extra effort for vectorized operation is in the implementation of the generator
 /// functor and reasonably straightforward. If only the standard remap functions
-/// are used, the user can remain ignorant of the vectorizetion.
+/// are used, the user can remain ignorant of the vectorization.
 ///
 /// Why is _fill an object? It might be a function if partial specialization of functions
 /// were allowed. Since this is not so, we have to use a class and put the function in it's
@@ -321,6 +144,20 @@ public:
 /// - it has to offer a subrange routine, limiting output to a subarray
 ///   of the 'whole' output
 
+//  TODO might write an abstract base class specifying the interface
+
+/// In the current implementation, the hierarchical descent to subdimensional slices
+/// is always taken down to the lowest level, and the actual calls of the functor only
+/// occur there. While the hierarchical access consumes some processing time, mainly
+/// to establish the bounds for the 1D operation (and may partly be optimized away),
+/// the operation on 1D data can use optimizations which gain more than the descent
+/// costs. Especially when vectorized code is used, operation on 1D data is very
+/// efficient, since the data can be accessed using load/store or gather/scatter
+/// operations, even when the arrays involved are strided.
+/// Taking the hierarchical descent down to level 0 is encoded in fill() and its
+/// workhorse code; the generator objects implemented here depend on the descent
+/// going all the way down.
+///
 /// _fill is used by st_fill. It's an object implementing a hierarchical fill
 /// of the target array. This is done recursively: while the generator's and the
 /// target's dimension is greater than 1, they are sliced and the slices are processed
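
To make the descent described above concrete, here is a simplified conceptual sketch (no vectorization, no generator bindOuter; names are illustrative, and it requires <vigra/multi_array.hxx>) of how a recursion over vigra::MultiArrayView slices bottoms out in a plain 1D loop:

    // apply a value-producing functor f to every element of 'output' by slicing
    // off the outermost dimension until only a 1D view is left
    template < typename functor_type , int dim >
    struct descend
    {
      template < typename view_type >
      void operator() ( functor_type & f , view_type & output )
      {
        for ( int c = 0 ; c < output.shape ( dim - 1 ) ; c++ )
        {
          auto sub = output.bindOuter ( c ) ;        // slice along the highest axis
          descend < functor_type , dim - 1 > () ( f , sub ) ;
        }
      }
    } ;

    template < typename functor_type >
    struct descend < functor_type , 1 >              // 1D case: do the actual work
    {
      template < typename view_type >
      void operator() ( functor_type & f , view_type & output )
      {
        for ( auto & e : output )                    // simple, possibly strided, 1D traversal
          e = f() ;
      }
    } ;
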
@@ -328,7 +165,6 @@ public:
 
 template < typename generator_type  , // functor object yielding values
            int dim_out >              // number of dimensions of output array
-//            int floor_dim = 1 >        // intended lowest dimensionality
 struct _fill
 {
   void operator() ( generator_type & gen ,
@@ -337,94 +173,22 @@ struct _fill
                     bool use_vc = true
                   )
   {
-#ifdef USE_VC
-
-//     const int vsize = generator_type::vsize ;
-
-    if ( use_vc )
-    {
-//       if ( dim_out > floor_dim )
-//       {
-        // if we're not yet at the intended lowest level of recursion,
-        // we slice output and generator and feed the slices to the
-        // next lower recursion level
-        for ( int c = 0 ; c < output.shape ( dim_out - 1 ) ; c++ )
-        {
-          // recursively call _fill for each slice along the highest axis
-          auto sub_output = output.bindOuter ( c ) ;
-          auto sub_gen = gen.bindOuter ( c ) ;
-          _fill < decltype ( sub_gen ) , dim_out - 1 >()
-            ( sub_gen , sub_output ) ;
-        }
-        // after processing all slices, we're done and can return straight away
-        return ;
-//       }
-    }
-
-//       // we are at the intended lowest level of recursion already, so we
-//       // do the work here and now. This is the same code as the specialization
-//       // for dim_out==1 uses, so this code could be factored out. But we
-//       // want the option to do the processing without recursing all the way
-//       // down if that isn't necessary. The calling code can figure this out
-//       // and set floor_dim accordingly.
-// 
-//       int aggregates = leftover / vsize ;        // number of full vectors
-//       leftover -= aggregates * vsize ;           // remaining leftover single values
-// 
-//       if ( output.isUnstrided() )
-//       {
-//         // best case: output array has consecutive memory
-//         // get a pointer to target memory        
-//         value_type * destination = output.data() ;
-//         
-//         for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
-//         {
-//           // pass pointer to target memory to the generator
-//           gen ( destination ) ;
-//         }
-//         // if there aren't any leftovers, we can return straight away.
-//         if ( ! leftover )
-//           return ;
-// 
-//         // otherwise, advance target_it to remaining single values
-//         target_it += aggregates * vsize ;
-//       }
-//       else
-//       {
-//         // fall back to buffering and storing individual result values.
-//         // have a buffer ready
-//         value_type target_buffer [ vsize ] ;
-// 
-//         for ( int a = 0 ; a < aggregates ; a++ )
-//         {
-//           gen ( target_buffer ) ;             // pass buffer to the generator
-//           for ( int e = 0 ; e < vsize ; e++ )
-//           {
-//             *target_it = target_buffer[e] ;   // and deposit in target
-//             ++target_it ;
-//           }
-//         }
-//       }
-//     }
-    
-  #endif // USE_VC
-
-    typedef typename generator_type::value_type value_type ;
-    
-    auto target_it = output.begin() ;  
-    int leftover = output.elementCount() ;
-
-    // process leftovers. If vc isn't used, this loop does all the processing
-    
-    while ( leftover-- )
-    {
-      // process leftovers with single-value evaluation
-      gen ( *target_it ) ;
-      ++target_it ;
-    }
+      // we're not yet at the intended lowest level of recursion,
+      // so we slice output and generator and feed the slices to the
+      // next lower recursion level
+      for ( int c = 0 ; c < output.shape ( dim_out - 1 ) ; c++ )
+      {
+        // recursively call _fill for each slice along the highest axis
+        auto sub_output = output.bindOuter ( c ) ;
+        auto sub_gen = gen.bindOuter ( c ) ;
+        _fill < decltype ( sub_gen ) , dim_out - 1 >()
+          ( sub_gen , sub_output ) ;
+      }
   }
 } ;
 
+/// specialization of _fill for level 0 ends the recursive descent
+
 template < typename generator_type > // functor object yielding values
 struct _fill < generator_type , 1 >
 {
@@ -435,6 +199,7 @@ struct _fill < generator_type , 1 >
                   )
   {
     typedef typename generator_type::value_type value_type ;
+    typedef typename generator_type::functor_type functor_type ;
     
     auto target_it = output.begin() ;  
     int leftover = output.elementCount() ;
@@ -458,41 +223,25 @@ struct _fill < generator_type , 1 >
           // pass pointer to target memory to the generator
           gen ( destination ) ;
         }
-        // if there aren't any leftovers, we can return straight away.
-        if ( ! leftover )
-          return ;
-
-        // otherwise, advance target_it to remaining single values
-        target_it += aggregates * vsize ;
       }
       else
       {
-        // TODO: we're level 0 now, so we can scatter
-//         generator_type::nd_rc_v target_buffer ;
-//         for ( int a = 0 ; a < aggregates ; a++ )
-//         {
-//           gen ( target_buffer ) ;             // pass buffer to the generator
-//           
-//           for ( int d = 0 ; d < value_type::size() ; d++ )
-//           {
-//             target_buffer[d].scatter ( destination , rc_v::IndexesFromZero * d + d ) ;
-//           }
+        const functor_type & f ( gen.get_functor() ) ;
+        typename functor_type::out_v target_buffer ;
+        value_type * destination = output.data() ;
         
-        // fall back to buffering and storing individual result values.
-        // have a buffer ready
-
-        value_type target_buffer [ vsize ] ;
-        for ( int a = 0 ; a < aggregates ; a++ )
+        for ( int a = 0 ; a < aggregates ; a++ , destination += vsize * output.stride(0) )
         {
-          gen ( target_buffer ) ;             // pass buffer to the generator
-          
-          for ( int e = 0 ; e < vsize ; e++ )
-          {
-            *target_it = target_buffer[e] ;   // and deposit in target
-            ++target_it ;
-          }
+          gen ( target_buffer ) ;
+          f.store ( target_buffer , destination , output.stride(0) ) ;
         }
-      }
+      }        
+      // if there aren't any leftovers, we can return straight away.
+      if ( ! leftover )
+        return ;
+
+      // otherwise, advance target_it to remaining single values
+      target_it += aggregates * vsize ;
     }
     
 #endif // USE_VC
@@ -507,8 +256,6 @@ struct _fill < generator_type , 1 >
   }
 } ;
 
-
-
 /// single-threaded fill. This routine receives the range to process and the generator
 /// object capable of providing result values. The generator object is set up to provide
 /// values for the desired subrange and then passed to _fill, which handles the calls to
@@ -562,9 +309,6 @@ void fill ( generator_type & gen ,
 
   int njobs = 64 ; // = output.shape ( dim_target - 1 ) / 4 ;
 
-//   if ( njobs < 16 ) // small thing, try at least 16
-//     njobs = 16 ;
-
   // call multithread(), specifying the single-threaded fill routine as the functor
   // to invoke the threads, passing in 'range', which will be partitioned by multithread(),
   // followed by all the other parameters the single-threaded fill needs, which is
@@ -579,119 +323,43 @@ void fill ( generator_type & gen ,
                 use_vc ) ;     // flag to switch use of Vc on/off
 } ;
 
-// /// struct warp_generator combines a warp array and an interpolator into a functor
-// /// producing values to store in the output. This functor handles the whole chain of
-// /// operations from picking the coordinates from the warp array to delivering target
-// /// values. It can be fed to fill() to produce a remap function, see below.
-// 
-// template < int dimension ,
-//            typename unary_functor_type ,
-//            bool strided_warp >
-// struct _warp_generator
-// {
-//   typedef typename unary_functor_type::in_type nd_rc_type ;
-//   typedef typename unary_functor_type::out_type value_type ;
-//   
-//   typedef MultiArrayView < dimension , nd_rc_type > warp_array_type ;
-//   
-//   const warp_array_type warp ; // must not use reference here!
-//   typename warp_array_type::const_iterator witer ;
-//   
-//   const unary_functor_type & itp ;
-// 
-//   _warp_generator
-//     ( const warp_array_type & _warp ,
-//       const unary_functor_type & _itp )
-//   : warp ( _warp ) ,
-//     itp ( _itp ) ,
-//     witer ( _warp.begin() )
-//   { } ;
-// 
-//   void operator() ( value_type & target )
-//   {
-//     itp.eval ( *witer , target ) ;
-//     ++witer ;
-//   }
-// 
-// #ifdef USE_VC
-// 
-//   enum { vsize = unary_functor_type :: vsize } ;
-// 
-//   // operator() incorporates two variants, which depend on a template argument,
-//   // so the conditional has no run-time effect. We use witer to stay consistent
-//   // with the unvectorized version, which may be used to mop up leftovers.
-//   
-//   void operator() ( value_type * target )
-//   {
-//     if ( strided_warp )
-//     {
-//       nd_rc_type buffer [ vsize ] ;
-//       for ( int e = 0 ; e < vsize ; e++ , witer++ )
-//         buffer[e] = *witer ;
-//       itp.eval ( buffer , target ) ;
-//     }
-//     else
-//     {
-//       itp.eval ( &(*witer) , target ) ;
-//       witer += vsize ;
-//     }
-//   }
-// 
-// #endif
-// 
-//   _warp_generator < dimension , unary_functor_type , strided_warp >
-//     subrange ( const shape_range_type < dimension > & range ) const
-//   {
-//     return _warp_generator < dimension , unary_functor_type , strided_warp >
-//              ( warp.subarray ( range[0] , range[1] ) , itp ) ;
-//   }
-//   
-//   _warp_generator < dimension - 1 , unary_functor_type , strided_warp >
-//     bindOuter ( const int & c ) const
-//   {
-//     return _warp_generator < dimension - 1 , unary_functor_type , strided_warp >
-//              ( warp.bindOuter ( c ) , itp ) ;
-//   }  
-// } ;
-
-/// warp_generator for dimensions > 1. Here we provide 'subrange' and
+/// Next we code 'generators' for use with fill(). These objects can yield values
+/// to the fill routine, each in its specific way. The first type we define is
+/// warp_generator. This generator yields data from an array, which, in the context
+/// of a remap-like function, will provide the coordinates to feed to the interpolator.
+/// First is warp_generator for dimensions > 1. Here we provide 'subrange' and
 /// 'bindOuter' to be used for the hierarchical descent in _fill. The current
-/// implementation relies of the hierarchical descent going all the way to 1D!
+/// implementation relies on the hierarchical descent going all the way to 1D,
+/// and does not implement operator() until the 1D specialization.
 
 template < int dimension ,
            typename unary_functor_type ,
            bool strided_warp >
 struct warp_generator
 {
-  typedef typename unary_functor_type::in_type nd_rc_type ;
+  typedef unary_functor_type functor_type ;
+  
   typedef typename unary_functor_type::out_type value_type ;
+  typedef typename unary_functor_type::in_type nd_rc_type ;
   
   typedef MultiArrayView < dimension , nd_rc_type > warp_array_type ;
   
   const warp_array_type warp ; // must not use reference here!
-  const nd_rc_type * data ;
-  typename warp_array_type::const_iterator witer ;
   
   const unary_functor_type & itp ;
+
+  const unary_functor_type & get_functor()
+  {
+    return itp ;
+  }
   
   warp_generator
     ( const warp_array_type & _warp ,
       const unary_functor_type & _itp )
   : warp ( _warp ) ,
-    itp ( _itp ) ,
-    witer ( _warp.begin() ) ,
-    data ( _warp.data() )
+    itp ( _itp )
   { } ;
 
-  // with Vc present, we only need operator() here for the code to compile,
-  // it's not used. But the unvectorized version may use it.
-
-  void operator() ( value_type & target )
-  {
-    itp.eval ( *witer , target ) ;
-    ++witer ;
-  }
-
   warp_generator < dimension , unary_functor_type , strided_warp >
     subrange ( const shape_range_type < dimension > & range ) const
   {
@@ -708,41 +376,43 @@ struct warp_generator
 } ;
 
 /// specialization of warp_generator for dimension 1. Here we have
-/// (vectorized) evaluation code, but not for subrange and bindOuter.
+/// (vectorized) evaluation code and subrange(), but no bindOuter().
 
 template < typename unary_functor_type ,
            bool strided_warp >
 struct warp_generator < 1 , unary_functor_type , strided_warp >
 {
+  typedef unary_functor_type functor_type ;
+  
   typedef typename unary_functor_type::in_type nd_rc_type ;
   typedef typename unary_functor_type::out_type value_type ;
   
   typedef MultiArrayView < 1 , nd_rc_type > warp_array_type ;
   
   const warp_array_type warp ; // must not use reference here!
+  const int stride ;
   const nd_rc_type * data ;
   typename warp_array_type::const_iterator witer ;
   
   const unary_functor_type & itp ;
   
+  const unary_functor_type & get_functor()
+  {
+    return itp ;
+  }
+  
   warp_generator
     ( const warp_array_type & _warp ,
       const unary_functor_type & _itp )
   : warp ( _warp ) ,
+    stride ( _warp.stride(0) ) ,
     itp ( _itp ) ,
     witer ( _warp.begin() ) ,
     data ( _warp.data() )
   {
 #ifdef USE_VC
-    if ( ! strided_warp )
-    {
-      // if the warp array is unstrided, 'witer' will only be used
-      // by the 'mop-up' action processing the leftover single values
-      // so we advance witer to this location.
-
-      int aggregates = warp.size() / vsize ;
-      witer += aggregates * vsize ;
-    }
+    int aggregates = warp.size() / vsize ;
+    witer += aggregates * vsize ;
 #endif
   } ;
 
@@ -761,36 +431,51 @@ struct warp_generator < 1 , unary_functor_type , strided_warp >
   enum { vsize = unary_functor_type :: vsize } ;
 
   // operator() incorporates two variants, which depend on a template argument,
-  // so the conditional has no run-time effect.
+  // so the conditional has no run-time effect. The template argument target_type
+  // determines the evaluation target and picks the appropriate eval variant.
   
-  void operator() ( value_type * target )
+  template < typename target_type >
+  void operator() ( target_type & target )
   {
     if ( strided_warp )
     {
-      // if the warp array is strided, we collect the values one by one
-      nd_rc_type buffer [ vsize ] ;
-      for ( int e = 0 ; e < vsize ; e++ , witer++ )
-        buffer[e] = *witer ;
+      // if the warp array is strided, we use unary_functor's strided load
+      // routine to assemble a simdized input value with gather operations      
+      typename unary_functor_type::in_v buffer ;
+      itp.load ( data , stride , buffer ) ;
+      // now we pass the simdized value to the evaluation routine
       itp.eval ( buffer , target ) ;
+      data += vsize * stride ;
     }
     else
     {
-      // otherwise we can collect them with an efficient load operation,
-      // which is affected by passing a pointer (data) to the evaluation
-      // routine
+      // otherwise we can collect them with a more efficient load operation,
+      // which is effected by passing a pointer to the evaluation routine
       itp.eval ( data , target ) ;
       data += vsize ;
     }
   }
 
 #endif
+
+  warp_generator < 1 , unary_functor_type , strided_warp >
+    subrange ( const shape_range_type < 1 > & range ) const
+  {
+    return warp_generator < 1 , unary_functor_type , strided_warp >
+             ( warp.subarray ( range[0] , range[1] ) , itp ) ;
+  }
+
 } ;
 
 /// implementation of remap() by delegation to the more general fill() routine, passing
-/// in the warp array and the interpolator via a generator object. warp_generator
-/// has a fast variant which requires the whole warp array to be unstrided, a requirement
-/// which can usually be fulfilled, since warp arrays tend to be made just for the purpose
-/// of a remap. Picking of the correct variant is done here.
+/// in the warp array and the interpolator via a generator object. Calling this routine
+/// remap() doesn't quite do its scope justice; it might be more appropriate to call it
+/// transform(), since it applies a functor to a set of inputs, yielding a set of outputs.
+/// This is a generalization of a remap routine: the remap concept looks at the incoming
+/// data as coordinates, at the functor as an interpolator yielding values for coordinates,
+/// and at the output as an array of the values generated this way. In this implementation
+/// of remap, incoming and outgoing data aren't necessarily coordinates or the result of
+/// an interpolation; they can be any pair of types which the functor can handle.
 ///
 /// remap takes two template arguments:
 ///
@@ -820,7 +505,7 @@ struct warp_generator < 1 , unary_functor_type , strided_warp >
 ///   code in compiles which have it activated
 
 template < typename unary_functor_type  , // functor object yielding values for coordinates
-           int dim_target >              // number of dimensions of output array
+           int dim_target >               // number of dimensions of output array
 void remap ( const unary_functor_type & ev ,
              const MultiArrayView < dim_target ,
                                     typename unary_functor_type::in_type > & warp ,
@@ -859,6 +544,22 @@ void remap ( const unary_functor_type & ev ,
   }
 }
 
+/// We code 'apply' as a special variant of remap where the output is also used
+/// as the 'warp' array, so the effect is to feed the unary functor each 'output'
+/// value in turn, let it process the value, and store the result back to the
+/// same location.
+
+template < class unary_functor_type , // type satisfying the interface in class unary_functor
+           int dim_target >          // dimension of target array
+void apply ( const unary_functor_type & ev ,
+              MultiArrayView < dim_target ,
+                               typename unary_functor_type::out_type > & output ,
+              bool use_vc = true           
+            )
+{
+  remap < unary_functor_type , dim_target > ( ev , output , output ) ;
+}
+
 /// This is a variant of remap, which directly takes an array of values and remaps it,
 /// internally creating a b-spline of given order just for the purpose. This is used for
 /// one-shot remaps where the spline isn't reused.
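
A brief note on the apply() variant added above: since the output array doubles as the warp array, the functor's in_type and out_type must be the same type. A hedged usage sketch, where 'f', 'pixel_type' and 'shape' are placeholders:

    // 'f' is some unary_functor whose in_type equals its out_type, for example
    // a gamma or colour-twisting functor working on pixel_type values
    vigra::MultiArray < 2 , pixel_type > image ( shape ) ;
    vspline::apply < decltype ( f ) , 2 > ( f , image ) ;         // process image in place
    // which is the same as
    // vspline::remap < decltype ( f ) , 2 > ( f , image , image ) ;
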
@@ -900,7 +601,6 @@ int remap ( const MultiArrayView < coordinate_traits < coordinate_type > :: dime
 
   // create an evaluator over the bspline
 
-//   typedef evaluator < dim_in , value_type , rc_type , ic_type > evaluator_type ;
   typedef evaluator < coordinate_type , value_type > evaluator_type ;
   evaluator_type ev ( bsp ) ;
   
@@ -912,19 +612,20 @@ int remap ( const MultiArrayView < coordinate_traits < coordinate_type > :: dime
   return 0 ;
 }
 
-/// index_generator is like warp_generator, but instead of picking coordinates from
-/// an array, the corresponding discrete coordinates are passed to the interpolator.
-/// This is a convenience
-/// routine, because it delivers the logic of both coordinate generation and value
-/// storage, in several threads (due to the use of fill) - and leaves the interesting
-/// work to be done in the unary_functor.
+/// class index_generator provides its functor with nD indices which coincide with the
+/// location in the target array for which the functor is called. The data type of
+/// these indices is derived from the functor's input type. Again we presume that
+/// fill() will recurse down to level 0, so index_generator's operator() will only be
+/// called at the lowest level of recursion, and we needn't even define it for higher levels.
 
-template < int dim_target , typename unary_functor_type >
+template < typename unary_functor_type , int level >
 struct index_generator
 {
+  typedef unary_functor_type functor_type ;
+  
   typedef typename unary_functor_type::out_type value_type ;
 
-  enum { dim_source = unary_functor_type::dim_in } ;
+  enum { dimension = unary_functor_type::dim_in } ;
   
 #ifdef USE_VC
 
@@ -937,92 +638,159 @@ struct index_generator
 #endif
   
   const unary_functor_type & itp ;
-  const shape_range_type < dim_target > range ;
-  vigra::MultiCoordinateIterator < dim_source > mci ;
+  const shape_range_type < dimension > range ;
+  
+  const unary_functor_type & get_functor()
+  {
+    return itp ;
+  }
   
   index_generator
     ( const unary_functor_type & _itp ,
-      const shape_range_type < dim_target > _range
+      const shape_range_type < dimension > _range
     )
   : itp ( _itp ) ,
-    range ( _range ) ,
-    mci ( _range[1] - _range[0] )
+    range ( _range )
   { } ;
 
-  void operator() ( value_type & target )
+  index_generator < unary_functor_type , level >
+    subrange ( const shape_range_type < dimension > range ) const
   {
-    itp.eval ( *mci + range[0] , target ) ;
-    ++mci ;
+    return index_generator < unary_functor_type , level >
+             ( itp , range ) ;
   }
+  
+  index_generator < unary_functor_type , level - 1 >
+    bindOuter ( const int & c ) const
+  {
+    auto slice_start = range[0] , slice_end = range[1] ;
+
+    slice_start [ level ] += c ;
+    slice_end [ level ] = slice_start [ level ] + 1 ;
+    
+    return index_generator < unary_functor_type , level - 1 >
+             ( itp , shape_range_type < dimension > ( slice_start , slice_end ) ) ;
+  }  
+} ;
+
+/// specialization of index_generator for level 0. Here, the indices for all higher
+/// dimensions have been fixed by the hierarchical descent, and we only need to concern
+/// ourselves with the index for dimension 0, and supply the operator() implementations.
+/// Note how we derive the concrete type of index from the functor. This way, whatever
+/// type the functor takes is provided directly, with no need for type conversion, which
+/// would be necessary if we only produced integral indices here.
 
+template < typename unary_functor_type >
+struct index_generator < unary_functor_type , 0 >
+{
+  typedef unary_functor_type functor_type ;
+  
+  typedef typename unary_functor_type::in_type index_type ;
+  typedef typename unary_functor_type::in_ele_type index_ele_type ;
+  typedef typename unary_functor_type::out_type value_type ;
+
+  enum { dimension = unary_functor_type::dim_in } ;
+  
 #ifdef USE_VC
+
+  enum { vsize = unary_functor_type::vsize } ;
+  typedef typename unary_functor_type::in_v index_v ;
+  typedef typename unary_functor_type::in_ele_v index_ele_v ;
+
+  index_v current_v ; // current vectorized index to feed to functor
+
+#else
+  
+  enum { vsize = 1 } ;
+
+#endif
   
-  typedef typename vigra::TinyVector < typename unary_functor_type::ic_v ,
-                                       dim_source > nd_ic_v ;
+  index_type current ; // singular index
+
+  const unary_functor_type & itp ;
+  const shape_range_type < dimension > range ;
 
-  void operator() ( value_type * target )
+  const unary_functor_type & get_functor()
   {
-    nd_ic_v index ;
-    for ( int e = 0 ; e < vsize ; e++ )
-    {
-      for ( int d = 0 ; d < dim_source ; d++ )
-      {
-        index[d][e] = (*mci)[d] + range[0][d] ;
-      }
-      ++mci ;
-    }
-    itp.eval ( index , target ) ;
+    return itp ;
   }
+  
+  index_generator
+    ( const unary_functor_type & _itp ,
+      const shape_range_type < dimension > _range
+    )
+  : itp ( _itp ) ,
+    range ( _range )
+  {
+    // initially, set the singular index to the beginning of the range
+    current = index_type ( range[0] ) ;
+    
+#ifdef USE_VC
+    
+    // initialize current_v to hold the first simdized index
+    for ( int d = 0 ; d < dimension ; d++ )
+      current_v[d] = index_ele_v ( range[0][d] ) ;
+    current_v[0] += index_ele_v::IndexesFromZero() ;
+    
+    // if vc is used, the singular index will only be used for mop-up action
+    // after all aggregates have been processed.
+    int size = range[1][0] - range[0][0] ;
+    int aggregates = size / vsize ;
+    current[0] += index_ele_type ( aggregates * vsize ) ; // for mop-up
 
 #endif
 
-  index_generator < dim_target , unary_functor_type >
-    subrange ( const shape_range_type < dim_target > range ) const
+  } ;
+
+  /// single-value evaluation. This will be used for all values if vc isn't used,
+  /// or only for mop-up action after all full vectors are processed.
+
+  void operator() ( value_type & target )
   {
-    return index_generator < dim_target , unary_functor_type >
-             ( itp , range ) ;
+    itp.eval ( current , target ) ;
+    current[0] += index_ele_type ( 1 ) ;
   }
-  
-  index_generator < dim_target , unary_functor_type >
-    bindOuter ( const int & c ) const
+
+#ifdef USE_VC
+ 
+  /// vectorized evaluation. The hierarchical descent has left us with only the
+  /// level-0 coordinate to increase, making this code very efficient.
+
+  template < typename target_type >
+  void operator() ( target_type & target )
   {
-    auto slice_start = range[0] , slice_end = range[1] ;
+    itp.eval ( current_v , target ) ;
+    current_v[0] += index_ele_v ( vsize ) ;
+  }
 
-    slice_start [ dim_target - 1 ] += c ;
-    slice_end [ dim_target - 1 ] = slice_start [ dim_target - 1 ] + 1 ;
-    
-    return index_generator < dim_target , unary_functor_type >
-             ( itp , shape_range_type < dim_target > ( slice_start , slice_end ) ) ;
-  }  
+#endif
+
+  /// while we are at the lowest level here, we still need the subrange routine
+  /// for cases where the data are 1D in the first place: in this situation,
+  /// we need to split up the range as well.
+
+  index_generator < unary_functor_type , 0 >
+    subrange ( const shape_range_type < dimension > range ) const
+  {
+    return index_generator < unary_functor_type , 0 >
+             ( itp , range ) ;
+  }
 } ;
 
 /// index_remap() is very similar to remap(), but while remap() picks coordinates from
 /// an array (the 'warp' array), index_remap feeds the discrete coordinates of the
 /// successive places data should be rendered to directly to the unary_functor_type object.
-/// So here we need a different type of unary_functor_type object: in remap(), we would use
-/// one which takes nD real coordinates, here we use one which takes nD discrete
-/// coordinates. Typically the unary_functor_type object we use here will therefore have
-/// some stage of converting the incoming discrete coordinates to 'interesting' real
-/// coordinates. If, for example, incoming coordinates are like
-/// (0,0), (1,0), (2,0) ...
-/// the unary_functor_type object might, in a first step, use a real factor on them to
-/// produce real coordinates like
-/// (0.0,0.0), (0.1,0.0), (0.2,0.0)...
-/// which in turn are used to perform some evaluation, yielding target data
-/// (R,G,B), (R,G,B),...
-/// which might be taken up again by index_remap to be stored in the target.
+/// Since the data type of the coordinates is derived from the unary functor's input type,
+/// index_generator can produce integral and real indices, as needed.
 ///
-/// index_remap takes two template arguments:
+/// index_remap takes one template argument:
 ///
 /// - 'unary_functor_type', which is a class satisfying the interface laid down in
 ///   unary_functor.h. This is an object which can provide values given coordinates,
 ///   like class evaluator, but generalized to allow for arbitrary ways of achieving
-///   it's goal
-///
-/// - 'dim_target' - the number of dimensions of the target array. While the number of
-///   dimensions of the source data is apparent from the unary_functor_type object, the
-///   target array's dimensionality may be different, like when picking 2D slices from
-///   a volume.
+///   its goal. The unary functor's in_type determines the number of dimensions of
+///   the indices - since they are indices into the target array, the functor's input
+///   type has to have the same number of dimensions as the target array.
 ///
 /// index_remap takes three parameters:
 ///
@@ -1035,17 +803,18 @@ struct index_generator
 /// - a boolean flag 'use_vc' which can be set to false to switch off the use of vector
 ///   code in compiles which have it activated
 
-template < class unary_functor_type , // type satisfying the interface in class unary_functor
-           int dim_target >          // dimension of target array
+template < class unary_functor_type > // type satisfying the interface in class unary_functor
 void index_remap ( const unary_functor_type & ev ,
-                   MultiArrayView < dim_target ,
+                   MultiArrayView < unary_functor_type::dim_in ,
                                     typename unary_functor_type::out_type > & output ,
                    bool use_vc = true           
                 )
 {
+  enum { dim_target = unary_functor_type::dim_in } ;
+  
   typedef typename unary_functor_type::out_type value_type ;
   typedef TinyVector < int , dim_target > nd_ic_type ;
-  typedef index_generator < dim_target , unary_functor_type > gen_t ;
+  typedef index_generator < unary_functor_type , dim_target - 1 > gen_t ;
 
   shape_range_type < dim_target > range ( nd_ic_type() , output.shape() ) ;  
   gen_t gen ( ev , range ) ;  
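
A hedged usage sketch of index_remap, following the single template argument and call signature shown above; 'tf', 'pixel_type' and 'target_shape' are placeholders:

    // 'tf' is assumed to be a unary_functor taking 2D coordinates (its in_type,
    // matching the target's dimensionality) and yielding pixel_type values; a
    // typical tf scales the incoming discrete target coordinates and evaluates
    // a b-spline at the resulting source position
    vigra::MultiArray < 2 , pixel_type > target ( target_shape ) ;
    vspline::index_remap < decltype ( tf ) > ( tf , target ) ;
    // now target holds tf ( c ) for every discrete target coordinate c
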
@@ -1283,7 +1052,7 @@ void st_grid_eval ( shape_range_type < target_type::actual_dimension > range ,
 // this is the multithreaded version of grid_eval, which sets up the
 // full range over 'result' and calls 'multithread' to do the rest
 
-/// grid_eval evaluates a b-spline object (TODO: general interpolator obj?)
+/// grid_eval evaluates a b-spline object
 /// at points whose coordinates are distributed in a grid, so that for
 /// every axis there is a set of as many coordinates as this axis is long,
 /// which will be used in the grid as the coordinate for this axis at the
@@ -1305,7 +1074,7 @@ void st_grid_eval ( shape_range_type < target_type::actual_dimension > range ,
 /// splines this saves quite some time, since the generation of weights
 /// is computationally expensive.
 ///
-/// grid_eval is useful for generating scaled representation of the original
+/// grid_eval is useful for generating a scaled representation of the original
 /// data, but when scaling down, aliasing will occur and the data should be
 /// low-pass-filtered adequately before processing. Let me hint here that
 /// low-pass filtering can be achieved by using b-spline reconstruction on
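
The economy grid_eval exploits, generating per-axis weights once per grid line and reusing them, can be illustrated by this plain, self-contained analogue. This is not vspline code; std::sin and std::cos merely stand in for the per-axis weight generation:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // evaluate a separable function on an nx * ny grid; the per-axis factors
    // are computed only nx + ny times instead of 2 * nx * ny times
    std::vector < double > grid_product ( const std::vector < double > & gx ,
                                          const std::vector < double > & gy )
    {
      std::vector < double > wx ( gx.size() ) , wy ( gy.size() ) ;
      for ( std::size_t x = 0 ; x < gx.size() ; x++ )
        wx [ x ] = std::sin ( gx [ x ] ) ;           // once per column
      for ( std::size_t y = 0 ; y < gy.size() ; y++ )
        wy [ y ] = std::cos ( gy [ y ] ) ;           // once per row
      std::vector < double > result ( gx.size() * gy.size() ) ;
      for ( std::size_t y = 0 ; y < gy.size() ; y++ )
        for ( std::size_t x = 0 ; x < gx.size() ; x++ )
          result [ y * gx.size() + x ] = wx [ x ] * wy [ y ] ;
      return result ;
    }
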
diff --git a/unary_functor.h b/unary_functor.h
index 6f6ccde..5ddfdd6 100644
--- a/unary_functor.h
+++ b/unary_functor.h
@@ -238,22 +238,22 @@ struct unary_functor
     }
   }
   
-//   /// variant of load() taking a stride, currently unused
-//   
-//   template < typename T >
-//   void load ( const T * const & pc ,
-//               const int & stride ,
-//               typename vector_traits < T , vsize > :: type & cv ) const
-//   {
-//     typedef typename vigra::ExpandElementResult < T > :: type ele_type ;
-//     enum { dimension = vigra::ExpandElementResult < T > :: size } ;
-// 
-//     for ( int d = 0 ; d < dimension ; d++ )
-//     {
-//       cv[d].gather ( (ele_type*) pc ,
-//                       ic_v::IndexesFromZero() * dimension * stride + d ) ;
-//     }
-//   }
+  /// variant of load() taking a stride
+  
+  template < typename T >
+  void load ( const T * const & pc ,
+              const int & stride ,
+              typename vector_traits < T , vsize > :: type & cv ) const
+  {
+    typedef typename vigra::ExpandElementResult < T > :: type ele_type ;
+    enum { dimension = vigra::ExpandElementResult < T > :: size } ;
+
+    for ( int d = 0 ; d < dimension ; d++ )
+    {
+      cv[d].gather ( (ele_type*) pc ,
+                      ic_v::IndexesFromZero() * dimension * stride + d ) ;
+    }
+  }
   
   /// method to store/scatter a simdized value to interleaved memory
 
@@ -278,22 +278,22 @@ struct unary_functor
     }
   } ;
 
-//   /// variant of store taking a stride, currently unused
-//   
-//   template < typename T >
-//   void store ( const typename vector_traits < T , vsize > :: type & ev ,
-//                T * const & result ,
-//                const int & stride ) const
-//   {
-//     typedef typename vigra::ExpandElementResult < T > :: type ele_type ;
-//     enum { dimension = vigra::ExpandElementResult < T > :: size } ;
-// 
-//     for ( int d = 0 ; d < dimension ; d++ )
-//     {
-//       ev[d].scatter ( (ele_type*) result ,
-//                       ic_v::IndexesFromZero() * dimension * stride + d ) ;
-//     }
-//   } ;
+  /// variant of store taking a stride
+  
+  template < typename T >
+  void store ( const typename vector_traits < T , vsize > :: type & ev ,
+               T * const & result ,
+               const int & stride ) const
+  {
+    typedef typename vigra::ExpandElementResult < T > :: type ele_type ;
+    enum { dimension = vigra::ExpandElementResult < T > :: size } ;
+
+    for ( int d = 0 ; d < dimension ; d++ )
+    {
+      ev[d].scatter ( (ele_type*) result ,
+                      ic_v::IndexesFromZero() * dimension * stride + d ) ;
+    }
+  } ;
 
   void eval ( const in_type * const pmc ,
               out_type * result ) const
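
The strided load()/store() variants restored above gather and scatter the elementary components of interleaved values lying 'stride' values apart in memory. A scalar model of the same index arithmetic (plain C++, no Vc), with 'channels' playing the role of the aggregate's dimension and the function names being placeholders:

    #include <cstddef>

    // gather: lane e of channel d is read from element e * stride * channels + d,
    // mirroring the Vc gather with IndexesFromZero() * dimension * stride + d
    void gather_strided ( const float * pc , int channels , int stride ,
                          int vsize , float * cv /* [channels][vsize] */ )
    {
      for ( int d = 0 ; d < channels ; d++ )
        for ( int e = 0 ; e < vsize ; e++ )
          cv [ d * vsize + e ] = pc [ std::size_t ( e ) * stride * channels + d ] ;
    }

    // scatter: the same index arithmetic, writing instead of reading
    void scatter_strided ( const float * ev , float * result , int channels ,
                           int stride , int vsize )
    {
      for ( int d = 0 ; d < channels ; d++ )
        for ( int e = 0 ; e < vsize ; e++ )
          result [ std::size_t ( e ) * stride * channels + d ] = ev [ d * vsize + e ] ;
    }
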

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git


