Sun Jul 2 09:02:38 UTC 2017

This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit 8e7ee9df61a95c6b06202a6606cd92a84457884f
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Mon Nov 21 17:46:05 2016 +0100

    multithread() from multithread.h now used for all multithreading
    I did the rewrite of the code for multithreading in remap.h and pv.cc.
    This made the 'divide_and_conquer' code in common.h obsolete. While using
    multithread() I found no way to pass references 'through' multithread(),
    so now I use pointers throughout to do the passage. Changing the single-threaded
    code turned out straightforward by simply adding the extra range parameter,
    changing all access to manifolds to pointer access but instantly on procedure
    entry forming the subarrays:
    
    void pan_v ( shape_range_type < 2 > range ,
                 warp_array_view_type * p_warp )
    {
      auto warp = p_warp->subarray ( range[0] , range[1] ) ;
      auto ofs = range[0] ;
      ...
    
    and in the code calling multithread():
    
      shape_range_type < 2 > range ( shape_type < 2 > () , warp.shape() ) ;
      warp_array_view_type wv ( warp ) ;
      multithread ( & pan_v , ncores , range , &wv ) ;
---
 common.h      | 489 ----------------------------------------------------------
 filter.h      |  28 ++--
 multithread.h |  74 ++++++++-
 remap.h       | 278 +++++++++++++++------------------
 4 files changed, 212 insertions(+), 657 deletions(-)

diff --git a/common.h b/common.h
index aeb9768..50cb7ff 100644
--- a/common.h
+++ b/common.h
@@ -187,495 +187,6 @@ std::vector < std::string > bc_name =
 
 const int ncores = std::thread::hardware_concurrency() ;
 
-/// split_array_to_chunks partitions an array. This is used for multithreading:
-/// It's possible to distribute the processing to several threads by splitting
-/// the array to chunks so that the axis which is processed is not broken up.
-/// That's what the 'forbid' parameter is for: it keeps the routine from wrongly
-/// splitting the array along a specific axis. With the default of -1, all axes
-/// are considered valid candidates for splitting.
-/// The return value indicates into how many chunks the input array was actually
-/// split. Splitting sounds like an expensive operation, but it is not. All processing
-/// is done with views, the data aren't copied.
-
-template < class view_t >
-int split_array_to_chunks ( view_t & a ,                   ///< view to array to be split n-ways
-                            std::vector < view_t > & res , ///< vector to accomodate the chunks
-                            int n ,                        ///< intended number of chunks
-                            int forbid = -1 )              ///< axis which shouldn't be split
-{
-  const int dim = view_t::actual_dimension ;
-  typedef typename view_t::difference_type shape_t ;
-  
-  // find outermost dimension which can be split n-ways
-  shape_t shape = a.shape() ;
-  
-  // find the outermost dimension that can be split n ways, and it's extent 
-  int split_dim = -1 ;
-  int max_extent = -1 ;
-  for ( int md = dim - 1 ; md >= 0 ; md-- )
-  {
-    if (    md != forbid
-         && shape[md] > max_extent
-         && shape[md] >= n )
-    {
-      max_extent = shape[md] ;
-      split_dim = md ;
-      break ;
-    }
-  }
-  
-  // if the search did not yet succeed:
-  if ( max_extent == -1 )
-  {
-    // repeat process with relaxed conditions: now the search will also succeed
-    // if there is an axis which can be split less than n ways
-    for ( int md = dim - 1 ; md >= 0 ; md-- )
-    {
-      if (    md != forbid
-           && shape[md] > 1 )
-      {
-        max_extent = shape[md] ;
-        split_dim = md ;
-        break ;
-      }
-    }
-  }
-  
-  if ( split_dim != -1 ) // we have found a dimension for splitting
-  {
-    int w = shape [ split_dim ] ;  // extent of the dimension we can split
-    n = std::min ( n , w ) ;       // just in case, if that is smaller than n
-    
-    int cut [ n ] ;   // where to chop up this dimension
-    
-    for ( int i = 0 ; i < n ; i++ )
-      cut[i] = ( (i+1) * w ) / n ;   // roughly equal chunks, but certainly last cut == a.end()
-
-    shape_t start , end = shape ;
-
-    for ( int i = 0 ; i < n ; i++ )
-    {
-      end [ split_dim ] = cut [ i ];                  // apply the cut locations
-      res.push_back ( a.subarray ( start , end ) ) ;
-      start [ split_dim ] = end [ split_dim ] ;
-    }
-  }
-  else // no luck, fill the vector with the initial array as it's sole member
-  {
-    res.push_back ( a ) ;
-    n = 1 ;
-  }
-  return n ; // return the number of chunks
-}
-
-/// divide_and_conquer tries to split an array into chunks and processes each chunk
-/// in a separate thread with the functor passed as 'func'. This functor takes a reference
-/// to a chunk of data and it's offset in the original array. There's a variant of run()
-/// taking a point function and a variant taking a vectorized point function
-
-template < class view_type > // type of input array
-struct divide_and_conquer
-{
-  enum { dim = view_type::actual_dimension } ;
-  typedef typename view_type::value_type value_type ;
-  enum { channels = vigra::ExpandElementResult < value_type > :: size } ;
-  typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
-  typedef typename view_type::difference_type shape_type ;
-
-  /// callable chunk_func is used to process each individual chunk in it's
-  /// separate thread. Alternatively, if splitting fails, it is used on the whole
-  /// data set. The additional shape which is passed in can be used by the chunk
-  /// processing function to determine it's position inside the 'whole' array.
-
-  typedef std::function < void ( view_type & , // chunk of data to process
-                                 shape_type )  // offset of chunk in 'whole' data set
-                        > chunk_func ;
-
-  /// run tries to split 'data' into 'nthread' views. If 'forbid' refers to an axis
-  /// of 'data', this axis will not be split. If splitting fails, 'func' is applied
-  /// to 'data', otherwise, 'func' is applied to all chunks that resulted from splitting,
-  /// concurrently, in separate threads.
-
-  static void run ( view_type & data ,   // array of data to divide (into chunks)
-                    chunk_func func ,    // functor to apply to each chunk
-                    int forbid = -1 ,    // axis that should not be split (default: no restriction)
-                    int nthreads = ncores )
-  {
-    if ( nthreads > 1 )
-    {
-      // we split the array into equal chunks
-      // to process them by one thread each
-
-      std::vector<view_type> dn ; // space for chunks
-      nthreads = split_array_to_chunks <view_type> ( data , dn , nthreads , forbid ) ;
-      
-      // one thread's here already
-      int new_threads = nthreads - 1 ;
-      std::thread * t[new_threads] ;
-      
-      // we'll tell the threads where in relation to the start of the data
-      // 'their' chunk is located. This is needed if the coordinates are used
-      // as input, like in the transform-based remap
-      
-      shape_type offset ;
-
-      // create the threads, each with correct chunk and offset
-      int s ;
-      for ( s = 0 ; s < new_threads ; s++ )
-      {
-        t[s] = new std::thread ( func , std::ref(dn[s]) , offset ) ;
-        for ( int d = 0 ; d < dim ; d ++ )
-        {
-          if ( dn[s].shape ( d ) != data.shape(d) )
-            offset[d] += dn[s].shape(d) ;
-        }
-      }
-      // and process the last chunk in this thread
-      func ( dn[s] , offset ) ;
-      
-      // finally, join the threads
-      for ( int s = 0 ; s < new_threads ; s++ )
-      {
-        t[s]->join() ;
-        delete t[s] ;
-      }
-    }
-    else // if we're single-threaded we call func in this thread
-    {
-      func ( data , shape_type() ) ;
-    }
-  }
-
-  /// next we have a few methods for applying point functors to a MultiArrayView.
-  /// this is done by building a suitable chunk_func from a point functor and an
-  /// 'applicator' routine, and then passing this chunk_func to run() above, to
-  /// have it applied in multiple threads. First we define the mechanism for
-  /// point functors operating on value_type:
-  ///
-  /// callable point_func 'treats' a value_type passed in by reference
-
-  typedef std::function < void ( value_type & ) > point_func ;
-
-  /// apply is a helper function that applies 'f' to each value in 'data'.
-  /// By binding it's first agument to a specific point function, we receive
-  /// a functor that satisfies the chunk_func signature.
-
-  static void apply ( point_func f , view_type & data , shape_type offset )
-  {
-    for ( auto& value : data )
-      f ( value ) ;
-  }
-
-  /// overload of 'run' applying a point function to all values in all chunks
-  /// this is where the chunk_func needed for run() above is built and used
-
-  static void run ( view_type & data ,   // array of data to divide (into chunks)
-                    point_func func ,    // point functor to apply to each value
-                    int nthreads = ncores )
-  {
-    using namespace std::placeholders ;
-
-    // we use bind to make a chunk_func by binding the point function argument
-    // to apply() to 'func'. The resulting two-argument functor satisfies the
-    // chunk_func signature and can be passed to run() above
-
-    chunk_func treat = std::bind ( apply ,    // use apply to apply
-                                   func ,     // this point function
-                                   _1 ,       // to this chunk of data
-                                   _2 ) ;     // located at this offset inside 'data'
-
-    // the resulting functor is passed to the version of run() taking a chunk_func
-
-    run ( data , treat , -1 , nthreads ) ; // delegate processing to run variant above
-  }
-  
-#ifdef USE_VC
-
-  typedef typename vector_traits < ele_type > :: type value_v ;
-  enum { vsize = value_v::Size } ;
-
-  /// now we repeat the pattern for vectorized operation.
-  /// callable v_point_func 'treats' a value_v passed in by reference.
-
-  typedef std::function < void ( value_v & ) > v_point_func ;
-  
-  /// again we have a helper function. We'll use bind on the first
-  /// argument to generate a functor satisfying the chunk_func signature.
-  /// vapply reads individual elements into a vector-sized buffer and calls
-  /// the vectorized point function vf on them.
-  
-  static void vapply ( v_point_func vf ,
-                       view_type & data ,
-                       shape_type offset )
-  {
-    int aggregates = data.elementCount() / vsize ;             // number of full vectors
-    int leftovers = data.elementCount() - aggregates * vsize ; // and of any leftover single values
-    value_v buffer ;
-    auto it = data.begin() ;
-
-    if ( data.isUnstrided() )
-    {
-      typedef typename vector_traits < int , vsize > :: type ic_v ;
-      typedef typename vigra::TinyVector < ic_v , channels > :: type mc_ic_v ;
-
-      // if data is unstrided, we can process it's contents by gather/scatter
-      // operations to vectors which we process with the functor vf. This is
-      // about as fast as we can go.
-
-      mc_ic_v mc_interleave ;
-      for ( int ch = 0 ; ch < channels ; ch++ )
-      {
-        mc_interleave[ch] = ic_v::IndexesFromZero() ;
-        mc_interleave[ch] *= channels ;
-        mc_interleave[ch] += ch ;
-      }
-
-      value_type * destination = data.data() ;
-
-      for ( int a = 0 ; a < aggregates ; a++ )
-      {
-        for ( int ch = 0 ; ch  < channels ; ch++ )
-        {
-          buffer.gather ( (ele_type*) destination , mc_interleave [ ch ] ) ;
-          vf ( buffer ) ;
-          buffer.scatter ( (ele_type*) destination , mc_interleave [ ch ] ) ;
-        }
-        destination += vsize ;
-      }
-      it += aggregates * vsize ;
-    }
-    else
-    {
-      // if the data are strided, we 'manually' fill a vector at a time
-      for ( int a = 0 ; a < aggregates ; a++ )
-      {
-        for ( int ch = 0 ; ch < channels ; ch++ )
-        {
-          for ( int e = 0 ; e < vsize ; e++ )            // fill vector
-            buffer[e] = it[e][ch] ;
-          vf ( buffer ) ;                                // call functor
-          for ( int e = 0 ; e < vsize ; e++ )
-            it[e][ch] = buffer[e] ;                      // unpack vector
-        }
-        it += vsize ;
-      }
-    }
-
-    // process leftovers, if any
-
-    if ( leftovers )
-    {
-      for ( int ch = 0 ; ch < channels ; ch++ )
-      {
-        int e ;
-        for ( e = 0 ; e < leftovers ; e++ )
-          buffer[e] = it[e][ch] ;          // fill in leftover values
-
-        for ( ; e < vsize ; e++ )          // just in case no aggregates were processed previously:
-          buffer[e] = it[0][ch] ;          // fill buffer up with suitable values
-
-        vf ( buffer ) ;                    // process buffer
-
-        for ( e = 0 ; e < leftovers ; e++ )
-          it[e][ch] = buffer[e] ;          // write back processed values
-      }
-    }
-  }
-
-  /// overload of 'run' applying a vectorized point function to all elements in all chunks
-
-  static void run ( view_type & data ,   // array of data to divide (into chunks)
-                    v_point_func vfunc , // vectorized point functor
-                    int nthreads = ncores )
-  {
-    using namespace std::placeholders ;
-
-    // again we use bind to create a functor satisfying the chunk_func signature
-    // by using bind - this time on vapply, where we bind the first two arguments
-    // to 'vfunc' and 'func'.
-
-    chunk_func treat = std::bind ( vapply ,   // use vapply to apply
-                                   vfunc ,    // this vectorized point functor
-                                   _1 ,       // to this chunk of data
-                                   _2 ) ;     // which is at this offset inside 'data'
-
-    // the resulting functor is delegated to the version of run() taking a chunk_func
-
-    run ( data , treat , -1 , nthreads ) ;
-  }
-  
-
-#endif
-
-} ;
-
-/// divide_and_conquer_2 works just like divide_and_conquer, but operates on
-/// two arrays synchronously.
-
-template < class view_type ,   // type of first array
-           class view_type_2 > // type of second array
-struct divide_and_conquer_2
-{
-  enum { dim = view_type::actual_dimension } ;
-  typedef typename view_type::difference_type shape_type ;
-
-  // run() takes two MultiArrayViews of the same shape, splits them
-  // in the same way, and applies a functor to each resulting pair of chunks. This is
-  // needed for functions like remap().
-  // TODO: we might even formulate a general manifold by accepting a vector of arrays,
-  // but currently there is no code in vspline which uses more than two views together.
-
-  typedef std::function < void ( view_type & ,   // chunk of data to process
-                                 view_type_2 & , // chunk of data2 to process
-                                 shape_type )    // offset of chunk in 'whole' data set
-                        > chunk_func_2 ;
-
-  static void run ( view_type & data ,   // array of data to divide (into chunks)
-                    view_type_2 & data2 , // second array to coprocess with data
-                    chunk_func_2 func ,  // functor to apply to each pair of chunks
-                    int forbid = -1 ,    // axis that should not be split (default: no restriction)
-                    int nthreads = ncores )
-  {
-    if ( view_type_2::actual_dimension != dim )
-      throw ( dimension_mismatch ( "both views must have the same dimension" ) ) ;
-
-    // make sure that data and data2 are shape-compatible
-    // we silently assume that we can coiterate both arrays in scan order.
-    if ( data.shape() != data2.shape() )
-    {
-      throw shape_mismatch ( "both views must have the same shape" ) ;
-    }
-
-    if ( nthreads > 1 )
-    {
-      // we split both arrays into equal chunks
-      // to coprocess them by one thread each
-
-      std::vector<view_type> dn ; // space for chunks of data
-      nthreads = split_array_to_chunks <view_type> ( data , dn , nthreads , forbid ) ;
-
-      std::vector<view_type_2> d2n ; // space for chunks of data2
-      nthreads = split_array_to_chunks <view_type_2> ( data2 , d2n , nthreads , forbid ) ;
-
-      int new_threads = nthreads - 1 ;
-      
-      std::thread * t[new_threads] ;
-      shape_type offset ;
-
-      int s ;
-      for ( s = 0 ; s < new_threads ; s++ )
-      {
-        t[s] = new std::thread ( func , std::ref(dn[s]) , std::ref(d2n[s]), offset ) ;
-        for ( int d = 0 ; d < dim ; d ++ )
-        {
-          if ( dn[s].shape ( d ) != data.shape(d) )
-            offset[d] += dn[s].shape(d) ;
-        }
-      }
-      // process the last chunk in this thread
-      func ( std::ref(dn[s]) , std::ref(d2n[s]), offset ) ;
-      for ( int s = 0 ; s < new_threads ; s++ )
-      {
-        t[s]->join() ;
-        delete t[s] ;
-      }
-    }
-    else // if we're single-threaded we call func in this thread
-    {
-      func ( data , data2 , shape_type() ) ;
-    }
-  }
-} ;
-
-/// divide_and_conquer_3 works just like divide_and_conquer_2, but operates on
-/// three arrays synchronously.
-
-template < class view_type ,   // type of first array
-           class view_type_2 , // type of second array
-           class view_type_3 >
-struct divide_and_conquer_3
-{
-  enum { dim = view_type::actual_dimension } ;
-  typedef typename view_type::difference_type shape_type ;
-
-  // run() takes two MultiArrayViews of the same shape, splits them
-  // in the same way, and applies a functor to each resulting pair of chunks. This is
-  // needed for functions like remap().
-  // TODO: we might even formulate a general manifold by accepting a vector of arrays,
-  // but currently there is no code in vspline which uses more than two views together.
-
-  typedef std::function < void ( view_type & ,   // chunk of data to process
-                                 view_type_2 & , // chunk of data2 to process
-                                 view_type_3 & , // chunk of data3 to process
-                                 shape_type )    // offset of chunk in 'whole' data set
-                        > chunk_func_3 ;
-
-  static void run ( view_type & data ,   // array of data to divide (into chunks)
-                    view_type_2 & data2 , // second array to coprocess with data
-                    view_type_3 & data3 , // third array to coprocess with data
-                    chunk_func_3 func ,  // functor to apply to each pair of chunks
-                    int forbid = -1 ,    // axis that should not be split (default: no restriction)
-                    int nthreads = ncores )
-  {
-    if (    view_type_2::actual_dimension != dim
-         || view_type_3::actual_dimension != dim
-       )
-      throw ( dimension_mismatch ( "all views must have the same dimension" ) ) ;
-
-    // make sure that data, data2 and data3 are shape-compatible
-    // we silently assume that we can coiterate both arrays in scan order.
-    if (    data.shape() != data2.shape()
-         || data.shape() != data3.shape() )
-    {
-      throw shape_mismatch ( "all views must have the same shape" ) ;
-    }
-
-    if ( nthreads > 1 )
-    {
-      // we split both arrays into equal chunks
-      // to coprocess them by one thread each
-
-      std::vector<view_type> dn ; // space for chunks of data
-      nthreads = split_array_to_chunks <view_type> ( data , dn , nthreads , forbid ) ;
-
-      std::vector<view_type_2> d2n ; // space for chunks of data2
-      nthreads = split_array_to_chunks <view_type_2> ( data2 , d2n , nthreads , forbid ) ;
-
-      std::vector<view_type_3> d3n ; // space for chunks of data3
-      nthreads = split_array_to_chunks <view_type_3> ( data3 , d3n , nthreads , forbid ) ;
-
-      int new_threads = nthreads - 1 ;
-      
-      std::thread * t[new_threads] ;
-      shape_type offset ;
-
-      int s ;
-      for ( s = 0 ; s < new_threads ; s++ )
-      {
-        t[s] = new std::thread ( func ,
-                                 std::ref(dn[s]) , std::ref(d2n[s]), std::ref(d3n[s]) ,
-                                 offset ) ;
-        for ( int d = 0 ; d < dim ; d ++ )
-        {
-          if ( dn[s].shape ( d ) != data.shape(d) )
-            offset[d] += dn[s].shape(d) ;
-        }
-      }
-      // process the last chunk in this thread
-      func ( std::ref(dn[s]) , std::ref(d2n[s]) , std::ref(d3n[s]), offset ) ;
-      for ( int s = 0 ; s < new_threads ; s++ )
-      {
-        t[s]->join() ;
-        delete t[s] ;
-      }
-    }
-    else // if we're single-threaded we call func in this thread
-    {
-      func ( data , data2 , data3 , shape_type() ) ;
-    }
-  }
-} ;
-
 } ; // end of namespace vspline
 
 #endif // VSPLINE_COMMON
\ No newline at end of file
diff --git a/filter.h b/filter.h
index e784109..6200ba8 100644
--- a/filter.h
+++ b/filter.h
@@ -1359,6 +1359,7 @@ public:
   // multithread() expects a std::function, so here we fix the type of
   // std::function we want to pass to it, create an instance of the std::function
   // and assign the routine appropriate to the task:
+  // TODO: I'd like to pass the MultiArrayViews per reference, just for consistency.
   
   typedef
   std::function < void ( range_t ,
@@ -1373,19 +1374,28 @@ public:
                          
   filter_functor_type filter_functor ;
   
+// #ifdef USE_VC
+// 
+//   if ( use_vc )
+//     filter_functor = aggregating_filter < input_array_type , output_array_type > ;
+//   else
+//     filter_functor = nonaggregating_filter < input_array_type , output_array_type > ;
+// 
+// #else
+//   
+//   filter_functor = nonaggregating_filter < input_array_type , output_array_type > ;
+//   
+// #endif
+
+  auto pf = & nonaggregating_filter < input_array_type , output_array_type > ;
+  
 #ifdef USE_VC
 
   if ( use_vc )
-    filter_functor = aggregating_filter < input_array_type , output_array_type > ;
-  else
-    filter_functor = nonaggregating_filter < input_array_type , output_array_type > ;
+    pf = &aggregating_filter < input_array_type , output_array_type > ;
 
-#else
-  
-  filter_functor = nonaggregating_filter < input_array_type , output_array_type > ;
-  
 #endif
-
+  
   // obtain a partitioning of the data array into subranges. We do this 'manually' here
   // because we must instruct shape_splitter not to chop up the current processing axis
   // (by passing the 'forbid' parameter, which is taken as -1 (no effect) in the default
@@ -1393,7 +1403,7 @@ public:
 
   partition_t partitioning = shape_splitter < dim > :: part ( input.shape() , nslices , axis ) ;
   
-  multithread ( filter_functor ,
+  multithread ( pf ,
                 partitioning ,
                 input ,
                 output ,
diff --git a/multithread.h b/multithread.h
index 00bff78..96abd37 100644
--- a/multithread.h
+++ b/multithread.h
@@ -51,7 +51,7 @@
 /// range_type merely captures the concept of a range, taking 'limit_type' as it's
 /// template parameter, so that any type of range can be accomodated.
 /// Next we define an object holding a set of ranges, modeling a partitioning of
-/// an 'original' range into subranges, which, withing the context of this code,
+/// an 'original' range into subranges, which, within the context of this code,
 /// are disparate and in sequence. This object is modeled as struct partition_type,
 /// taking a range_type as it's template argument.
 /// With these types, we model concrete ranges and partitionings. The most important
@@ -283,9 +283,53 @@ partition_type < shape_range_type<d> > partition ( shape_range_type<d> range , i
 /// process. The situation inside vspline where this is necessary is in the prefiltering
 /// code, where the partitioning has to avoid splitting up the current processing axis,
 /// passing the exis as additional parameter 'forbid' to class shape_splitter.
+// TODO I can't yet pass references 'through' the multithreading mechanism.
+
+// template < class range_type , class ...Types >
+// int multithread ( std::function < void ( range_type , Types... ) > func ,
+//                   partition_type < range_type > partitioning ,
+//                   Types ...args  )
+// {
+//   // get the number of ranges in the partitioning
+// 
+//   int nparts = partitioning.size() ;
+//   
+//   assert ( nparts > 0 ) ; // make sure, for now, that there is at least one range
+// 
+//   // set up thread objects for the additional threads we'll use (if any)
+// 
+//   int new_threads = nparts - 1 ;
+//   std::thread * t[new_threads] ;
+//   
+//   // start the additional threads (if any), passing func as the thread's functional,
+//   // followed by the range over which func is meant to operate, followed by any other
+//   // arguments passed to multithread as parameter pack 'args'
+// 
+//   for ( int s = 0 ; s < new_threads ; s++ )
+//   {
+//     t[s] = new std::thread ( func , partitioning[s] , args... ) ;
+//   }
+// 
+//   // do the last partial job in this very thread. this may be the only job,
+//   // if partitioning only contains a single range
+// 
+//   func ( partitioning [ new_threads ] , args... ) ;
+//   
+//   // join the additional threads (if any) and delete their thread objects
+// 
+//   for ( int s = 0 ; s < new_threads ; s++ )
+//   {
+//     t[s]->join() ;
+//     delete t[s] ;
+//   }
+// 
+//   // and return the number of threads which was used.
+// 
+//   return nparts ;
+// }
 
 template < class range_type , class ...Types >
-int multithread ( std::function < void ( range_type , Types... ) > func ,
+int multithread ( void (*pfunc) ( range_type , Types... ) ,
                   partition_type < range_type > partitioning ,
                   Types ...args  )
 {
@@ -306,13 +350,13 @@ int multithread ( std::function < void ( range_type , Types... ) > func ,
 
   for ( int s = 0 ; s < new_threads ; s++ )
   {
-    t[s] = new std::thread ( func , partitioning[s] , args... ) ;
+    t[s] = new std::thread ( pfunc , partitioning[s] , args... ) ;
   }
 
   // do the last partial job in this very thread. this may be the only job,
   // if partitioning only contains a single range
 
-  func ( partitioning [ new_threads ] , args... ) ;
+  pfunc ( partitioning [ new_threads ] , args... ) ;
   
   // join the additional threads (if any) and delete their thread objects
 
@@ -333,8 +377,24 @@ int multithread ( std::function < void ( range_type , Types... ) > func ,
 /// subranges. If the calling code doesn't to influence the partitioning process, this
 /// is the variant to use.
 
+// template < class range_type , class ...Types >
+// int multithread2 ( std::function < void ( range_type , Types... ) > func ,
+//                   int nparts ,
+//                   range_type range ,
+//                   Types ...args  )
+// {
+//   // partition the range and update nparts in case the partitioning produced
+//   // less ranges than we initially asked for. Rely on the presence of a function
+//   // partition() which can process range_type:
+// 
+//   partition_type < range_type > partitioning = partition ( range , nparts ) ;
+//   nparts = partitioning.size() ;
+// 
+//   return multithread ( func , partitioning , args... ) ;
+// }
+
 template < class range_type , class ...Types >
-int multithread ( std::function < void ( range_type , Types... ) > func ,
+int multithread ( void (*pfunc) ( range_type , Types... ) ,
                   int nparts ,
                   range_type range ,
                   Types ...args  )
@@ -345,8 +405,8 @@ int multithread ( std::function < void ( range_type , Types... ) > func ,
 
   partition_type < range_type > partitioning = partition ( range , nparts ) ;
   nparts = partitioning.size() ;
-
-  return multithread ( func , partitioning , args... ) ;
+  
+  return multithread ( pfunc , partitioning , args... ) ;
 }
 
 } ; // end if namespace vspline
diff --git a/remap.h b/remap.h
index f32e624..fdb5e29 100644
--- a/remap.h
+++ b/remap.h
@@ -86,6 +86,7 @@
 #ifndef VSPLINE_REMAP_H
 #define VSPLINE_REMAP_H
 
+#include "multithread.h"
 #include "eval.h"
 
 namespace vspline {
@@ -114,14 +115,6 @@ struct coordinate_traits < vigra::TinyVector < T , N > >
   typedef typename vigra::TinyVector < T , N >::value_type rc_type ;
 } ;
 
-// TODO: I'd like to write it like this, but it won't compile:
-
-// template < typename T , int N >
-// struct coordinate_traits < vigra::TinyVector < T , N > >
-// {
-//   typedef vigra::TinyVector < T , N >::value_type rc_type ;
-// } ;
-
 /// since remap can also operate with pre-split coordinates, we need another partial
 /// specialization of coordinate_traits for split_type, which also defines a value_type
 
@@ -131,29 +124,50 @@ struct coordinate_traits < split_type < N , IT , FT > >
   typedef typename split_type < N , IT , FT >::value_type rc_type ;
 } ;
 
+/// st_remap is the single-thread routine to perform a remap with a given warp array
+/// _warp (containing coordinates) into a target array _output, which takes the interpolated
+/// values at the locations the coordinates indicate, using a bspline as interpolator.
+/// While calling this function directly is entirely possible, code wanting to perform
+/// a remap would rather call the next function which partitions the task at hand and
+/// uses a separate thread for each partial job.
+/// The first parameter, range, specifies a subarray of _warp and _output which this
+/// routine is meant to process. this may be the full range, but normally it will cover
+/// only equally sized parts of these arrays, where the precise extent is determined
+/// by multithred() or it's caller in the next routine.
+/// Note how I pass in several pointers. Normally I'd pass such data by reference,
+/// but I can't do this here (or haven't found a way yet) because of my parameter-handling
+/// in multithread() which I couldn't get to work with references. The effect is the same.
+
 template < class value_type ,      // like, float, double, complex<>, pixels, TinyVectors etc.
            class coordinate_type , // usually TinyVector of real for coordinates
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
-int st_remap ( const bspline < value_type , dim_in > & bspl ,
-               MultiArrayView < dim_out , coordinate_type > warp ,
-               MultiArrayView < dim_out , value_type > output ,
-               bool use_vc = true
-           )
+void st_remap ( shape_range_type < dim_out >                 range ,
+                const bspline < value_type , dim_in >        *p_bspl ,
+                MultiArrayView < dim_out , coordinate_type > *p_warp ,
+                MultiArrayView < dim_out , value_type >      *p_output ,
+                bool use_vc = true
+              )
 {
   typedef bspline < value_type , dim_in > spline_type ;
   typedef typename coordinate_traits < coordinate_type > :: rc_type rc_type ;
   typedef evaluator < dim_in , value_type , rc_type , int > evaluator_type ;
   typedef typename evaluator_type::ele_type ele_type ;
+  
+  // pick out the subarrays specified by 'range'
+  // this idiom reoccurs throughout, since the multithreading code I use does not
+  // actually partition the containers it handles, but instead only creates information
+  // on how to partition them, and then passes this information to the functionals
+  // used for starting the threads. While this results in slight bloat of the single
+  // thread code, it makes the calling side much more legible and requires no use of
+  // bind, tuples or the likes.
+
+  auto warp = p_warp->subarray ( range[0] , range[1] ) ;
+  auto output = p_output->subarray ( range[0] , range[1] ) ;
+  
+  // create an evaluator from the b-spline
 
-  // do a bit of compatibility checking
-
-  if ( output.shape() != warp.shape() )
-  {
-    throw shape_mismatch ( "the shapes of the warp array and the output array do not match" ) ;
-  }
-
-  evaluator_type ev ( bspl ) ;
+  evaluator_type ev ( *p_bspl ) ;
   
 #ifdef USE_VC
 
@@ -264,48 +278,47 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
     ++source_it ;
     ++target_it ;
   }
-
-  return 0 ;
 }
 
 /// this remap variant performs the remap with several threads. The routine above is single-threaded,
 /// but all individual evaluations are independent of each other. So making it multi-threaded
-/// is trivial: just split the warp array and target array n-ways and call the single-threaded
-/// version on these subarrays. We have utility code to perform the splitting of the arrays
-/// and processing the chunks in separate threads, namely divide_and_conquer_2.
+/// is trivial: just split the warp array and target array n-ways and have each thread process
+/// a chunk. Initially I used this approach directly, but later I switched to multithreading code
+/// which doesn't actually touch the containers, but instead only passes partitioning information
+/// to the individual threads, leaving it to them to split the containers accordingly, which
+/// results in simple and transparent code with only few limitations.
 
 template < class value_type ,      // like, float, double, complex<>, pixels, TinyVectors etc.
            class coordinate_type , // usually float for coordinates
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
-int remap ( const bspline < value_type , dim_in > & bspl ,
-            MultiArrayView < dim_out , coordinate_type > warp ,
-            MultiArrayView < dim_out , value_type > output ,
-            bool use_vc = true ,
-            int nthreads = ncores
+void remap ( const bspline < value_type , dim_in > & bspl ,
+             MultiArrayView < dim_out , coordinate_type > & warp ,
+             MultiArrayView < dim_out , value_type > & output ,
+             bool use_vc = true ,
+             int nthreads = ncores
            )
 {
-  using namespace std::placeholders ;
-
-  // we use bind to bind the first and last argument of the single-threaded
-  // remap function, leaving two free arguments which will accept the two arrays
-  // of data to coiterate over (being the chunks of warp and output which are created
-  // by divide_and_conquer_2)
-
-  auto chunk_func_2
-  = std::bind ( st_remap < value_type , coordinate_type , dim_in , dim_out > , // single-threaded remap
-                std::ref(bspl) ,     // bspline passed in above
-                _1 ,       // placeholders to accept data chunks
-                _2 ,       // from divide_and_conquer
-                use_vc ) ; // use_vc flag as passed in above
-
-  divide_and_conquer_2 < decltype ( warp ) ,    // run divide_and_conquer_2
-                         decltype ( output ) >
-  :: run ( warp ,                          // on the warp array
-           output ,                        // and the output array
-           chunk_func_2 ,                  // applying the functor we've made with bind
-           -1 ,                            // allowing array splitting along any axis
-           nthreads ) ;                    // using nthreads threads
+  // check shape compatibility
+  
+  if ( output.shape() != warp.shape() )
+  {
+    throw shape_mismatch ( "remap: the shapes of the warp array and the output array do not match" ) ;
+  }
+
+  // set up 'range' to cover a complete array of output's size
+  // (or, warp's size, which is the same)
+  
+  shape_range_type < dim_out > range ( shape_type < dim_out > () , output.shape() ) ;
+  
+  // call multithread(), specifying the single-threaded remap routine as the functor
+  // to invoke the threads, passing in 'range', which will be partitioned by multithread(),
+  // followed by all the other parameters the single-threaded remap needs, which is
+  // pretty much the set of parameters we've received here, with the subtle difference
+  // that we can't pass anything on by reference and use pointers instead.
+
+  multithread ( & st_remap < value_type , coordinate_type , dim_in , dim_out > ,
+                 nthreads , range , &bspl , &warp , &output , use_vc ) ;
 } ;
 
 /// This is a variant of remap, which directly takes an array of values and remaps it,
@@ -333,16 +346,17 @@ int remap ( MultiArrayView < dim_in , value_type > input ,
   
   if ( output.shape() != warp.shape() )
   {
-    throw shape_mismatch ( "the shapes of the warp array and the output array do not match" ) ;
+    throw shape_mismatch ( "remap: the shapes of the warp array and the output array do not match" ) ;
   }
 
   // create the bspline object
   // TODO may want to specify tolerance here instead of using default
+  
   bspline < value_type , dim_in > bsp ( input.shape() , degree , bcv ) ;
-  // copy in the data
-  bsp.core = input ;
-  // prefilter
-  bsp.prefilter ( use_vc , nthreads ) ;
+  
+  // prefilter, taking data in 'input' as knot point data
+  
+  bsp.prefilter ( use_vc , nthreads , input ) ;
 
   // and call the remap variant taking a bspline object,
   // passing in the spline, the coordinate array and the target array
@@ -358,6 +372,7 @@ int remap ( MultiArrayView < dim_in , value_type > input ,
 /// 'manually' de/interleave them. In my tests, this improved performance in unvectorized code,
 /// while for vectorized code it made no noticeable difference.
 // TODO: runs, but needs more testing
+// TODO: we want an equivalent processing offset/tune pairs which would be even more compact
 
 template < class ic_type , class rc_type , int dimension , int coordinate_dimension >
 class warper
@@ -377,14 +392,17 @@ public:
 
   typedef typename vigra::MultiArrayShape < dimension > :: type shape_type ;
   
+  shape_type shape ;
+
 public:
   
   /// constructor taking a shape argument. This creates the containers for integral and
   /// remainder parts of the coordinates internally.
 
-  warper ( const shape_type & shape )
-  : _select ( shape ) ,
-    _tune ( shape )
+  warper ( const shape_type & _shape )
+  : shape ( _shape ) ,
+    _select ( _shape ) ,
+    _tune ( _shape )
   {
     select = _select ;
     tune = _tune ;
@@ -409,10 +427,11 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            class nd_rc_type ,
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
-int st_wremap ( const bspline < value_type , dim_in > & bspl ,
-                MultiArrayView < dim_out , nd_ic_type > select ,
-                MultiArrayView < dim_out , nd_rc_type > tune ,
-                MultiArrayView < dim_out , value_type > output ,
+void st_wremap ( shape_range_type < dim_out > range ,
+                const bspline < value_type , dim_in > * bspl ,
+                MultiArrayView < dim_out , nd_ic_type > _select ,
+                MultiArrayView < dim_out , nd_rc_type > _tune ,
+                MultiArrayView < dim_out , value_type > _output ,
                 bool use_vc = true
            )
 {
@@ -422,7 +441,11 @@ int st_wremap ( const bspline < value_type , dim_in > & bspl ,
   typedef evaluator < dim_in , value_type , rc_type , ic_type > evaluator_type ;
   typedef typename evaluator_type::ele_type ele_type ;
 
-  evaluator_type ev ( bspl ) ;
+  auto select = _select.subarray ( range[0] , range[1] ) ;
+  auto tune = _tune.subarray ( range[0] , range[1] ) ;
+  auto output = _output.subarray ( range[0] , range[1] ) ;
+  
+  evaluator_type ev ( *bspl ) ;
   
 #ifdef USE_VC
 
@@ -492,8 +515,6 @@ int st_wremap ( const bspline < value_type , dim_in > & bspl ,
     ++tune_it ;
     ++target_it ;
   }
-
-  return 0 ;
 }
 
 /// overload of remap() using presplit coordinates packaged in a warper object
@@ -503,41 +524,25 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            class rc_type ,         // real coordinate part
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
-int remap ( const bspline < value_type , dim_in > & bspl ,
+void remap ( const bspline < value_type , dim_in > & bspl ,
             warper < ic_type , rc_type , dim_out , dim_in > & warp ,
             MultiArrayView < dim_out , value_type > output ,
             bool use_vc = true ,
             int nthreads = ncores
            )
 {
-  using namespace std::placeholders ;
-  
   typedef warper < ic_type , rc_type , dim_out , dim_in > warper_type ;
   typedef typename warper_type::nd_ic_type nd_ic_type ;
   typedef typename warper_type::nd_rc_type nd_rc_type ;
 
-  // we use bind to bind the first and last argument of the single-threaded
-  // remap function, leaving two free arguments which will accept the two arrays
-  // of data to coiterate over (being the chunks of warp and output which are created
-  // by divide_and_conquer_2)
-
-  auto chunk_func_3
-  = std::bind ( st_wremap < value_type , nd_ic_type , nd_rc_type , dim_in , dim_out > ,
-                std::ref(bspl) ,     // bspline passed in above
-                _1 ,       // placeholders to accept data chunks
-                _2 ,       // from divide_and_conquer
-                _3 ,
-                use_vc ) ; // use_vc flag as passed in above
-
-  divide_and_conquer_3 < decltype ( warp.select ) ,
-                         decltype ( warp.tune ) ,
-                         decltype ( output ) >
-  :: run ( warp.select ,                   // on the warp array
-           warp.tune ,
-           output ,                        // and the output array
-           chunk_func_3 ,                  // applying the functor we've made with bind
-           -1 ,                            // allowing array splitting along any axis
-           nthreads ) ;                    // using nthreads threads
+  if ( output.shape() != warp.shape )
+  {
+    throw shape_mismatch ( "remap: the shapes of the warper and the output array do not match" ) ;
+  }
+  shape_range_type < dim_out > range ( shape_type < dim_out > () , output.shape() ) ;
+  
+  multithread ( & st_wremap < value_type , nd_ic_type , nd_rc_type , dim_in , dim_out > ,
+                 nthreads , range , &bspl , warp.select , warp.tune , output , use_vc ) ;
 } ;
 
 // Next we have some collateral code to get ready for the transformation-based remap
@@ -787,7 +792,6 @@ public:
       c[0] ( mask ) -= shape[0] ; // fold back to range
       for ( int d = 1 ; d < dimension ; d++ )
       {
-//         c[d] ( mask ) ++ ; // increase next-higher dimension's index
         c[d] ( mask ) += 1 ; // increase next-higher dimension's index
         mask = ( c[d] >= end[d] ) ; // check for range overflow
         // resultant mask is either empty or the same as before
@@ -825,17 +829,20 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            class rc_type ,         // float or double, for coordinates
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
-int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
-                  transformation < value_type , rc_type , dim_in , dim_out > tf ,
-                  MultiArrayView < dim_out , value_type > output ,
-                  typename MultiArrayView < dim_out , value_type > :: difference_type offset ,
-                  bool use_vc = true
-               )
+void st_tf_remap ( shape_range_type < dim_out > range ,
+                   const bspline < value_type , dim_in > * bspl ,
+                   transformation < value_type , rc_type , dim_in , dim_out > tf ,
+                   MultiArrayView < dim_out , value_type > _output ,
+                   bool use_vc = true
+                 )
 {
   typedef bspline < value_type , dim_in > spline_type ;
   typedef evaluator < dim_in , value_type , rc_type , int > evaluator_type ;
   typedef typename evaluator_type::ele_type ele_type ;
-  evaluator_type ev ( bspl ) ;
+  
+  auto output = _output.subarray ( range[0] , range[1] ) ;
+  auto offset = range[0] ;
+  evaluator_type ev ( *bspl ) ;
 
 #ifdef USE_VC
 
@@ -907,8 +914,6 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
     ev.eval ( cs_out , target_it.get<1>() ) ;
     ++target_it ;
   }
-
-  return 0 ;
 }
 
 /// multithreaded tf_remap routine, splitting the target array to chunks.
@@ -923,33 +928,17 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            class rc_type ,         // float or double for coordinates
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
-int tf_remap ( const bspline < value_type , dim_in > & bspl ,
+void tf_remap ( const bspline < value_type , dim_in > & bspl ,
                transformation < value_type , rc_type , dim_in , dim_out > tf ,
                MultiArrayView < dim_out , value_type > output ,
                bool use_vc = true ,
                int nthreads = ncores            
              )
 {
-  using namespace std::placeholders ;
-
-  // we use bind to bind the first two and last argument of the single-threaded
-  // tf_remap function, leaving one free argument which will accept the array
-  // of data to iterate over (being the a chunk of 'output' which is created
-  // by divide_and_conquer)
-
-  auto chunk_func
-  = std::bind ( st_tf_remap < value_type , rc_type , dim_in , dim_out > , // single-threaded tf_remap
-                std::ref(bspl) ,     // bspline passed in above
-                tf ,       // transformation passed in above
-                _1 ,       // placeholders to accept data chunk
-                _2 ,       // and offset from divide_and_conquer
-                use_vc ) ; // use_vc flag as passed in above
-
-  divide_and_conquer < decltype ( output ) > // run divide_and_conquer
-  :: run ( output ,                          // on the output array
-           chunk_func ,                      // applying the functor we've made with bind
-           -1 ,                              // allowing array splitting along any axis
-           nthreads ) ;                      // using nthreads threads
+  shape_range_type < dim_out > range ( shape_type < dim_out > () , output.shape() ) ;
+  
+  multithread ( st_tf_remap < value_type , rc_type , dim_in , dim_out > ,
+                nthreads , range , &bspl , tf , output , use_vc ) ;
 }
 
 /// this highest-level transform-based remap takes an input array and creates
@@ -973,9 +962,7 @@ int tf_remap ( MultiArrayView < dim_in , value_type > input ,
   // create the bspline object
   bspline < value_type , dim_in > bsp ( input.shape() , degree , bcv ) ;
   // copy in the data
-  bsp.core = input ;
-  // prefilter
-  bsp.prefilter ( use_vc , nthreads ) ;
+  bsp.prefilter ( use_vc , nthreads , input ) ;
 
   tf_remap<value_type,rc_type,dim_in,dim_out>
            ( bsp , tf , output , use_vc , nthreads ) ;
@@ -992,14 +979,17 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            class rc_type ,         // float or double, for coordinates
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
-int st_make_warp_array ( transformation < value_type , rc_type , dim_in , dim_out > tf ,
+void st_make_warp_array ( shape_range_type < dim_out > range ,
+                         transformation < value_type , rc_type , dim_in , dim_out > tf ,
                          MultiArrayView < dim_out ,
                                           vigra::TinyVector < rc_type , dim_in > >
-                           & warp_array ,
-                         typename MultiArrayView < dim_out , value_type > :: difference_type offset ,
+                           _warp_array ,
                          bool use_vc = true
                        )
 {
+  auto warp_array = _warp_array.subarray ( range[0] , range[1] ) ;
+  auto offset = range[0] ;
+
   /// incoming coordinates (into output, which has dim_out dims)
   /// generated by vigra's CoupledIterator or, in the vectorized code,
   /// by coordinate_iterator_v
@@ -1064,8 +1054,6 @@ int st_make_warp_array ( transformation < value_type , rc_type , dim_in , dim_ou
     target_it.get<1>() = cs_out ;
     ++target_it ;
   }
-
-  return 0 ;
 }
 
 /// multithreaded code to make a warp array using a coordinate transformation.
@@ -1074,33 +1062,19 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            class rc_type ,         // float or double for coordinates
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
-int make_warp_array ( transformation < value_type , rc_type , dim_in , dim_out > tf ,
+void make_warp_array ( transformation < value_type , rc_type , dim_in , dim_out > tf ,
                       MultiArrayView < dim_out ,
                                        vigra::TinyVector < rc_type , dim_in > >
-                        warp_array ,
+                        & warp_array ,
                       bool use_vc = true ,
                       int nthreads = ncores
                     )
 {
-  using namespace std::placeholders ;
-
-  // we use bind to bind the first two and last arguments of the single-threaded
-  // function, leaving one free argument which will accept the array
-  // of data to iterate over (being the a chunk of 'output' which is created
-  // by divide_and_conquer)
-
-  auto chunk_func
-  = std::bind ( st_make_warp_array < value_type , rc_type , dim_in , dim_out > , // single-threaded make_warp_array
-                tf ,       // transformation passed in above
-                _1 ,       // placeholders to accept data chunk
-                _2 ,       // and offset from divide_and_conquer
-                use_vc ) ; // use_vc flag as passed in above
-
-  divide_and_conquer < decltype ( warp_array ) > // run divide_and_conquer
-  :: run ( warp_array  ,                        // on the output array
-           chunk_func ,                    // applying the functor we've made with bind
-           -1 ,                            // allowing array splitting along any axis
-           nthreads ) ;                    // using nthreads threads
+  shape_range_type < dim_out > range ( shape_type < dim_out > () , warp_array.shape() ) ;
+  
+  multithread ( st_make_warp_array < value_type , rc_type , dim_in , dim_out > ,
+                nthreads , range , tf , warp_array , use_vc ) ;
+  
 }
 
 } ; // end of namespace vspline

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git