[vspline] 12/72: factored out multithreading code into multithread.h, using it in filter.h Now that I have rewritten the multithreading code, I need to adapt the code which uses multithreading as well: since the single-thread code now receives a range together with the 'whole' arguments, the signatures are slightly different. In this intermediate commit, the old code in common.h (split_array_to_chunks, divide_and_conquer) is still present and used for evaluation of the spline, but the (pre)filtering code already uses the new multithreading code in multithread.h.

Kay F. Jahnke kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:38 UTC 2017


This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit a29afea2a33c7fed04ec5740a2ebd4c086a38a8a
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Sun Nov 20 11:54:15 2016 +0100

    factored out multithreading code into multithread.h, using it in filter.h
    Now that I have rewritten the multithreading code, I need to adapt the code
    which uses multithreading as well: since the single-thread code now receives
    a range together with the 'whole' arguments, the signatures are slightly
    different. In this intermediate commit, the old code in common.h
    (split_array_to_chunks, divide_and_conquer) is still present and used for
    evaluation of the spline, but the (pre)filtering code already uses the new
    multithreading code in multithread.h.
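    
    To illustrate the new calling convention, here is a minimal sketch. The
    functor 'fill_with_ones' is made up for illustration; only multithread,
    shape_range_type, shape_type and MultiArrayView::subarray are taken from
    the actual code:
    
        #include <functional>
        #include <vspline/multithread.h>
        #include <vigra/multi_array.hxx>
        
        // the single-thread workhorse now receives the range plus the 'whole'
        // array and cuts out the subarray it is meant to process itself
        void fill_with_ones ( vspline::shape_range_type<2> range ,
                              vigra::MultiArrayView < 2 , double > whole )
        {
          auto part = whole.subarray ( range[0] , range[1] ) ;
          for ( auto & v : part )
            v = 1.0 ;
        }
        
        int main()
        {
          vigra::MultiArray < 2 , double > a ( vigra::Shape2 ( 1000 , 1000 ) ) ;
          vigra::MultiArrayView < 2 , double > va ( a ) ;
          vspline::shape_range_type<2> range ( vspline::shape_type<2>() , va.shape() ) ;
          std::function < void ( vspline::shape_range_type<2> ,
                                 vigra::MultiArrayView < 2 , double > ) > f ( fill_with_ones ) ;
          vspline::multithread ( f , 4 , range , va ) ; // use up to four threads
        }
    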
---
 common.h                    |   2 +-
 example/impulse_response.cc | 461 +-------------------------------------------
 filter.h                    | 219 +++++++++++----------
 multithread.h               | 354 ++++++++++++++++++++++++++++++++++
 4 files changed, 474 insertions(+), 562 deletions(-)

diff --git a/common.h b/common.h
index b07acd9..aeb9768 100644
--- a/common.h
+++ b/common.h
@@ -43,7 +43,7 @@
 #define VSPLINE_COMMON
 
 #include <vigra/multi_array.hxx>
-#include <thread>
+#include <vspline/multithread.h>
 
 #ifdef USE_VC
 
diff --git a/example/impulse_response.cc b/example/impulse_response.cc
index 71fef16..f9a42ec 100644
--- a/example/impulse_response.cc
+++ b/example/impulse_response.cc
@@ -72,459 +72,8 @@
 
 #include <iomanip>
 #include <assert.h>
+#include <vspline/multithread.h>
 #include <vspline/vspline.h>
-#include <vigra/multi_math.hxx>
-#include <random>
-#include <tuple>
-
-// iterator_splitter will try to set up n ranges from a range. the partial
-// ranges are stored in a std::vector. The split may succeed producing n
-// or less ranges, and if iter_range can't be split at all, a single range
-// encompassing the whole of iter_range will be returned in the result vector.
-
-template < class _iterator_type >
-struct iterator_splitter
-{
-  typedef _iterator_type iterator_type ;
-  typedef vigra::TinyVector < iterator_type , 2 > range_type ;
-  typedef std::vector < range_type > partition_type ;
-
-  static partition_type part ( const range_type & iter_range ,
-                               int n )
-  {
-    std::vector < range_type > res ;
-    assert ( n > 0 ) ;
-
-    iterator_type start = iter_range [ 0 ] ;
-    iterator_type end = iter_range [ 1 ] ;
-    int size = end - start ;
-    if ( n > size )
-      n = size ;
-    
-    int chunk_size = size / n ; // will be at least 1
-    
-    for ( int i = 0 ; i < n - 1 ; i++ )
-    {
-      res.push_back ( range_type ( start , start + chunk_size ) ) ;
-      start += chunk_size ;
-    }
-    res.push_back ( range_type ( start , end ) ) ;
-    return res ;
-  }
-} ;
-
-// shape_splitter will try to split a shape into n ranges by 'chopping' it
-// along the outermost axis that can be split n-ways. The additional parameter
-// 'forbid' prevents that particular axis from being split. The split may succeed
-// producing n or less ranges, and if 'shape' can't be split at all, a single range
-// encompassing the whole of 'shape' will be returned in the result vector.
-
-template < int dim >
-struct shape_splitter
-{
-  typedef typename vigra::MultiArrayShape < dim > :: type shape_type ;
-  typedef vigra::TinyVector < shape_type , 2 > range_type ;
-  typedef std::vector < range_type > partition_type ;
-  
-  static partition_type part ( const shape_type & shape , ///< shape to be split n-ways
-                               int n ,                    ///< intended number of chunks
-                               int forbid = -1 )          ///< axis which shouldn't be split
-  {
-    partition_type res ;
-
-    // find the outermost dimension that can be split n ways, and it's extent 
-    int split_dim = -1 ;
-    int max_extent = -1 ;
-    for ( int md = dim - 1 ; md >= 0 ; md-- )
-    {
-      if (    md != forbid
-          && shape[md] > max_extent
-          && shape[md] >= n )
-      {
-        max_extent = shape[md] ;
-        split_dim = md ;
-        break ;
-      }
-    }
-    
-    // if the search did not yet succeed:
-    if ( max_extent == -1 )
-    {
-      // repeat process with relaxed conditions: now the search will also succeed
-      // if there is an axis which can be split less than n ways
-      for ( int md = dim - 1 ; md >= 0 ; md-- )
-      {
-        if (    md != forbid
-            && shape[md] > 1 )
-        {
-          max_extent = shape[md] ;
-          split_dim = md ;
-          break ;
-        }
-      }
-    }
-    
-    if ( split_dim == -1 )
-    {   
-      // we have not found a dimension for splitting. We pass back res with
-      // a range over the whole initial shape as it's sole member
-      res.push_back ( range_type ( shape_type() , shape ) ) ;
-    }
-    else
-    {
-      // we can split the shape along split_dim
-      
-      int w = shape [ split_dim ] ;  // extent of the dimension we can split
-      n = std::min ( n , w ) ;       // just in case, if that is smaller than n
-      
-      int cut [ n ] ;   // where to chop up this dimension
-      
-      for ( int i = 0 ; i < n ; i++ )
-        cut[i] = ( (i+1) * w ) / n ;   // roughly equal chunks, but certainly last cut == a.end()
-
-      shape_type start , end = shape ;
-
-      for ( int i = 0 ; i < n ; i++ )
-      {
-        end [ split_dim ] = cut [ i ];                  // apply the cut locations
-        res.push_back ( range_type ( start , end ) ) ;
-        start [ split_dim ] = end [ split_dim ] ;
-      }
-    }
-    return res ; // return the number of chunks
-  }
-} ;
-
-// // we have a std::vector of tuples of bunch_type to which we want to assign
-// // array chunks, so that the first tuple in the vector receives the first chunks
-// // of all elements of bunch, the second tuple receives the second set of chunks
-// // and so forth. We can't iterate over tuple elements in a loop (get<K> requires
-// // a const K), but we can recurse over the elements, filling the Kth member
-// // of all tuples in the target vector in every step of the recursion:
-// 
-// template < class bunch_type , int e , int dimension >
-// struct push_members
-// {
-//   static void push ( std::vector < bunch_type > & target ,
-//                      const bunch_type & bunch ,
-//                      const typename shape_splitter < dimension > :: partition_type & ranges )
-//   {
-//     // extract member e-1 from the unsplit tuple
-//     auto current = std::get<e-1> ( bunch ) ;
-// 
-//     // to perform the shape consistency check, we compare to the first member:
-//     if ( e > 1 )
-//     {
-//       const auto & first = std::get<0> ( bunch ) ; 
-//       if ( first.shape() != current.shape() )
-//         throw vspline::shape_mismatch ( "coprocess: all arrays must have the same size" ) ;
-//     }
-//     
-//     // obtain the parts by applying 'ranges'
-//     // and fill in the parts at position e-1 in every tuple in 'target'
-//     for ( int k = 0 ; k < ranges.size() ; k++ )
-//       std::get < e - 1 > ( target [ k ] ) = current.subarray ( ranges[k][0] , ranges[k][1] ) ;
-//     // and recurse to the next-lower member
-//     push_members < bunch_type , e - 1 , dimension > :: push ( target , bunch , ranges ) ;
-//   }
-// } ;
-// 
-// // terminate the recursion over e, the ordinal number of the tuple's element
-// // with a partial specialization for e == 0
-// 
-// template < class bunch_type , int dimension >
-// struct push_members < bunch_type , 0 , dimension >
-// {
-//   static void push ( ... ) { } // doesn't do anything at this level.
-// } ;
-// 
-// // using C++ magic gleaned from first answer in 
-// // http://stackoverflow.com/questions/7858817/unpacking-a-tuple-to-call-a-matching-function-pointer?noredirect=1&lq=1
-// 
-// // seq is an empty struct with a variadic template argument list of int
-// template<int ...>
-// struct seq { };
-// 
-// // level N 'gens' inherits from level N-1 'gens'
-// template<int N, int ...S>
-// struct gens : gens<N-1, N-1, S...> { };
-// 
-// // level 0 'gens' is partially specialized (N==0) and typedefs 'type'
-// // this typedef is inherited by all higher-level 'gens'
-// template<int ...S>
-// struct gens<0, S...> {
-//   typedef seq<S...> type;
-// };
-// 
-// // why all the artistry? we have our parameter sets in tuples, but the
-// // function we want to call expects individual arguments. while we might
-// // require that the function be rewritten to accept a tuple instead of
-// // individual arguments (which is trivial), being able to use the function
-// // 'as is' is more intuitive, especially if the function's definition is
-// // somewhere else and we'd have to code a wrapper for it. We use two steps
-// // to achieve the call to func:
-// //
-// // here we have the 'inner' callFunc.
-// // the actual call to func happens here: _callFunc takes func, an argument
-// // tuple and a 'seq' type 'S' with a template parameter list of ints 0...N-1
-// // with N being the number of arguments func expects.
-// // the sequence is submitted to pack expansion with a pattern which
-// // produces the tuple's kth element for each k in S. The comma-separated
-// // list of the tuple members which results is passed as argument list to func.
-// 
-// template < typename return_type , typename ...Args , int ...S >
-// return_type _callFunc ( std::function < return_type ( Args... ) > func , 
-//                        std::tuple < Args... > params ,
-//                        seq<S...> )
-// {
-//   return func ( std::get < S > ( params ) ... ) ; // expand pack params and call func
-// }
-// 
-// // the 'outer' callFunc receives func and a tuple of arguments.
-// // Here, the number of arguments is established (sizeof...(Args)) and used to
-// // create a 'seq' object, whose type reflects this number of arguments and
-// // which, by pack expansion, can produce individual arguments from the tuple
-// // in the previous routine _callFunc. func and params are passed on to the
-// // 'inner' _callFunc, together with the seq object. The inner _callFunc
-// // performs the actual function call.
-// 
-// template < typename return_type , typename ...Args >
-// return_type callFunc ( std::function < return_type ( Args... ) > func , 
-//                        std::tuple < Args... > params )
-// {
-//   return _callFunc ( func , params , typename gens<sizeof...(Args)>::type() ) ;
-// }
-// 
-// // with the infrastructure code at hand, we can proceed to formulate a generalized
-// // multithreading routine for sets of MultiArrayViews. Here we have 'coprocess'
-// // which receives a std::function taking one or several MultiArrayViews, the
-// // desired number of threads, and as many MultiArrayViews as 'func' can process.
-// // TODO: I haven't managed yet to pass the views per reference
-// // coprocess tries to create a set of tuples of the same type as the tuple
-// // which was created from the arguments passed to coprocess (bunch_type),
-// // so that each tuple in the set contains arguments for a call to func which
-// // operates on a set of subarrays of the original arguments. With this set of tuples,
-// // representing a partition of the work, the work can be delegated to several threads.
-// 
-// template<class ... Types>
-// int coprocess ( std::function < void ( Types... ) > func , int parts , Types ...args  )
-// {
-//   // we gather the arguments in a tuple
-//   auto bunch = std::make_tuple ( args... ) ;
-//   typedef decltype ( bunch ) bunch_type ;
-//   const int members = std::tuple_size < bunch_type > :: value ;
-//   
-//   auto first = std::get<0> ( bunch ) ;
-//   typedef decltype ( first ) view_type ;
-//   const int dimension = view_type::actual_dimension ;
-//   
-//   auto shape = first.shape() ;
-//   auto ranges = shape_splitter < dimension > :: part ( shape , parts ) ;
-//   parts = ranges.size() ;
-// 
-//   // set up the set of tuples containing arguments for the partitioned job,
-// 
-//   std::vector < bunch_type > partition ( parts ) ;
-//   
-//   // fill in the tuples corresponding to the ranges
-//   push_members < bunch_type , members , dimension > :: push ( partition , bunch , ranges ) ;
-// 
-//   // set up thread objects for the additional threads we'll use
-//   int new_threads = parts - 1 ;
-//   std::thread * t[new_threads] ;
-//   
-//   // start the additional threads, passing callFunc as the thread's functional,
-//   // and func and the tuple containing the partial job's arguments. the new threads
-//   // will proceed to 'explode' the tuple and feed the individual arguments to func:
-//   int s ;
-//   for ( s = 0 ; s < new_threads ; s++ )
-//   {
-//     t[s] = new std::thread ( callFunc < void , Types... > , func , partition[s] ) ;
-//   }
-//   // do the last partial - or only - job in this very thread: 
-//   callFunc ( func , partition[s] ) ;
-//   
-//   // finally, join the additional threads and delete their thread objects
-//   for ( s = 0 ; s < new_threads ; s++ )
-//   {
-//     t[s]->join() ;
-//     delete t[s] ;
-//   }
-//   return parts ;
-// }
-// 
-// // the simpler task is to formulate coprocess using 'bunched' arguments in the first
-// // place. This makes the formulation of 'func' a bit more involved, as func now has
-// // to accept a tuple as it's sole argument and unpack the tuple's contents, but the
-// // processing in this variant of 'coprocess' is much more straightforward, since the
-// // artistry needed to call func with an 'exploded' tuple is not needed here:
-// 
-// template < class bunch_type >
-// int coprocess ( std::function < void ( bunch_type ) > func , int parts , bunch_type bunch  )
-// {
-//   const int members = std::tuple_size < bunch_type > :: value ;
-//   
-//   auto first = std::get<0> ( bunch ) ;
-//   typedef decltype ( first ) view_type ;
-//   const int dimension = view_type::actual_dimension ;
-// 
-//   auto shape = first.shape() ;
-//   auto ranges = shape_splitter < dimension > :: part ( shape , parts ) ;
-//   parts = ranges.size() ;
-// 
-//   // set up the set of tuples containing arguments for the partitioned job
-//   std::vector < bunch_type > partition ( parts ) ;
-//   
-//   // fill in the tuples corresponding to the ranges
-//   push_members < bunch_type , members , dimension > :: push ( partition , bunch , ranges ) ;
-// 
-//   // set up thread objects for the additional threads we'll use (if any)
-//   int new_threads = parts - 1 ;
-//   std::thread * t[new_threads] ;
-//   
-//   // start the additional threads, passing func as the thread's functional and the tuple
-//   // containing the partial job's arguments. the new threads will proceed to pass their tuple
-//   // straight to func:
-//   int s ;
-//   for ( s = 0 ; s < new_threads ; s++ )
-//   {
-//     t[s] = new std::thread ( func , partition[s] ) ;
-//   }
-//   // do the last partial job in this very thread. this may be the only job, if push_members
-//   // has returned 1, failing to partition the job
-//   func ( partition[s] ) ;
-//   
-//   // finally, join the additional threads and delete their thread objects
-//   for ( s = 0 ; s < new_threads ; s++ )
-//   {
-//     t[s]->join() ;
-//     delete t[s] ;
-//   }
-//   // and return the number of threads which was used.
-//   return parts ;
-// }
-// 
-// // sample function to run with coprocess:
-// 
-// void twosome ( vigra::MultiArrayView < 1 , double > in1 , vigra::MultiArrayView < 1 , double > in2 )
-// {
-// //   std::cout << "shape 1 : " << in1 . shape() << std::endl ;
-// //   std::cout << "shape 2 : " << in2 . shape() << std::endl ;
-//   for ( int i = 0 ; i < in1.shape(0) ; i++ )
-//     in2[i] = in1[i] + 42.0 ;
-// }
-// 
-// // sample function to run with the bunched version of coprocess:
-// 
-// void btwosome ( std::tuple < vigra::MultiArrayView < 1 , double > ,
-//                              vigra::MultiArrayView < 1 , double > > in )
-// {
-//   for ( int i = 0 ; i < std::get<1>(in).shape(0) ; i++ )
-//     std::get<1>(in) [ i ] += std::get<0>(in) [ i ] + 43.0 ;
-// }
-
-// given limit_type, we define range_type as a TinyVector of two limit_types,
-// the first denoting the beginning of the range and the second it's end, with
-// end being outside of the range.
-
-template < class limit_type >
-using range_type = vigra::TinyVector < limit_type , 2 > ;
-
-// given range_type, we define partition_type as a std::vector of range_type.
-// This data type is used to hold the partitioning of a range into subranges.
-
-template < class range_type >
-using partition_type = std::vector < range_type > ;
-
-// given a dimension, we define a shape_type as a TinyVector of long
-// of this dimension. This is equivalent to vigra's shape type.
-
-template < int dimension >
-using shape_type = vigra::TinyVector < long , dimension > ;
-
-// given a dimension, we define shape_range_type as a range defined by
-// two shapes of the given dimension. This definition allows us to directly
-// pass the two shapes as arguments to a call of subarray() on a MultiArrayView
-// of the given dimension
-
-template < int dimension >
-using shape_range_type = range_type < shape_type < dimension > > ;
-
-// for any dimension d, we specialize function partition() for shape_range_type<d>.
-// we use the extant class shape_splitter to perform the split
-
-template < int d >
-partition_type < shape_range_type<d> > partition ( shape_range_type<d> range , int nparts )
-{
-  // perform the split, silently assuming that the range passed in starts at origin
-  // TODO while we don't ever call this specialization of partition() with a range which
-  // doensn't start at origin, we might either accomodate such use, forbid it, or rethink
-  // the logic...
-  
-  return shape_splitter < d > :: part ( range[1] , nparts ) ;
-}
-
-/// multithread is a routine to perform multithreading with functors accessing several
-/// containers, iterators etc. ('manifolds') of identical structure together.
-/// when such a functor is invoked for a specific thread, it will only process a subset
-/// of each of the original manifold objects. This 'subset' notion is codified as range_type:
-/// range_type specifies a starting point and an end point inside the manifolds defining
-/// the extent of the subset. While it is tempting to code so that the multithreading
-/// routine feeds the subsets to the individual threads, I found that passing the
-/// range to the thread, together with the original variadic arguments to multithread,
-/// produces clean and compact code with a slightly higher burden on the functor, since
-/// it has to 'cut out' the subset it is meant to process itself. Why is this better?
-/// Because it unburdens the multithreading code from knowing any specifics about the
-/// 'cutting out' procedure, and at the same time allows the functor to access the
-/// 'whole' manifold and it's metrics if this is needed.
-/// The partitioning of the incoming range is left to function 'partition' which has to
-/// be specialized for any concrete range_type used, and multithread simply relies on
-/// a specialized version being present.
-
-template < class range_type , class ...Types >
-int multithread ( std::function < void ( range_type , Types... ) > func ,
-                  int nparts ,
-                  range_type range ,
-                  Types ...args  )
-{
-  // partition the range and update nparts in case the partitioning produced
-  // less ranges than we initially asked for. Rely on the presence of a function
-  // partition() which can process range_type:
-
-  auto partitioning = partition ( range , nparts ) ;
-  nparts = partitioning.size() ;
-
-  // set up thread objects for the additional threads we'll use (if any)
-
-  int new_threads = nparts - 1 ;
-  std::thread * t[new_threads] ;
-  
-  // start the additional threads (if any), passing func as the thread's functional,
-  // followed by the range over which func is meant to operate, followed by any other
-  // arguments passed to multithread as parameter pack 'args'
-
-  for ( int s = 0 ; s < new_threads ; s++ )
-  {
-    t[s] = new std::thread ( func , partitioning[s] , args... ) ;
-  }
-
-  // do the last partial job in this very thread. this may be the only job,
-  // if partition() has failed to partition the job
-
-  func ( partitioning [ new_threads ] , args... ) ;
-  
-  // join the additional threads (if any) and delete their thread objects
-
-  for ( int s = 0 ; s < new_threads ; s++ )
-  {
-    t[s]->join() ;
-    delete t[s] ;
-  }
-
-  // and return the number of threads which was used.
-
-  return nparts ;
-}
 
 /// sample function to run with multithread
 /// for any dimension d, we define 'rtwosome' as a function taking a shape_range_type
@@ -533,7 +82,7 @@ int multithread ( std::function < void ( range_type , Types... ) > func ,
 /// the functions used with multithread might be more or less specialized.
 
 template < int dimension >
-void rtwosome ( shape_range_type<dimension> r ,
+void rtwosome ( vspline::shape_range_type<dimension> r ,
                 vigra::MultiArrayView < dimension , double > _in1 ,
                 vigra::MultiArrayView < dimension , double > _in2 )
 {
@@ -627,9 +176,9 @@ int main ( int argc , char * argv[] )
 //   auto t = std::make_tuple ( va , vb ) ;
 //   coprocess ( bfun , 4 , t ) ;
   
-  shape_range_type<1> range ( shape_type() , va.shape() ) ;
-  std::function < void ( shape_range_type<1> , view_type , view_type ) > rfun ( rtwosome<1> ) ;
-  multithread ( rfun , 4 , range , va , vb ) ;
+  vspline::shape_range_type<1> range ( shape_type(3) , va.shape() ) ;
+  std::function < void ( vspline::shape_range_type<1> , view_type , view_type ) > rfun ( rtwosome<1> ) ;
+  vspline::multithread ( rfun , 4 , range , va , vb ) ;
   
   for ( auto v : vb )
     std::cout << v << std::endl ;
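
Note that the range above now starts at shape_type(3) rather than at the origin,
which exercises the offset handling of the new partition() overload in
multithread.h. A minimal sketch of what such a partitioning looks like (range
length and number of parts picked arbitrarily for this example):

    #include <iostream>
    #include <vspline/multithread.h>

    int main()
    {
      // a 1D range from 3 to 103, not starting at the origin
      vspline::shape_range_type<1> range ( vspline::shape_type<1> ( 3 ) ,
                                           vspline::shape_type<1> ( 103 ) ) ;
      // the default partitioning shifts each subrange by the range's start,
      // yielding the subranges [3,28), [28,53), [53,78), [78,103) for four parts
      for ( auto const & r : vspline::partition ( range , 4 ) )
        std::cout << r[0] << " ... " << r[1] << std::endl ;
    }
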
diff --git a/filter.h b/filter.h
index 6fa0cb7..e784109 100644
--- a/filter.h
+++ b/filter.h
@@ -62,7 +62,7 @@ namespace vspline {
 
 /// overall_gain is a helper routine:
 /// Simply executing the filtering code by itself will attenuate the signal.
-/// Here we calculate the gain which, applied to the signal, will cancel this effect.
+/// Here we calculate the gain which, pre-applied to the signal, will cancel this effect.
 /// While this code was initially part of the filter's constructor, I took it out
 /// to gain some flexibility by passing in the gain as a parameter.
 
@@ -821,25 +821,33 @@ scatter ( const T* source ,
 /// While the buffering consumes some time, it saves time on the actual filter calculation,
 /// especially with higher-order filters. On my system, I found I broke even even with only
 /// one pole, so there is no special treatment here for low-order filtering (TODO confirm)
-/// TODO: what if dim == 1? should it have special treatment or is source.bindAt ( axis , 0 )
-/// safe in this case?
+/// note the use of range_type<T>, which is from multithread.h
 
 template < class source_type ,
            class target_type >
-int nonaggregating_filter ( source_type &source ,
-                            target_type &target ,
-                            int axis ,
-                            double gain ,
-                            bc_code bc ,
-                            int nbpoles ,
-                            const double * pole ,
-                            double tolerance
-                          )
+void nonaggregating_filter ( range_type < typename source_type::difference_type > range ,
+                             source_type original_source ,
+                             target_type original_target ,
+                             int axis ,
+                             double gain ,
+                             bc_code bc ,
+                             int nbpoles ,
+                             const double * pole ,
+                             double tolerance
+                           )
 {
   const int dim = source_type::actual_dimension ;
   typedef typename source_type::value_type value_type ;
   typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
 
+  // we're in the single-threaded code now. multithread() has simply forwarded
+  // the source and target MultiArrayViews and a range; here we use the range
+  // to pick out the subarrays of original_source and original_target which we
+  // are meant to process in this thread:
+
+  auto source = original_source.subarray ( range[0] , range[1] ) ;
+  auto target = original_target.subarray ( range[0] , range[1] ) ;
+
   int count = source.shape ( axis ) ; 
 
   /// we use a buffer of count value_types
@@ -892,7 +900,7 @@ int nonaggregating_filter ( source_type &source ,
 
     while ( source_sliter < source_sliter_end )
     {
-      // copy from the array to the buffer
+      // copy from the array to the buffer with a monadic gather
       
       source_index = &(*source_sliter) - source_base_adress ;
       
@@ -904,8 +912,7 @@ int nonaggregating_filter ( source_type &source ,
                               
       solver.solve ( buffer.begin() ) ;
       
-      // and perform extended scatter with extrusion parameters to write the filtered data
-      // to the destination
+      // and perform a monadic scatter to write the filtered data to the destination
 
       scatter<value_type> ( buffer_base_adress , target_base_adress + source_index  ,
                             source_stride , count ) ;
@@ -943,7 +950,6 @@ int nonaggregating_filter ( source_type &source ,
       ++target_sliter ;
     }
   }
-  return 0 ;
 }
 
 // the use of Vc has to be switched on with the flag USE_VC.
@@ -1083,42 +1089,33 @@ scatter ( const typename VT::value_type * source ,
 }
 
 /// aggregating_filter (below) keeps a buffer of vector-aligned memory, which it fills
-/// with 1D subarrays of the source array which are collinear to the processing
-/// axis, The buffer is then submitted to vectorized forward-backward recursive filtering
+/// with 1D subarrays of the source array which are collinear to the processing axis.
+/// The buffer is then submitted to vectorized forward-backward recursive filtering
 /// and finally stored back to the corresponding memory area in target, which may
 /// be the same as source, in which case the operation is seemingly performed
 /// in-place (while in fact the buffer is still used). Buffering takes the bulk
 /// of the processing time (on my system), the vectorized maths are fast by
-/// comparison. Depending on array size and spline degree, sometimes the
-/// nonvectorized code is faster. But as both grow, bufering soon comes out
-/// the winner.
+/// comparison. Depending on data type, array size and spline degree, sometimes the
+/// nonvectorized code is faster. But as both grow, buffering soon comes out on top.
 /// ele_aggregating_filter is a subroutine processing arrays of elementary value_type.
 /// It's used by aggregating_filter, either directly if the arrays' value_type is already
 /// an elementary type, or after element-expanding the array(s).
 
 template < class source_type ,
            class target_type >
-int ele_aggregating_filter ( source_type &source ,
-                             target_type &target ,
-                             int axis ,
-                             double gain ,
-                             bc_code bc ,
-                             int nbpoles ,
-                             const double * pole ,
-                             double tolerance
-                           )
+void ele_aggregating_filter ( source_type &source ,
+                              target_type &target ,
+                              int axis ,
+                              double gain ,
+                              bc_code bc ,
+                              int nbpoles ,
+                              const double * pole ,
+                              double tolerance
+                            )
 {
   typedef typename source_type::value_type ele_type ;    ///< elementary type in source
 
-  // Initially I used a plain Vc::Vector as the simdized type, but I found that using a SimdArray
-  // of twice that size performed slightly better. Here's the one place where this type can be fixed,
-  // it's choice 'trickles down' to the remainder of the code, so experimentation is cheap. vsize has
-  // to be a multiple of the genuine vector's size, though, because in gather/scatter above we explicitly
-  // use aligned load/stores to the buffer. If we use plain load/stores to the buffer there, other
-  // vsizes can be used, but performance suffers - we want to use the fastest ops possible after all.
-  // TODO: This should be tested with different systems
-  
-  // we now have a traits class fixing the simdized type (in common.h):
+  // we have a traits class fixing the simdized type (in common.h):
   
   typedef typename vector_traits < ele_type > :: type simdized_type ;
 
@@ -1263,19 +1260,28 @@ int ele_aggregating_filter ( source_type &source ,
                                target_indexes , mask , target_stride , count ) ;
     }
   }
-  return 0 ;
 }
 
+/// here we provide a common routine 'aggregating_filter', which works for elementary
+/// value_types and also for aggregate value_types. Processing is different for these
+/// two cases, because the vector code can only process elementary types, and if
+/// value_type isn't elementary, we need to element-expand the source and target
+/// arrays. Since this routine is the functor passed to multithread() and therefore
+/// receives a range parameter to pick out a subset of the data to process in the
+/// single thread, we also take the opportunity here to pick out the subarrays
+/// for further processing.
+
 template < class source_type ,
            class target_type >
-int aggregating_filter ( source_type &source ,
-                         target_type &target ,
-                         int axis ,
-                         double gain ,
-                         bc_code bc ,
-                         int nbpoles ,
-                         const double * pole ,
-                         double tolerance
+void aggregating_filter ( range_type < typename source_type::difference_type > range ,
+                          source_type original_source ,
+                          target_type original_target ,
+                          int axis ,
+                          double gain ,
+                          bc_code bc ,
+                          int nbpoles ,
+                          const double * pole ,
+                          double tolerance
                         )
 {
   const int dim = source_type::actual_dimension ;
@@ -1284,6 +1290,11 @@ int aggregating_filter ( source_type &source ,
     "aggregating_filter: both arrays must have the same value_type" ) ;
   typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
 
+  // continue processing on the subarrays of source and target specified by 'range':
+
+  auto source = original_source.subarray ( range[0] , range[1] ) ;
+  auto target = original_target.subarray ( range[0] , range[1] ) ;
+  
   if ( vigra::ExpandElementResult < value_type > :: size != 1 )
   {
     // value_type is an aggregate type, but we want to operate on elementary types
@@ -1291,9 +1302,9 @@ int aggregating_filter ( source_type &source ,
     // arrays with elementary types.
     auto expanded_source = source.expandElements ( 0 ) ;
     auto expanded_target = target.expandElements ( 0 ) ;
-    return ele_aggregating_filter ( expanded_source , expanded_target ,
-                                    axis + 1 , gain , bc ,
-                                    nbpoles , pole , tolerance ) ;
+    ele_aggregating_filter ( expanded_source , expanded_target ,
+                             axis + 1 , gain , bc ,
+                             nbpoles , pole , tolerance ) ;
   }
   else
   {
@@ -1306,20 +1317,18 @@ int aggregating_filter ( source_type &source ,
       es ( source.shape() , source.stride() , (ele_type*)(source.data()) ) ;
     vigra::MultiArrayView < target.actual_dimension , ele_type >
       et ( target.shape() , target.stride() , (ele_type*)(target.data()) ) ;
-    return ele_aggregating_filter ( es , et ,
-                                    axis , gain , bc ,
-                                    nbpoles , pole , tolerance ) ;
+    ele_aggregating_filter ( es , et ,
+                             axis , gain , bc ,
+                             nbpoles , pole , tolerance ) ;
   }
 }
 
 #endif
 
 /// Now we have the routines which perform the buffering and filtering for a chunk of data,
-/// We add code for multithreading. This is done by using utility code from common.h, namely
-/// the routine 'divide_and_conquer_2', which splits two arrays in the same way and applies
-/// a functor to each pair of chunks in a separate thread.
+/// We add code for multithreading. This is done by using utility code from multithread.h.
 /// filter_1d, which is the routine processing nD arrays along a specific axis, might as well
-/// be a function. But functions can't be patially specialized (at least not with my compiler)
+/// be a function. But functions can't be partially specialized (at least not with my compiler)
 /// so I use a functor instead, which, as a class, can be partially specialized. We want a
 /// partial specialization for 1D arrays, where all of our usual schemes of multithreading and
 /// vectorization don't intrinsically work and we have to employ a different method, see there.
@@ -1331,28 +1340,37 @@ class filter_1d
 {
 public:
   void operator() ( input_array_type &input ,   ///< source data. the routine can also operate in-place
-                   output_array_type &output ,  ///< where input == output.
-                   int axis ,
-                   double gain ,
-                   bc_code bc ,                 ///< boundary treatment for this solver
-                   int nbpoles ,
-                   const double * pole ,
-                   double tolerance ,
-                   bool use_vc = true ,
-                   int nslices = ncores )  ///< number of threads to use
+                    output_array_type &output ,  ///< where input == output.
+                    int axis ,
+                    double gain ,
+                    bc_code bc ,                 ///< boundary treatment for this solver
+                    int nbpoles ,
+                    const double * pole ,
+                    double tolerance ,
+                    bool use_vc = true ,
+                    int nslices = ncores )  ///< number of threads to use
 {
   typedef typename input_array_type::value_type value_type ;
+  typedef typename input_array_type::difference_type shape_type ;
+  typedef range_type < shape_type > range_t ;
+  typedef partition_type < range_t > partition_t ;
   typedef typename vigra::MultiArray < 1 , value_type > ::iterator iter_type ;
 
-  using namespace std::placeholders ;
-
-  // we use bind to create a functor which we can pass to divide_and_conquer_2.
-  // divide_and_conquer_2 will split input and output into chunks and then apply
-  // the functor to pairs of chunks in separate threads.
-
-  typedef decltype ( & nonaggregating_filter < input_array_type , output_array_type > )
-    filter_functor_type ;
-
+  // multithread() expects a std::function, so here we fix the type of
+  // std::function we want to pass to it, create an instance of the std::function
+  // and assign the routine appropriate to the task:
+  
+  typedef
+  std::function < void ( range_t ,
+                         input_array_type ,
+                         output_array_type ,
+                         int ,
+                         double ,
+                         bc_code ,
+                         int ,
+                         const double * ,
+                         double ) > filter_functor_type ;
+                         
   filter_functor_type filter_functor ;
   
 #ifdef USE_VC
@@ -1368,30 +1386,32 @@ public:
   
 #endif
 
-  auto chunk_func_2
-  = std::bind ( filter_functor ,
-                _1 ,          // placeholders to accept data chunks
-                _2 ,          // from divide_and_conquer
-                axis ,        // axis to process
+  // obtain a partitioning of the data array into subranges. We do this 'manually' here
+  // because we must instruct shape_splitter not to chop up the current processing axis
+  // (by passing the 'forbid' parameter, which is taken as -1 (no effect) in the default
+  // partitioning of nD arrays).
+
+  partition_t partitioning = shape_splitter < dim > :: part ( input.shape() , nslices , axis ) ;
+  
+  multithread ( filter_functor ,
+                partitioning ,
+                input ,
+                output ,
+                axis ,
                 gain ,
                 bc ,
                 nbpoles ,
                 pole ,
                 tolerance ) ;
-
-
-  // divide_and_conquer_2 performs the array splitting and multithreading
-
-  divide_and_conquer_2 < input_array_type , output_array_type >
-  :: run ( input ,         // knot point data
-           output ,        // space for coefficients
-           chunk_func_2 ,  // functor from above to apply the solver
-           axis ,          // forbid splitting along processing axis
-           nslices ) ;     // use nslices threads
 }
 } ;
 
-/// now here's the specialization for 1D arrays.
+/// now here's the specialization for 1D arrays. It may come as a surprise that it looks
+/// nothing like the nD routine. This is because we follow a specific strategy:
+/// We 'fold up' the 1D array into a 'fake 2D' array, process this 2D array with the nD code
+/// which is very efficient, and 'mend' the stripes along the margins of the fake 2D array
+/// which contain wrong results because a boundary condition appropriate only for
+/// the 2D case was applied there.
 
 template < typename input_array_type ,      ///< type of array with knot point data
            typename output_array_type >     ///< type of array for coefficients (may be the same)
@@ -1426,7 +1446,6 @@ public:
   
 #ifdef USE_VC
  
-//   typedef vector_traits < ele_type > simdized_type ;
   const int vsize = vector_traits < ele_type > :: vsize ;
   
   // if we can use vector code, the number of lanes is multiplied by the
@@ -1439,13 +1458,7 @@ public:
 
 #endif
 
-//   std::cout << "have " << lanes << " lanes" << std::endl ;
-  
   // we give the filter some space to run up to precision
-  // TODO we may want to use the proper horizon values here
-  // TODO and make sure 64 is enough.
-
-//   std::cout << "tolerance: " << tolerance << std::endl ;
   
   if ( tolerance <= 0.0 )
   {
@@ -1459,7 +1472,7 @@ public:
     
     int horizon = ceil ( log ( tolerance ) / log ( fabs ( pole[0] ) ) ) ;
     
-    // tis is just was much as we want for the filter to run up to precision
+    // this is just as much as we want for the filter to run up to precision
     // starting with BC code 'IGNORE' at the margins
     
     runup = horizon ;
@@ -1473,12 +1486,11 @@ public:
     
     if ( input.shape(0) < min_length )
     {
-//       std::cout << "input too short for fake 2D with the given tolerance" << std::endl ;
       lanes = 1 ;
     }
     else
     {
-      // input's larger than the absolute minimum, maybe we can even increase
+      // input is larger than the absolute minimum, maybe we can even increase
       // the number of lanes some more? we'd like to do this if the input is
       // very large, since we use buffering and don't want the buffers to become
       // overly large. But the smaller the run along the split x axis, the more
@@ -1508,7 +1520,6 @@ public:
 
   if ( lanes == 1 )
   {
-//     std::cout << "using simple 1D filter" << std::endl ;
     // this is a simple single-threaded implementation
     filter_type solver ( input.shape(0) ,
                          gain ,
@@ -1522,8 +1533,6 @@ public:
   
   // the input qualifies for fake 2D processing.
 
-//   std::cout << "input qualifies for fake 2D, using " << lanes << " lanes" << std::endl ;
-  
   // we want as many chunks as we have lanes. There may be some data left
   // beyond the chunks (tail_size of value_type)
   
diff --git a/multithread.h b/multithread.h
new file mode 100644
index 0000000..00bff78
--- /dev/null
+++ b/multithread.h
@@ -0,0 +1,354 @@
+/************************************************************************/
+/*                                                                      */
+/*    vspline - a set of generic tools for creation and evaluation      */
+/*              of uniform b-splines                                    */
+/*                                                                      */
+/*            Copyright 2015, 2016 by Kay F. Jahnke                     */
+/*                                                                      */
+/*    Permission is hereby granted, free of charge, to any person       */
+/*    obtaining a copy of this software and associated documentation    */
+/*    files (the "Software"), to deal in the Software without           */
+/*    restriction, including without limitation the rights to use,      */
+/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
+/*    sell copies of the Software, and to permit persons to whom the    */
+/*    Software is furnished to do so, subject to the following          */
+/*    conditions:                                                       */
+/*                                                                      */
+/*    The above copyright notice and this permission notice shall be    */
+/*    included in all copies or substantial portions of the             */
+/*    Software.                                                         */
+/*                                                                      */
+/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
+/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
+/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
+/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
+/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
+/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
+/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
+/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
+/*                                                                      */
+/************************************************************************/
+
+/// \file multithread.h
+///
+/// \brief code to distribute the processing of bulk data to several threads
+/// 
+/// The code in this header provides a reasonably general method to perform
+/// processing of manifolds of data with several threads in parallel. In vspline,
+/// there are several areas where potentially large numbers of individual values
+/// have to be processed independently of each other, or with dependencies which
+/// the partitioning can preserve. To process such 'bulk' data effectively,
+/// vspline employs two strategies: multithreading and vectorization.
+/// This file handles the multithreading.
+///
+/// To produce generic code for the purpose, we first introduce a model of what
+/// we intend to do. This model looks at the data as occupying a 'range' having
+/// a defined starting point and end point. We keep with the convention of defining
+/// ranges so that the start point is inside and the end point outside the data
+/// set described by the range, just like iterators obtained by begin() and end().
+/// This range is made explicit, even if it is implicit in the data which we want to
+/// submit to multithreading, and there is a type for the purpose: range_type.
+/// range_type merely captures the concept of a range, taking 'limit_type' as its
+/// template parameter, so that any type of range can be accommodated.
+/// Next we define an object holding a set of ranges, modeling a partitioning of
+/// an 'original' range into subranges which, within the context of this code,
+/// are disjoint and in sequence. This object is modeled as partition_type,
+/// taking a range_type as its template argument.
+/// With these types, we model concrete ranges and partitionings. The most important
+/// group deals with multidimensional shapes, where a range extends from a 'lower'
+/// coordinate to a 'higher' coordinate. These two coordinates can be used directly
+/// to call vigra's 'subarray' function.
+/// Next we provide code to partition such ranges into sets of subranges.
+/// Finally we can express a generalized multithreading routine. This routine takes
+/// a functor capable of processing a range specification and a parameter pack of
+/// arbitrary further parameters, some of which will usually refer to manifolds
+/// of data for which the given range makes sense. We call this routine with a
+/// partitioning of the original range and the same parameter pack that is to be passed
+/// to the functor. The multithreading routine proceeds to set up threads as needed,
+/// starting each with the functor as its functional, receiving a subrange from
+/// the partitioning and the parameter pack as arguments. There's also an overload
+/// to multithread() where we pass in a range over the data and a desired number of
+/// threads, leaving the partitioning to code inside multithread(), resulting in some
+/// default partitioning provided by a suitable overload of function partition().
+/// This is the commonly used variant, since it's usually not necessary to obtain
+/// a partitioning other than the default one.
+
+#ifndef VSPLINE_MULTITHREAD_H
+#define VSPLINE_MULTITHREAD_H
+
+#include <assert.h>
+#include <functional>
+#include <vector>
+#include <vigra/tinyvector.hxx>
+#include <thread>
+
+namespace vspline
+{
+
+/// given limit_type, we define range_type as a TinyVector of two limit_types,
+/// the first denoting the beginning of the range and the second it's end, with
+/// end being outside of the range.
+
+template < class limit_type >
+using range_type = vigra::TinyVector < limit_type , 2 > ;
+
+/// given range_type, we define partition_type as a std::vector of range_type.
+/// This data type is used to hold the partitioning of a range into subranges.
+
+template < class range_type >
+using partition_type = std::vector < range_type > ;
+
+/// given a dimension, we define a shape_type as a TinyVector of long
+/// of this dimension. This is equivalent to vigra's shape type.
+
+template < int dimension >
+using shape_type = vigra::TinyVector < long , dimension > ;
+
+/// given a dimension, we define shape_range_type as a range defined by
+/// two shapes of the given dimension. This definition allows us to directly
+/// pass the two shapes as arguments to a call of subarray() on a MultiArrayView
+/// of the given dimension
+
+template < int dimension >
+using shape_range_type = range_type < shape_type < dimension > > ;
+
+// #include <iomanip>
+// #include <vspline/vspline.h>
+// #include <vigra/multi_math.hxx>
+// #include <random>
+// #include <tuple>
+
+// currently unused
+// // iterator_splitter will try to set up n ranges from a range. the partial
+// // ranges are stored in a std::vector. The split may succeed producing n
+// // or less ranges, and if iter_range can't be split at all, a single range
+// // encompassing the whole of iter_range will be returned in the result vector.
+// 
+// template < class _iterator_type >
+// struct iterator_splitter
+// {
+//   typedef _iterator_type iterator_type ;
+//   typedef vigra::TinyVector < iterator_type , 2 > range_type ;
+//   typedef std::vector < range_type > partition_type ;
+// 
+//   static partition_type part ( const range_type & iter_range ,
+//                                int n )
+//   {
+//     std::vector < range_type > res ;
+//     assert ( n > 0 ) ;
+// 
+//     iterator_type start = iter_range [ 0 ] ;
+//     iterator_type end = iter_range [ 1 ] ;
+//     int size = end - start ;
+//     if ( n > size )
+//       n = size ;
+//     
+//     int chunk_size = size / n ; // will be at least 1
+//     
+//     for ( int i = 0 ; i < n - 1 ; i++ )
+//     {
+//       res.push_back ( range_type ( start , start + chunk_size ) ) ;
+//       start += chunk_size ;
+//     }
+//     res.push_back ( range_type ( start , end ) ) ;
+//     return res ;
+//   }
+// } ;
+
+/// shape_splitter will try to split a shape into n ranges by 'chopping' it
+/// along the outermost axis that can be split n-ways. The additional parameter
+/// 'forbid' prevents that particular axis from being split. The split may succeed
+/// producing n or less ranges, and if 'shape' can't be split at all, a single range
+/// encompassing the whole of 'shape' will be returned in the result vector.
+
+template < int dim >
+struct shape_splitter
+{
+  typedef shape_type < dim > shape_t ;
+  typedef range_type < shape_t > range_t ;
+  typedef partition_type < range_t > partition_t ;
+  
+  static partition_t part ( const shape_t & shape , ///< shape to be split n-ways
+                            int n ,                     ///< intended number of chunks
+                            int forbid = -1 )           ///< axis which shouldn't be split
+  {
+    partition_t res ;
+
+    // find the outermost dimension that can be split n ways, and its extent
+    int split_dim = -1 ;
+    int max_extent = -1 ;
+    for ( int md = dim - 1 ; md >= 0 ; md-- )
+    {
+      if (    md != forbid
+          && shape[md] > max_extent
+          && shape[md] >= n )
+      {
+        max_extent = shape[md] ;
+        split_dim = md ;
+        break ;
+      }
+    }
+    
+    // if the search did not yet succeed:
+    if ( max_extent == -1 )
+    {
+      // repeat process with relaxed conditions: now the search will also succeed
+      // if there is an axis which can be split less than n ways
+      for ( int md = dim - 1 ; md >= 0 ; md-- )
+      {
+        if (    md != forbid
+            && shape[md] > 1 )
+        {
+          max_extent = shape[md] ;
+          split_dim = md ;
+          break ;
+        }
+      }
+    }
+    
+    if ( split_dim == -1 )
+    {   
+      // we have not found a dimension for splitting. We pass back res with
+      // a range over the whole initial shape as its sole member
+      res.push_back ( range_t ( shape_t() , shape ) ) ;
+    }
+    else
+    {
+      // we can split the shape along split_dim
+      
+      int w = shape [ split_dim ] ;  // extent of the dimension we can split
+      n = std::min ( n , w ) ;       // just in case w is smaller than n
+      
+      int cut [ n ] ;   // where to chop up this dimension
+      
+      for ( int i = 0 ; i < n ; i++ )
+        cut[i] = ( (i+1) * w ) / n ;   // roughly equal chunks; the last cut is guaranteed to reach the end
+
+      shape_t start , end = shape ;
+
+      for ( int i = 0 ; i < n ; i++ )
+      {
+        end [ split_dim ] = cut [ i ];                  // apply the cut locations
+        res.push_back ( range_t ( start , end ) ) ;
+        start [ split_dim ] = end [ split_dim ] ;
+      }
+    }
+    return res ;
+  }
+} ;
+
+/// for any dimension d, we specialize function partition() for shape_range_type<d>.
+/// we use shape_splitter to perform the split. This function template
+/// provides the default partitioning mechanism for a shape_range, which is used by
+/// the second variant of multithread(), which is called with a range and a number
+/// of desired threads instead of a ready-made partitioning.
+
+template < int d >
+partition_type < shape_range_type<d> > partition ( shape_range_type<d> range , int nparts )
+{
+  if ( range[0].any() )
+  {
+    // the lower limit of the range is not at the origin, so get the shape
+    // of the region between range[0] and range[1], call shape_splitter with
+    // this shape, and add the offset to the lower limit of the original range
+    // to the partial ranges in the result
+    auto shape = range[1] - range[0] ;
+    auto res = shape_splitter < d > :: part ( shape , nparts ) ;
+    for ( auto & r : res )
+    {
+      r[0] += range[0] ;
+      r[1] += range[0] ;
+    }
+    return res ;
+  }
+  // if range[0] is at the origin, we don't have to use an offset
+  return shape_splitter < d > :: part ( range[1] , nparts ) ;
+}
+
+/// multithread is a routine to perform multithreading with functors accessing several
+/// containers, iterators etc. ('manifolds') of identical structure together.
+/// when such a functor is invoked for a specific thread, it will only process a subset
+/// of each of the original manifold objects. This 'subset' notion is codified as range_type:
+/// range_type specifies a starting point and an end point inside the manifolds defining
+/// the extent of the subset. While it is tempting to code so that the multithreading
+/// routine feeds the subsets to the individual threads, I found that passing the
+/// range to the thread, together with the original variadic arguments to multithread,
+/// produces clean and compact code with a slightly higher burden on the functor, since
+/// it has to 'cut out' the subset(s) it is meant to process itself. Why is this better?
+/// Because it unburdens the multithreading code from knowing any specifics about the
+/// 'cutting out' procedure, and at the same time allows the functor to access the
+/// 'whole' manifold and its metrics if this is needed, as in code working with
+/// coordinates into an array, which should refer to the 'whole' array, not to the
+/// subarrays produced by partitioning.
+/// I have also taken the actual partitioning of the original range out of the core
+/// multithreading routine, because at times the calling code needs to influence this
+/// process. The situation inside vspline where this is necessary is in the prefiltering
+/// code, where the partitioning has to avoid splitting up the current processing axis,
+/// passing the axis as the additional parameter 'forbid' to class shape_splitter.
+
+template < class range_type , class ...Types >
+int multithread ( std::function < void ( range_type , Types... ) > func ,
+                  partition_type < range_type > partitioning ,
+                  Types ...args  )
+{
+  // get the number of ranges in the partitioning
+
+  int nparts = partitioning.size() ;
+  
+  assert ( nparts > 0 ) ; // make sure, for now, that there is at least one range
+
+  // set up thread objects for the additional threads we'll use (if any)
+
+  int new_threads = nparts - 1 ;
+  std::thread * t[new_threads] ;
+  
+  // start the additional threads (if any), passing func as the thread's functional,
+  // followed by the range over which func is meant to operate, followed by any other
+  // arguments passed to multithread as parameter pack 'args'
+
+  for ( int s = 0 ; s < new_threads ; s++ )
+  {
+    t[s] = new std::thread ( func , partitioning[s] , args... ) ;
+  }
+
+  // do the last partial job in this very thread. this may be the only job,
+  // if partitioning only contains a single range
+
+  func ( partitioning [ new_threads ] , args... ) ;
+  
+  // join the additional threads (if any) and delete their thread objects
+
+  for ( int s = 0 ; s < new_threads ; s++ )
+  {
+    t[s]->join() ;
+    delete t[s] ;
+  }
+
+  // and return the number of threads which was used.
+
+  return nparts ;
+}
+
+/// variant of multithread taking the desired number of threads and a range instead
+/// of the partitioning. This variant relies on the presence of a suitable function
+/// partition() to provide the partitioning of the incoming range into at most nparts
+/// subranges. If the calling code doesn't to influence the partitioning process, this
+/// is the variant to use.
+
+template < class range_type , class ...Types >
+int multithread ( std::function < void ( range_type , Types... ) > func ,
+                  int nparts ,
+                  range_type range ,
+                  Types ...args  )
+{
+  // partition the range and update nparts in case the partitioning produced
+  // less ranges than we initially asked for. Rely on the presence of a function
+  // partition() which can process range_type:
+
+  partition_type < range_type > partitioning = partition ( range , nparts ) ;
+  nparts = partitioning.size() ;
+
+  return multithread ( func , partitioning , args... ) ;
+}
+
+} ; // end of namespace vspline
+
+#endif // #ifndef VSPLINE_MULTITHREAD_H
\ No newline at end of file
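
One practical consequence of passing the range plus the 'whole' arrays - rather
than ready-made chunks - is that the functor can still reason in global
coordinates. A minimal sketch of that pattern (fill_global_x is made up for
illustration):

    #include <functional>
    #include <vspline/multithread.h>
    #include <vigra/multi_array.hxx>

    // fill each element with its global x coordinate; range[0] holds the offset
    // of this thread's subarray within the whole array
    void fill_global_x ( vspline::shape_range_type<2> range ,
                         vigra::MultiArrayView < 2 , double > whole )
    {
      auto part = whole.subarray ( range[0] , range[1] ) ;
      for ( long y = 0 ; y < part.shape(1) ; y++ )
        for ( long x = 0 ; x < part.shape(0) ; x++ )
          part ( x , y ) = double ( x + range[0][0] ) ;
    }

    int main()
    {
      vigra::MultiArray < 2 , double > a ( vigra::Shape2 ( 512 , 512 ) ) ;
      vigra::MultiArrayView < 2 , double > va ( a ) ;
      vspline::shape_range_type<2> range ( vspline::shape_type<2>() , va.shape() ) ;
      std::function < void ( vspline::shape_range_type<2> ,
                             vigra::MultiArrayView < 2 , double > ) > f ( fill_global_x ) ;
      vspline::multithread ( f , 4 , range , va ) ;
    }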

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git


