[vspline] 30/72: polished multithreading code, (hopefully) fixing one bug

Kay F. Jahnke kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:40 UTC 2017


This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit 4b0bc8d31d57eb2dfb686311f67c16a43f275ef1
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Fri Jan 27 11:00:47 2017 +0100

    polished multithreading code, (hopefully) fixing one bug
---
 brace.h              |   8 +-
 common.h             |   8 +-
 example/roundtrip.cc |  63 ++--
 multithread.h        | 254 +++++--------
 remap.h              | 979 +++++++++++++++++++++++++++++----------------------
 thread_pool.h        |  48 +--
 6 files changed, 726 insertions(+), 634 deletions(-)

diff --git a/brace.h b/brace.h
index 7cc8093..683903e 100644
--- a/brace.h
+++ b/brace.h
@@ -318,10 +318,10 @@ struct bracer
 /// the bracing is done one-left-one-right, to avoid corner cases as best as possible.
 
   void apply ( view_type & a , // containing array
-              bc_code bc ,    // boundary condition code
-              int lsz ,       // space to the left which needs to be filled
-              int rsz ,       // ditto, to the right
-              int axis )      // axis along which to apply bracing 
+               bc_code bc ,    // boundary condition code
+               int lsz ,       // space to the left which needs to be filled
+               int rsz ,       // ditto, to the right
+               int axis )      // axis along which to apply bracing 
   {
     int w = a.shape ( axis ) ; // width of containing array along axis 'axis'
     int m = w - ( lsz + rsz ) ;    // width of 'core' array
diff --git a/common.h b/common.h
index 00e11f4..e593982 100644
--- a/common.h
+++ b/common.h
@@ -87,13 +87,13 @@ struct vector_traits
   // if vsize given is precisely the Size of a Vc::Vector<T>, make simdized_type
   // such a Vc::Vector, otherwise make it a Vc::SimdArray<T,vsize>
   
-  typedef typename std::conditional < vsize == Vc::Vector < T > :: Size ,
-                                      Vc::Vector < T > ,
-                                      Vc::SimdArray < T , vsize > > :: type type ;
+//   typedef typename std::conditional < vsize == Vc::Vector < T > :: Size ,
+//                                       Vc::Vector < T > ,
+//                                       Vc::SimdArray < T , vsize > > :: type type ;
 
 // to always use Simdarrays, use this definition instead:
 
-//   typedef Vc::SimdArray < T , vsize > simdized_type ;
+   typedef Vc::SimdArray < T , vsize > type ;
 } ;
 
 #else
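The hunk above pins vector_traits to Vc::SimdArray, commenting out the std::conditional selection. For illustration, the same compile-time type selection idiom in a minimal, Vc-free sketch (my_vector_traits, native_vector and simd_array are hypothetical stand-ins, not vspline code):

  #include <type_traits>

  template < typename T , int vsize >
  struct my_vector_traits
  {
    static const int native_size = 8 ;   // pretend: hardware lane count for T

    struct native_vector {} ;            // stand-in for Vc::Vector<T>
    struct simd_array {} ;               // stand-in for Vc::SimdArray<T,vsize>

    // pick the native vector only if vsize matches its lane count exactly
    typedef typename std::conditional < vsize == native_size ,
                                        native_vector ,
                                        simd_array > :: type type ;
  } ;

  // my_vector_traits<float,8>::type is native_vector; any other vsize yields
  // simd_array - which is what the commented-out code did with Vc's types
  // before this commit pinned the choice to SimdArray.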
diff --git a/example/roundtrip.cc b/example/roundtrip.cc
index b533207..f28c52d 100644
--- a/example/roundtrip.cc
+++ b/example/roundtrip.cc
@@ -143,13 +143,14 @@ void roundtrip ( view_type & data ,
   
 #ifdef PRINT_ELAPSED
   std::chrono::system_clock::time_point start = std::chrono::system_clock::now();
+  std::chrono::system_clock::time_point end ;
 #endif
   
   for ( int times = 0 ; times < TIMES ; times++ )
     bsp.prefilter ( use_vc ) ;
   
 #ifdef PRINT_ELAPSED
-  std::chrono::system_clock::time_point end = std::chrono::system_clock::now();
+  end = std::chrono::system_clock::now();
   cout << "avg " << TIMES << " x prefilter:........................ "
        << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
        << " ms" << endl ;
@@ -195,7 +196,9 @@ void roundtrip ( view_type & data ,
   coordinate_array fwarp ( Shape ( Tx , Ty ) ) ;
   warp_array _warp ( Shape(Tx+5,Ty+4) ) ;
   warp_view warp = _warp.subarray ( Shape(2,1) , Shape(-3,-3) ) ;
-  array_type target ( Shape(Tx,Ty) ) ;
+  array_type _target ( Shape(Tx,Ty) ) ;
+  view_type target ( _target ) ;
+  
   rc_type dfx = 0.0 , dfy = 0.0 ;
   
   for ( int times = 0 ; times < 1 ; times++ )
@@ -214,31 +217,31 @@ void roundtrip ( view_type & data ,
       }
     }
   }
+ 
+// oops... currently we can't remap from presplit coordinates!
+// #ifdef PRINT_ELAPSED
+//   start = std::chrono::system_clock::now();
+// #endif
+//   
+//   for ( int times = 0 ; times < TIMES ; times++ )
+//     vspline::remap < eval_type , 2 > ( ev , warp , target , use_vc ) ;
+// 
+//   
+// #ifdef PRINT_ELAPSED
+//   end = std::chrono::system_clock::now();
+//   cout << "avg " << TIMES << " x remap1 from pre-split coordinates: "
+//        << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
+//        << " ms" << endl ;
+// #endif
+//        
+//   check_diff<view_type> ( target , data ) ;
   
 #ifdef PRINT_ELAPSED
   start = std::chrono::system_clock::now();
 #endif
   
   for ( int times = 0 ; times < TIMES ; times++ )
-    vspline::remap<split_type,pixel_type,eval_type,2>
-      ( ev , warp , target , use_vc ) ;
-
-  
-#ifdef PRINT_ELAPSED
-  end = std::chrono::system_clock::now();
-  cout << "avg " << TIMES << " x remap1 from pre-split coordinates: "
-       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
-       << " ms" << endl ;
-#endif
-       
-  check_diff<view_type> ( target , data ) ;
-  
-#ifdef PRINT_ELAPSED
-  start = std::chrono::system_clock::now();
-#endif
-  
-  for ( int times = 0 ; times < TIMES ; times++ )
-    vspline::remap<coordinate_type,pixel_type,eval_type,2>
+    vspline::remap < eval_type , 2 >
       ( ev , fwarp , target , use_vc ) ;
 
   
@@ -251,12 +254,13 @@ void roundtrip ( view_type & data ,
        
   check_diff<view_type> ( target , data ) ;
   
+// oops.. currently I have problems with remaps with internal splines!
 #ifdef PRINT_ELAPSED
   start = std::chrono::system_clock::now();
 #endif
   
   for ( int times = 0 ; times < TIMES ; times++ )
-    vspline::remap<coordinate_type,pixel_type,2>
+    vspline::remap < coordinate_type , pixel_type , 2 >
       ( data , fwarp , target , bcv , DEGREE , use_vc ) ;
 
  
@@ -307,7 +311,7 @@ void roundtrip ( view_type & data ,
 #endif
   
   for ( int times = 0 ; times < TIMES ; times++ )
-    vspline::tf_remap < eval_type , 2 , 2 >
+    vspline::tf_remap < eval_type , 2 >
       ( ev , tf , target , use_vc ) ;
 
   // note:: with REFLECT this doesn't come out right, because of the .5 difference!
@@ -362,7 +366,13 @@ void process_image ( char * name )
   vigra::importImage(imageInfo, imageArray);
   
   // test these bc codes:
-  vspline::bc_code bcs[] = { vspline::MIRROR , vspline::REFLECT , vspline::NATURAL , vspline::PERIODIC } ;
+  vspline::bc_code bcs[] =
+  {
+    vspline::MIRROR ,
+    vspline::REFLECT ,
+    vspline::NATURAL ,
+    vspline::PERIODIC
+  } ;
 
   for ( int b = 0 ; b < 4 ; b++ )
   {
@@ -432,6 +442,11 @@ int main ( int argc , char * argv[] )
   cout << endl << "testing double data, float coordinates" << endl ;
   detail::process_image<double,float> ( argv[1] ) ;
   
+  cout << "reached end" << std::endl ;
+  // oops... we hang here, failing to terminate
+  
+  exit ( 0 ) ;
+  
 //   typedef vigra::RGBValue<float,0,1,2> pixel_type; 
 //   typedef vigra::MultiArray<2, pixel_type> array_type ;
 //   typedef vigra::MultiArrayView<2, pixel_type> view_type ;
diff --git a/multithread.h b/multithread.h
index 0301db7..6b8b55c 100644
--- a/multithread.h
+++ b/multithread.h
@@ -41,7 +41,7 @@
 /// vspline employs two strategies: multithreading and vectorization.
 /// This file handles the multithreading.
 ///
-/// To produce generic code for the purpose, we first intoduce a model of what
+/// To produce generic code for the purpose, we first introduce a model of what
 /// we intend to do. This model looks at the data as occupying a 'range' having
 /// a defined starting point and end point. We keep with the convention of defining
 /// ranges so that the start point is inside and the end point outside the data
@@ -49,16 +49,21 @@
 /// This range is made explicit, even if it is implicit in the data which we want to
 /// submit to multithreading, and there is a type for the purpose: struct range_type.
 /// range_type merely captures the concept of a range, taking 'limit_type' as its
-/// template parameter, so that any type of range can be accomodated.
+/// template parameter, so that any type of range can be accommodated. A range is
+/// defined by its lower and upper limit.
+///
 /// Next we define an object holding a set of ranges, modeling a partitioning of
-/// an 'original' range into subranges, which, within the context of this code,
+/// an original/whole range into subranges, which, within the context of this code,
 /// are disparate and in sequence. This object is modeled as struct partition_type,
 /// taking a range_type as its template argument.
+///
 /// With these types, we model concrete ranges and partitionings. The most important
-/// group is dealing with multidimensional shapes, where a range extends from a 'lower'
-/// coordinate to a 'higer' coordinate. These two coordinates can be used directly
-/// to call vigra's 'subarray' function.
+/// one is dealing with multidimensional shapes, where a range extends from a 'lower'
+/// coordinate to just below a 'higher' coordinate. These two coordinates can be
+/// used directly to call vigra's 'subarray' function.
+///
 /// Next we provide code to partition ranges into sets of subranges.
+///
 /// Finally we can express a generalized multithreading routine. This routine takes
 /// a functor capable of processing a range specification and a parameter pack of
 /// arbitrary further parameters, of which some will usually be referring to manifolds
@@ -67,11 +72,12 @@
 /// to the functor. The multithreading routine proceeds to set up 'tasks' as needed,
 /// providing each with the functor as its functional, a subrange from
 /// the partitioning and the parameter pack as arguments. There's also an overload
-/// to multithread() where we pass in a range over the data and a desired number of
+/// to multithread() where we pass in a range over all the data and a desired number of
 /// tasks, leaving the partitioning to code inside multithread(), resulting in some
 /// default partitioning provided by a suitable overload of function partition().
 /// This is the commonly used variant, since it's usually not necessary to obtain
 /// a partitioning other than the default one.
+///
 /// The tasks, once prepared, are handed over to a 'joint_task' object which handles
 /// the interaction with the thread pool (in thread_pool.cc). While my initial code
 /// used one thread per task, this turned out inefficient, because it was not granular
@@ -79,8 +85,16 @@
 /// into more individual tasks (something like 8 times the number of cores), resulting
 /// in a fair compromise concerning granularity. multithread() waits for all tasks to
 /// terminate and returns when it's certain that the job is complete.
+///
 /// With this method, we assure that on return of multithread() we can safely access
-/// whatever results we anticipate.
+/// whatever results we anticipate. While it might be useful to launch the tasks and
+/// return to continue the main thread, picking up the result later when it becomes
+/// ready, I chose to suspend the calling thread until the result arrives. This makes
+/// the logic simpler, and should be what most use cases need: there is often little else
+/// to do but to wait for the result anyway. If asynchronous operation is needed, a thread
+/// can be launched to initiate and collect from the multithreading. It's safe to have several
+/// threads using this multithreading code, since each task is linked to a 'coordinator',
+/// see struct joint_task below.
 
 #ifndef VSPLINE_MULTITHREAD_H
 #define VSPLINE_MULTITHREAD_H
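range_type and partition_type themselves are not part of this diff; from the usage visible in it (range[0] for the lower limit, range[1] for the upper, partitionings indexed and sized like vectors), a minimal model might look like this (a sketch, not vspline's actual definition):

  #include <vector>

  template < typename limit_type >
  struct range_type
  {
    limit_type limit [ 2 ] ;  // [0]: lower limit (inside), [1]: upper (outside)

    range_type ( limit_type lower , limit_type upper )
    : limit { lower , upper } { }

    limit_type & operator[] ( int i ) { return limit [ i ] ; }
    const limit_type & operator[] ( int i ) const { return limit [ i ] ; }
  } ;

  template < typename range_t >
  using partition_type = std::vector < range_t > ;  // a set of subranges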
@@ -127,12 +141,6 @@ using shape_range_type = range_type < shape_type < dimension > > ;
 template < int dimension >
 using shape_partition_type = partition_type < shape_range_type < dimension > > ;
 
-// #include <iomanip>
-// #include <vspline/vspline.h>
-// #include <vigra/multi_math.hxx>
-// #include <random>
-// #include <tuple>
-
 // currently unused
 // // iterator_splitter will try to set up n ranges from a range. the partial
 // // ranges are stored in a std::vector. The split may succeed producing n
@@ -175,11 +183,12 @@ using shape_partition_type = partition_type < shape_range_type < dimension > > ;
 /// 'forbid' prevents that particular axis from being split. The split may succeed
 /// producing n or less ranges, and if 'shape' can't be split at all, a single range
 /// encompassing the whole of 'shape' will be returned in the result vector.
-/// TODO: with some shapes, splitting will result in subranges which aren't optimal
-/// for b-spline prefiltering (these are fastest with extents which are a multiple of
-/// the simdized data type), so we might add code to preferably use cut locations
-/// coinciding with those extents. And with small extents being split, the result
-/// becomes very inefficient for filtering.
+
+// TODO: with some shapes, splitting will result in subranges which aren't optimal
+// for b-spline prefiltering (these are fastest with extents which are a multiple of
+// the simdized data type), so we might add code to preferably use cut locations
+// coinciding with those extents. And with small extents being split, the result
+// becomes very inefficient for filtering.
 
 template < int dim >
 struct shape_splitter
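The splitting logic, reduced to one dimension for illustration (a hypothetical analogue; the real shape_splitter works on n-dimensional shapes, picks an axis and honours 'forbid'):

  #include <vector>
  #include <utility>

  std::vector < std::pair < int , int > >
  split_extent ( int lower , int upper , int nparts )
  {
    std::vector < std::pair < int , int > > result ;
    int extent = upper - lower ;
    if ( nparts > extent )
      nparts = extent ;          // can't produce more parts than indices
    if ( nparts < 1 )
      nparts = 1 ;               // degenerate case: one range for everything
    for ( int p = 0 ; p < nparts ; p++ )
    {
      int from = lower + ( extent * p ) / nparts ;
      int to   = lower + ( extent * ( p + 1 ) ) / nparts ;
      result.push_back ( std::make_pair ( from , to ) ) ;
    }
    return result ;
  }

  // split_extent ( 0 , 10 , 3 ) yields [0,3) [3,6) [6,10) - n or fewer
  // near-equal subranges, mirroring the contract described above.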
@@ -286,18 +295,7 @@ partition ( shape_range_type<d> range , int nparts )
   return shape_splitter < d > :: part ( range[1] , nparts ) ;
 }
 
-// next we need some collateral code for the implementation of multithread().
-// We have a thread pool (in thread_pool.cc) which we want to use for
-// multithreading. To interface with the thread pool, we need to declare
-// a few objects living in threadpool.cc:
-
-extern std::mutex task_mutex ;
-extern std::queue < std::function < void() > > task_queue ;
-extern std::condition_variable task_cv ;
-extern void kill_thread_pool() ;
-extern bool worker_stay_alive ;
-
-// next some forward declarations
+// forward declaration of struct joint_task and function action_wrapper()
 
 struct joint_task ;
 
@@ -305,10 +303,12 @@ void action_wrapper ( std::function < void() > action , joint_task * p_coordinat
 
 /// struct joint_task serves as a coordinator of a multithreaded operation.
 /// on creation, it receives a vector of tasks, which are packaged as
-/// std::function<void()>. These tasks are wrapped in 'action_wrapper',
-/// which executes the task and communicates with the joint_task object
-/// to keep it updated on the progress of the tasks.
-/// Once the last task is complete, the joint_task in turn notifies
+/// std::function<void()>. These tasks will usually be closures, produced by
+/// std::bind or equivalent techniques.
+///
+/// These tasks are wrapped in 'action_wrapper', which executes the task and
+/// communicates with the joint_task object to keep it updated on the progress
+/// of the tasks. Once the last task is complete, the joint_task in turn notifies
 /// the calling code.
 
 struct joint_task
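How such a task comes about, as a self-contained sketch of the std::bind technique mentioned above (payload is a made-up stand-in for the single-threaded worker function):

  #include <functional>
  #include <vector>
  #include <iostream>

  void payload ( int from , int to )  // stand-in for the per-task function
  {
    std::cout << "processing [" << from << "," << to << ")" << std::endl ;
  }

  int main ( )
  {
    std::vector < std::function < void() > > taskv ;
    taskv.push_back ( std::bind ( payload , 0 , 100 ) ) ;   // a closure:
    taskv.push_back ( std::bind ( payload , 100 , 200 ) ) ; // args baked in
    for ( auto & task : taskv )
      task() ;   // each task runs with no further arguments needed
  }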
@@ -319,17 +319,14 @@ struct joint_task
   std::mutex finished_mutex ; // to guard 'finished'
   bool finished ;             // is set to true as predicate to test by caller
   std::condition_variable finished_cv ; // the caller waits for this cv  
-
-  thread_pool & tp ;
-  std::vector < std::function < void() > > & taskv ; // tasks to perform
+  thread_pool & tp  ;
   
   joint_task ( thread_pool & _tp ,
-               std::vector < std::function < void() > > & _taskv )
-  : tp ( _tp ) ,             // thread pool to use
-    taskv (_taskv ) ,        // individual tasks to perform
-    crew ( _taskv.size() ) , // number of tasks
-    done ( 0 ) ,             // number of done tasks
-    finished ( false )       // not yet finished with all tasks
+               std::vector < std::function < void() > > & taskv )
+  : tp ( _tp ) ,
+    crew ( taskv.size() ) , // number of tasks
+    done ( 0 ) ,            // number of done tasks
+    finished ( false )      // not yet finished with all tasks
   {
     {
       // under task_mutex, fill tasks into task queue
@@ -350,7 +347,9 @@ struct joint_task
       // the predicate done==crew rejects spurious wakes
       tp.task_cv.wait ( lk , [&] { return done == crew ; } ) ;
     }
-    // we're sure now that all tasks are done, so we let the caller know
+    // we're sure now that all tasks are done, so we let the caller know,
+    // who is waiting on finished_cv.
+    std::lock_guard<std::mutex> lk ( finished_mutex ) ;
     finished = true ;
     finished_cv.notify_one() ;
   } ;
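The wait/notify protocol the constructor above takes part in, reduced to its bare bones (a self-contained sketch of the standard pattern, not vspline code):

  #include <mutex>
  #include <condition_variable>
  #include <thread>
  #include <iostream>

  std::mutex finished_mutex ;
  std::condition_variable finished_cv ;
  bool finished = false ;

  void worker ( )
  {
    // do the work, then set the predicate under the mutex and notify
    std::lock_guard < std::mutex > lk ( finished_mutex ) ;
    finished = true ;
    finished_cv.notify_one() ;
  }

  int main ( )
  {
    std::thread t ( worker ) ;
    {
      std::unique_lock < std::mutex > lk ( finished_mutex ) ;
      finished_cv.wait ( lk , [] { return finished ; } ) ; // rejects spurious wakes
    }
    std::cout << "worker done" << std::endl ;
    t.join() ;
  }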
@@ -366,18 +365,24 @@ struct joint_task
 void action_wrapper ( std::function < void() > action ,
                       joint_task * p_coordinator )
 {
-  // perform wrapped action
+  // perform the wrapped action (the 'payload' initially passed to
+  // the joint task object in the 'taskv' argument)
+
   action() ;
 
-  // under the coordinator's complete_mutex, increase the coordinator's
+  // under the coordinator's task_mutex, increase the coordinator's
   // 'done' counter and test if it's now equal to 'crew', the total
   // number of actions originating from p_coordinator
-  bool all_done ;
-  {
-    std::lock_guard<std::mutex> lk ( p_coordinator->tp.task_mutex ) ;
-    all_done = ( ++ (p_coordinator->done) == p_coordinator->crew ) ;
-  }
-  if ( all_done )
+  
+  // TODO initially I had the notify_all call after closing the scope of
+  // the lock guard, but I had random crashes. Changing the code to call
+  // notify_all with the lock guard still in effect seemed to remove the
+  // problem, but made me unsure of my logic.
+  
+  // TODO might use a different condition variable for this purpose
+
+  std::lock_guard<std::mutex> lk ( p_coordinator->tp.task_mutex ) ;
+  if ( ++ (p_coordinator->done) == p_coordinator->crew )
   {
     // this was the last action originating from p_coordinator
     // notify the coordinator that the joint task is now complete
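A note on the TODO above: the crash pattern it describes is a known hazard when the waiting side owns the condition variable. If the coordinator lives on the waiter's stack, notifying after unlocking races against the waiter destroying the object; notifying while still holding the mutex avoids that window. In sketch form (illustrative only):

  // safe: the waiter can't complete its wait before the mutex is released,
  // so 'cv' is still alive while notify runs
  //   { std::lock_guard<std::mutex> lk ( m ) ; done = true ; cv.notify_all() ; }

  // hazardous if the waiter may destroy 'cv' once 'done' is seen true:
  //   { std::lock_guard<std::mutex> lk ( m ) ; done = true ; }
  //   // <- the waiter can wake (even spuriously) here, find done == true,
  //   //    return and destroy 'cv' - the next line then touches a dead object
  //   cv.notify_all() ;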
@@ -391,7 +396,7 @@ void action_wrapper ( std::function < void() > action ,
 /// a multithreaded operation. It receives a functor (a single-threaded
 /// function used for all individual tasks), a partitioning, which contains
 /// information on which part of the data each task should process, and
-/// a set of parameters to pass on to the functor.
+/// a set of additional parameters to pass on to the functor.
 /// The individual tasks are created by binding the functor with
 ///
 /// - a range from the partitioning, describing its share of the data
@@ -402,17 +407,25 @@ void action_wrapper ( std::function < void() > action ,
 /// create a 'joint_task' object which handles the interaction with the
 /// thread pool and notifies once all tasks have completed.
 
+static thread_pool common_thread_pool ; // keep a thread pool only for multithread()
+
 template < class range_type , class ...Types >
 int multithread ( void (*pfunc) ( range_type , Types... ) ,
                   partition_type < range_type > partitioning ,
                   Types ...args )
 {
-  static thread_pool tp ; // keep a thread pool only for multithread()
-
   // get the number of ranges in the partitioning
 
   int nparts = partitioning.size() ;
   
+  if ( nparts <= 1 )
+  {
+    // if only one part is in the partitioning, we take a shortcut
+    // and execute the function right here:
+    (*pfunc) ( partitioning[0] , args... ) ;
+    return 1 ;
+  }
+
   // set up the individual tasks by binding all arguments to p_func,
 // resulting in a vector of std::function<void()> to pass to the
   // joint_task object
@@ -420,9 +433,8 @@ int multithread ( void (*pfunc) ( range_type , Types... ) ,
   std::vector < std::function < void() > > taskv ( nparts ) ;
 
   for ( int s = 0 ; s < nparts ; s++ )
-  {
+
     taskv[s] = std::bind ( pfunc , partitioning[s] , args... ) ;
-  }
 
   // create the joint_task object, passing the thread pool and
   // the vector of tasks.
@@ -430,13 +442,13 @@ int multithread ( void (*pfunc) ( range_type , Types... ) ,
 // all complete before notifying on its finished_cv condition
   // variable
 
-  joint_task jt ( tp , taskv ) ;
+  joint_task jt ( common_thread_pool , taskv ) ;
   
-  {
-    // wait until 'jt.finished' is true
-    std::unique_lock<std::mutex> lk ( jt.finished_mutex ) ;
-    jt.finished_cv.wait ( lk , [&jt] { return jt.finished ; } ) ;
-  }
+  // wait until 'jt.finished' is true
+
+  std::unique_lock<std::mutex> lk ( jt.finished_mutex ) ;
+  
+  jt.finished_cv.wait ( lk , [&jt] { return jt.finished ; } ) ;
 
   // now the joint task is complete
 
@@ -446,6 +458,7 @@ int multithread ( void (*pfunc) ( range_type , Types... ) ,
 /// this overload of multithread() takes the desired number of tasks and a
 /// range covering the 'whole' data. partition() is called with the range,
 /// resulting in a partitioning which above overload of multithread() can use.
+
 //  TODO we still have to update all calls to multithread() to pass in the
 //  number of partial tasks (so we can omit the * 8)
 
@@ -455,118 +468,23 @@ int multithread ( void (*pfunc) ( range_type , Types... ) ,
                   range_type range ,
                   Types ...args )
 {
-  // partition the range and update nparts in case the partitioning produced
-  // less ranges than we initially asked for. Rely on the presence of a function
+  if ( nparts <= 1 )
+  {
+    // if only one part is requested, we take a shortcut and execute
+    // the function right here:
+    (*pfunc) ( range , args... ) ;
+    return 1 ;
+  }
+
+  // partition the range. Rely on the presence of a function
   // partition() which can process range_type:
 
   partition_type < range_type > partitioning = partition ( range , nparts ) ;
-  nparts = partitioning.size() ;
   
   return multithread ( pfunc , partitioning , args... ) ;
 }
 
-// /// multithread_async works like multithread, but instead of waiting
-// /// for the joint task to complete, it immediately returns the joint_task
-// /// object (beware, this has to be deleted in the calling code)
-// 
-// template < class range_type , class ...Types >
-// joint_task * multithread_async ( void (*pfunc) ( range_type , Types... ) ,
-//                                  partition_type < range_type > partitioning ,
-//                                  Types ...args )
-// {
-//   int nparts = partitioning.size() ;
-//   std::vector < std::function < void() > > taskv ( nparts ) ;
-//   for ( int s = 0 ; s < nparts ; s++ )
-//   {
-//     taskv[s] = std::bind ( pfunc , partitioning[s] , args... ) ;
-//   }
-//   // here we don't wait for the tasks to complete but simply return
-//   // a pointer to the joint task object, passing control to the caller.
-//   return new joint_task ( taskv ) ;
-// }
-// 
-// /// overload of multithread_async with automatic range partitioning
-// 
-// template < class range_type , class ...Types >
-// joint_task * multithread_async ( void (*pfunc) ( range_type , Types... ) ,
-//                                  int nparts ,
-//                                  range_type range ,
-//                                  Types ...args )
-// {
-//   partition_type < range_type > partitioning = partition ( range , nparts ) ; // * 8 ) ;
-//   nparts = partitioning.size() ;
-//   
-//   return multithread_async ( pfunc , partitioning , args... ) ;
-// }
-
-/// simple routine to run a single task asynchronously
 
-// template < class ...Types >
-// void run_async ( void (*pfunc) ( Types... ) ,
-//                  Types ...args )
-// {
-//   auto task = std::bind ( pfunc , args... ) ;
-//   std::lock_guard<std::mutex> lk ( task_mutex ) ;
-//   task_queue.push ( task ) ;
-// }
-
-
-// some apply type routines, preliminary
-// the idea is to write a functor which bundles all processing steps.
-// this functor is then aplied to the target array, filling in the result
-// by calculating it value by value.
-// TODO why is this code in multithread.h??
-
-template < typename view_type >
-void apply_point_func ( void (*pfunc) ( typename view_type::value_type & ) ,
-                        shape_range_type < view_type::actual_dimension > range ,
-                        view_type * p_view )
-{
-  auto view = p_view->subarray ( range[0] , range[1] ) ;
-  for ( auto & v : view )
-    pfunc ( v ) ;
-}
-  
-#ifdef USE_VC
-
-template < typename view_type >
-void apply_v_point_func ( void (*pvfunc) ( typename view_type::value_type * ) ,
-                          shape_range_type < view_type::actual_dimension > range ,
-                          view_type * p_view )
-{
-  auto view = p_view->subarray ( range[0] , range[1] ) ;
-  typedef typename view_type::value_type value_type ;
-  typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
-  const int vsize = vspline::vector_traits < ele_type > :: vsize ;
-  int aggregates = view.size() / vsize ;
-  int leftover = view.size() - aggregates * vsize ;
-  assert ( leftover == 0 ) ;
-  ele_type * base = (ele_type*) (view.data()) ;
-  for ( int a = 0 ; a < aggregates ; a++ )
-    pvfunc ( base + vsize * a ) ;    
-}
- 
-template < typename view_type >
-void apply_v_point_func ( void (*pvfunc) ( typename view_type::value_type * ) ,
-                          void (*pfunc) ( typename view_type::value_type & ) ,
-                          shape_range_type < view_type::actual_dimension > range ,
-                          view_type * p_view )
-{
-  auto view = p_view->subarray ( range[0] , range[1] ) ;
-  typedef typename view_type::value_type value_type ;
-  typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
-  const int vsize = vspline::vector_traits < ele_type > :: vsize ;
-  int aggregates = view.size() / vsize ;
-  int leftover = view.size() - aggregates * vsize ;
-  ele_type * base = (ele_type*) (view.data()) ;
-  for ( int a = 0 ; a < aggregates ; a++ )
-    pvfunc ( base + vsize * a ) ;    
-  for ( int l = 0 ; l < leftover ; l++ )
-    pfunc ( view [ vsize * aggregates + l ] ) ;    
-}
- 
-#endif
-    
 } ; // end of namespace vspline
 
 #endif // #ifndef VSPLINE_MULTITHREAD_H
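To round off multithread.h: a usage sketch matching the signatures visible in this diff (it assumes vspline's headers; fill_with_42 is a made-up payload, and the range must be the payload's first parameter):

  void fill_with_42 ( vspline::shape_range_type<2> range ,
                      vigra::MultiArrayView < 2 , float > * p_data )
  {
    // each task processes only its own subarray of the data
    auto chunk = p_data->subarray ( range[0] , range[1] ) ;
    for ( auto & v : chunk )
      v = 42.0f ;
  }

  // in calling code:
  //   vigra::MultiArray < 2 , float > storage ( vigra::Shape2 ( 1000 , 1000 ) ) ;
  //   vigra::MultiArrayView < 2 , float > data ( storage ) ;
  //   vspline::shape_range_type<2> range ( vspline::shape_type<2>() , data.shape() ) ;
  //   vspline::multithread ( & fill_with_42 , 8 * vspline::ncores , range , &data ) ;

Note how the data are passed by pointer, not by reference: multithread()'s parameter pack does not accommodate references, as the comments in this file point out.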
diff --git a/remap.h b/remap.h
index ef8bc72..47233d6 100644
--- a/remap.h
+++ b/remap.h
@@ -52,12 +52,10 @@
 /// st_remap is the single-threaded implementation; remap itself partitions its work
 /// and creates several threads, each running one instance of st_remap.
 ///
-/// all remap routines take several template arguments:
+/// all remap routines take two template arguments:
 ///
-/// - coordinate_type:   TinyVector of float or double for coordinates, or split type
-/// - value_type:        like, float, double, complex<>, pixels, TinyVectors etc.
 /// - interpolator_type: functor object yielding values for coordinates
-/// - dim_out:           number of dimensions of output array
+/// - dim_target:        number of dimensions of output array
 ///
 /// remaps to other-dimensional objects are supported. This makes it possible to,
 /// for example, remap from a volume to a 2D image, using a 2D warp array containing
@@ -91,6 +89,14 @@
 ///
 /// This file also has code to evaluate a b-spline at positions in a mesh grid, which can be used
 /// for scaling, and for separable geometric transformations.
+///
+/// The current implementation of the remap functionality uses a straightforward mode of
+/// operation, which factors out the various needed tasks into separate bits of code. The
+/// result data are acquired by 'pulling' them into the target array by repeatedly calling
+/// a functor yielding the results. This functor is a closure containing all logic needed
+/// to produce the result values in scan order of the target array. While the two remap routines
+/// and grid_eval should cover most use cases, it's quite possible to use the routine fill()
+/// itself, passing in a suitable functor.
 
 #ifndef VSPLINE_REMAP_H
 #define VSPLINE_REMAP_H
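What a 'suitable functor' for fill() might look like: a hedged sketch of the generator interface described further down in this file (operator() overloads for one value and for vsize values, plus subrange() and bindOuter(); the names and the constant generator are illustrative, not vspline code):

  template < int dim >
  struct const_generator
  {
    typedef float value_type ;
    enum { vsize = 8 } ;

    float v ;   // the constant to emit

    void operator() ( value_type & target ) const   // single-value form
    { target = v ; }

    void operator() ( value_type * p_target ) const // vectorized form:
    {                                               // deposit vsize values
      for ( int e = 0 ; e < vsize ; e++ )
        p_target[e] = v ;
    }

    template < typename range_t >                   // limit to a subrange -
    const_generator subrange ( const range_t & ) const
    { return *this ; }                              // a constant ignores it

    const_generator < dim - 1 > bindOuter ( int ) const // peel off one axis
    { return const_generator < dim - 1 > { v } ; }
  } ;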
@@ -103,289 +109,9 @@ namespace vspline {
 using namespace std ;
 using namespace vigra ;
 
-/// in st_remap() we want to derive the elementary type of a coordinate from
-/// the template argument 'coordinate_type'. We use a traits class for this purpose.
-/// per default, whatever the coordinate_type may be, take it to be the rc_type as well.
-/// This works for simple 1D coordinates like float or double.
-
-/// declaring an appropriate evaluator is quite a mouthful, so we use
-/// this 'using' expresssion to mak it more handy:
-
-template < int dimension , typename value_type , typename coordinate_type >
-using evaluator_type
-= evaluator < dimension ,
-              value_type ,
-              typename coordinate_traits < coordinate_type > :: rc_type ,
-              typename coordinate_traits < coordinate_type > :: ic_type > ;
-
-/// st_remap is the single-thread routine to perform a remap with a given warp array
-/// at p_warp (containing coordinates) into a target array at p_output, which takes the
-/// interpolated values at the locations the coordinates indicate.
-///              
-/// While calling this function directly is entirely possible, code wanting to perform
-/// a remap would rather call the next function which partitions the task at hand and
-/// uses a thread pool to process the partial jobs.
-///              
-/// The first parameter, range, specifies a subarray of warp and output which this
-/// routine is meant to process. this may be the full range, but normally it will cover
-/// only equally sized parts of these arrays, where the precise extent is determined
-/// by multithread() or it's caller in the next routine, see there.
-///              
-/// Note how I pass in several pointers. Normally I'd pass such data by reference,
-/// but I can't do this here (or haven't found a way yet) because of my parameter-handling
-/// in multithread() which I couldn't get to work with references. The effect is the same.
-/// Note how this routine takes 'interpolator_type' as a template argument. Since the code
-/// in this routine is not specific to b-splines, it accepts not only b-spline evaluators
-/// but any object satisfying the interface defined in class interpolator. This opens the
-/// remapping code for other interpolation methods as well.
-///              
-/// The interface defined by class interpolator is minimal and does not contain pure
-/// virtual methods for many of the methods defined in class evaluator, but then many
-/// of class evaluator's methods only are meaningful for b-spline evaluation, like
-/// treatment of 'split coordinates'. The remap code in this method and it's companion
-/// below only use methods from the interpolator interface.
-              
-template < typename coordinate_type ,    // type of coordinates in the warp array
-           typename value_type ,         // type of values to produce
-           typename interpolator_type  , // functor object yielding values for coordinates
-           int dim_out >                 // number of dimensions of output array
-void st_remap ( shape_range_type < dim_out >                  range ,
-                const interpolator_type * const               p_ev ,
-                const MultiArrayView < dim_out , coordinate_type > * const p_warp ,
-                MultiArrayView < dim_out , value_type > *     p_output ,
-                bool use_vc = true
-              )
-{
-  enum { dim_in = coordinate_traits < coordinate_type > :: dimension } ;
-  typedef typename coordinate_traits < coordinate_type > :: rc_type rc_type ;
-  typedef typename interpolator_type::ele_type ele_type ;
-  
-  // pick out the subarrays specified by 'range'
-  // this idiom reoccurs throughout, since the multithreading code I use does not
-  // actually partition the containers it handles, but instead only creates information
-  // on how to partition them, and then passes this information to the functionals
-  // used for starting the threads. While this results in slight bloat of the single
-  // thread code, it makes the calling side much more legible and requires no use of
-  // bind, tuples or the likes.
-
-  auto warp = p_warp->subarray ( range[0] , range[1] ) ;
-  auto output = p_output->subarray ( range[0] , range[1] ) ;
-
-  auto source_it = warp.begin() ;
-  auto target_it = output.begin() ;
-  
-  // since coding for all variations of strided and unstrided memory in warp and output
-  // is a bit much, we only differentiate between the ideal case where all memory is unstrided
-  // and all other cases, where we use buffering of vsize elements. In both cases, we pass
-  // consecutive memory to the evaluator, which can deinterleave consistently.
-  // TODO: this pattern of coiteration/cotraversal with suitable iteration methods is
-  // reoccuring and might be worth factoring out, also because these traversals are
-  // often needed in collateral code, like in apply or transform functions.
-
-  int leftover = warp.elementCount() ;
-
-#ifdef USE_VC
-
-  typedef typename interpolator_type::ele_v ele_v ;
-  const int vsize = interpolator_type::vsize ;
-
-  if ( use_vc )
-  {
-    int aggregates = warp.elementCount() / vsize ;            // number of full vectors
-    leftover = warp.elementCount() - aggregates * vsize ;     // any leftover single values
-    coordinate_type * source = warp.data() ;                  // acces to memory
-    value_type * destination = output.data() ;
-
-    if ( warp.isUnstrided() )
-    {
-      // best case: warp array has consecutive memory
-      if ( output.isUnstrided() )
-      {
-        // best case: output array has consecutive memory, no need to buffer
-        for ( int a = 0 ; a < aggregates ; a++ , source += vsize , destination += vsize )
-        {
-          p_ev->v_eval ( source , destination ) ;
-        }
-        target_it += aggregates * vsize ;
-      }
-      else
-      {
-        // we fall back to buffering and storing individual result values
-        // TODO while this is a straightforward implementation, it should be more efficient
-        // to (de)interleave here and call the fully vectorized evaluation code.
-        value_type target_buffer [ vsize ] ;
-        for ( int a = 0 ; a < aggregates ; a++ , source += vsize )
-        {
-          p_ev->v_eval ( source , target_buffer ) ;
-          for ( int e = 0 ; e < vsize ; e++ )
-          {
-            *target_it = target_buffer[e] ;
-            ++target_it ;
-          }
-        }
-      }
-      source_it += aggregates * vsize ;
-    }
-    else
-    {
-      // we fall back to loading and buffering individual warp values
-      coordinate_type source_buffer [ vsize ] ;
-      if ( output.isUnstrided() )
-      {
-        // best case: output array has consecutive memory
-        for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
-        {
-          for ( int e = 0 ; e < vsize ; e++ )
-          {
-            source_buffer[e] = *source_it ;
-            ++source_it ;
-          }
-          p_ev->v_eval ( source_buffer , destination ) ;
-        }
-        target_it += aggregates * vsize ;
-      }
-      else
-      {
-        // we also fall back to buffering and storing individual result values
-        value_type target_buffer [ vsize ] ;
-        for ( int a = 0 ; a < aggregates ; a++ )
-        {
-          for ( int e = 0 ; e < vsize ; e++ )
-          {
-            source_buffer[e] = *source_it ;
-            ++source_it ;
-          }
-          p_ev->v_eval ( source_buffer , target_buffer ) ;
-          for ( int e = 0 ; e < vsize ; e++ )
-          {
-            *target_it = target_buffer[e] ;
-            ++target_it ;
-          }
-        }
-      }
-    }
-  }
-  
-#endif // USE_VC
-
-  // process leftovers, if any - if vc isn't used, this loop does all the processing
-  while ( leftover-- )
-  {
-    // process leftovers with single-value evaluation
-    p_ev->eval ( *source_it , *target_it ) ;
-    ++source_it ;
-    ++target_it ;
-  }
-}
-
-/// this remap variant performs the remap with several threads. The routine above is
-/// single-threaded, but all individual evaluations are independent of each other.
-/// So making it multi-threaded is trivial: just split the warp array and target
-/// array n-ways and have each thread process a chunk. Initially I used this approach
-/// directly, but later I switched to multithreading code which doesn't actually touch
-/// the containers, but instead only passes partitioning information to the individual
-/// threads, leaving it to them to split the containers accordingly, which results in
-/// simple and transparent code with only few limitations, see multithread.h
-
-template < typename coordinate_type ,    // type of coordinates in the warp array
-           typename value_type ,         // type of values to produce
-           typename interpolator_type  , // functor object yielding values for coordinates
-           int dim_out >                 // number of dimensions of output array
-void remap ( const interpolator_type & ev ,
-             const MultiArrayView < dim_out , coordinate_type > & warp ,
-             MultiArrayView < dim_out , value_type > & output ,
-             bool use_vc = true ,
-             int nthreads = ncores
-           )
-{
-  enum { dim_in = coordinate_traits < coordinate_type > :: dimension } ;
-
-  // check shape compatibility
-  
-  if ( output.shape() != warp.shape() )
-  {
-    throw shape_mismatch ( "remap: the shapes of the warp array and the output array do not match" ) ;
-  }
-
-  // set up 'range' to cover a complete array of output's size
-  // (or, warp's size, which is the same)
-  
-  shape_range_type < dim_out > range ( shape_type < dim_out > () , output.shape() ) ;
-  
-  // call multithread(), specifying the single-threaded remap routine as the functor
-  // to invoke the threads, passing in 'range', which will be partitioned by multithread(),
-  // followed by all the other parameters the single-threaded remap needs, which is
-  // pretty much the set of parameters we've received here, with the subtle difference
-  // that we can't pass anything on by reference and use pointers instead.
-
-  multithread ( & st_remap < coordinate_type ,
-                             value_type ,
-                             interpolator_type ,
-                             dim_out > ,
-                nthreads * 8 , // heuristic. desired number of partitions
-                range ,        // 'full' range which is to be partitioned
-                &ev ,          // interpolator_type object
-                &warp ,        // warp array
-                &output ,      // target array
-                use_vc ) ;     // flag to switch use of Vc on/off
-} ;
-
 template < int dimension >
 using bcv_type = vigra::TinyVector < bc_code , dimension > ;
 
-/// This is a variant of remap, which directly takes an array of values and remaps it,
-/// internally creating a b-spline of given order just for the purpose. This is used for
-/// one-shot remaps where the spline isn't reused.
-
-template < typename coordinate_type , // type of coordinates in the warp array
-           typename value_type ,      // type of values to produce
-           int dim_out >              // number of dimensions of output array
-int remap ( const MultiArrayView < coordinate_traits < coordinate_type > :: dimension ,
-                             value_type > input ,
-            const MultiArrayView < dim_out , coordinate_type > & warp ,
-            MultiArrayView < dim_out , value_type > & output ,
-            bcv_type < coordinate_traits < coordinate_type > :: dimension > bcv
-              = bcv_type < coordinate_traits < coordinate_type > :: dimension >
-                ( MIRROR ) ,
-            int degree = 3 ,
-            bool use_vc = true ,
-            int nthreads = ncores
-          )
-{
-  const int dim_in = coordinate_traits < coordinate_type > :: dimension ;
-  typedef typename coordinate_traits < coordinate_type > :: rc_type rc_type ;
-  typedef typename coordinate_traits < coordinate_type > :: ic_type ic_type ;
-
-  // check shape compatibility
-  
-  if ( output.shape() != warp.shape() )
-  {
-    throw shape_mismatch ( "remap: the shapes of the warp array and the output array do not match" ) ;
-  }
-
-  // create the bspline object
-  // TODO may want to specify tolerance here instead of using default
-  
-  bspline < value_type , dim_in > bsp ( input.shape() , degree , bcv ) ;
-  
-  // prefilter, taking data in 'input' as knot point data
-  
-  bsp.prefilter ( use_vc , nthreads , input ) ;
-
-  // create an evaluator over the bspline
-
-  typedef evaluator < dim_in , value_type , rc_type , ic_type > evaluator_type ;
-  evaluator_type ev ( bsp ) ;
-  
-  // and call the other remap variant,
-  // passing in the evaluator, the coordinate array and the target array
-  
-  remap < coordinate_type , value_type , evaluator_type , dim_out >
-    ( ev , warp , output , use_vc , nthreads ) ;
-    
-  return 0 ;
-}
-
 // Next we have some collateral code to get ready for the transformation-based remap
 
 /// declaration of a generalized coordinate transformation. In vspline, we're using
@@ -416,7 +142,7 @@ using transform
 /// if there is no vectorized code at hand. This is less efficient, but often just enough,
 /// especially if the transformation is simple.
 ///
-/// class transformation is constructed either with a single coordinate transformation
+/// struct transformation is constructed either with a single coordinate transformation
 /// functor, or, if USE_VC is defined, optionally with a vectorized coordinate transformation
 /// functor as second constructor argument. The functors have to satisfy the two type aliases
 /// transform and vtransform (see above). For illustration, consider this:
@@ -436,7 +162,7 @@ using transform
 /// vspline::transformation < float , 2 , 2 >
 ///   tf ( tf_identity<float> , vtf_identity<Vc::float_v>  ) ;
 ///
-/// we could also using the single-argument constructor to the same effect, but not using
+/// we could also use the single-argument constructor to the same effect, not using
 /// vectorized transformations:
 ///                              
 /// vspline::transformation < float , 2 , 2 >
@@ -447,7 +173,7 @@ template < class rc_type ,   /// elementary coordinate type
            int dim_source ,  /// dimension of outgoing coordinates (== dim. of coefficient array)
            int vsize = 1     /// number of elements in simdized coordinate
          >
-class transformation
+struct transformation
 {
   typedef transform < rc_type , dim_target , dim_source > transform_type ;
 
@@ -528,7 +254,8 @@ public :
   /// class transformation's operator() delegates to the functor passed in at construction.
   /// it transforms a target coordinate, c_target, into a source coordinate c_source
 
-  void operator() ( const target_nd_rc_type& c_target , source_nd_rc_type& c_source )
+  void operator() ( const target_nd_rc_type& c_target ,
+                    source_nd_rc_type& c_source ) const
   {
     tf ( c_target , c_source ) ;
   }
@@ -537,7 +264,8 @@ public :
 
   /// if USE_VC is defined, we overload operator() for vectorized arguments
   
-  void operator() ( const target_nd_rc_v& c_target , source_nd_rc_v& c_source )
+  void operator() ( const target_nd_rc_v& c_target ,
+                    source_nd_rc_v& c_source ) const
   {
     vtf ( c_target , c_source ) ;
   }
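For completeness, the identity transformation referred to in the comment block above, spelled out under the signature which class transformation's operator() suggests (a sketch; the exact 'transform' alias is defined earlier in this file and not shown in the diff):

  template < typename rc_type >
  void tf_identity ( const vigra::TinyVector < rc_type , 2 > & c_target ,
                     vigra::TinyVector < rc_type , 2 > & c_source )
  {
    c_source = c_target ;  // source coordinate == target coordinate
  }

  // vspline::transformation < float , 2 , 2 > tf ( tf_identity<float> ) ;
  // now tf ( c_target , c_source ) simply copies the coordinate.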
@@ -550,43 +278,53 @@ public :
 /// supports dereferencing and preincrement, but for the intended purpose that's
 /// enough. The iterator is cyclical, so the number of times it's used has to be
 /// controlled by the calling code.
+///
 /// dereferencing this object will yield a vectorized coordinate which is guaranteed
 /// to deinterleave to vsize valid ordinary coordinates, but it is only guaranteed that
 /// these ordinary coordinates are unique if several preconditions are met:
+///
 /// - the shape (end - start) must have more than vsize entries
+///
 /// - the iteration doesn't proceed beyond the number of full vectors
+///
 /// The vectorized coordinate delivered at the 'wrap-around-point', so one iteration
 /// after the last full set has been taken, will be partly wrapped around to the start
 /// of the iteration. This makes sure the contained ordinary coordinates are valid,
 /// but accessing data through it will again touch elements which have already been
 /// seen. As long as the indices aren't used for an in-place operation, recalculating
 /// a few values does no harm.
+
 // TODO this is utility code and might be placed into another header, probably next to
 // the routine producing successive gather indices for vectorized array traversal
+
 // TODO when experimenting with coordinate iterators, I found that vectorized code for
 // creating gather/scatter indexes performed worse than simply filling the index vector
 // with successive single indices generated by vigra, and using vigra's facilities
 // also simplifies matters, see the use in filter.h, ele_aggregating_filter.
 
-template < class _rc_v , int _dimension >
+template < class rc_type , class ic_type , int _dimension , int vsize >
 struct coordinate_iterator_v
 {
   enum { dimension = _dimension } ;
-  typedef vigra::TinyVector < int , dimension > shape_type ;
 
-  typedef _rc_v rc_v ;
-  enum { vsize = rc_v::Size } ;
+  typedef vigra::TinyVector < rc_type , dimension > nd_rc_type ;
+  typedef vigra::TinyVector < ic_type , dimension > nd_ic_type ;
+
+  typedef typename vector_traits < rc_type , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_type , vsize > :: type ic_v ;
+  
+  typedef vigra::TinyVector < ic_v , dimension > nd_ic_v ;
   typedef vigra::TinyVector < rc_v , dimension > nd_rc_v ;
 
-  const shape_type start ;
-  const shape_type end ;
-  const shape_type shape ;
+  const nd_ic_type start ;
+  const nd_ic_type end ;
+  const nd_ic_type shape ;
   
-  nd_rc_v c ;
+  nd_ic_v c ;
 
 public:
 
-  coordinate_iterator_v ( shape_type _start , shape_type _end )
+  coordinate_iterator_v ( nd_ic_type _start , nd_ic_type _end )
   : start ( _start ) ,
     end ( _end ) ,
     shape ( _end - _start )
@@ -597,7 +335,7 @@ public:
 
     for ( int i = 0 ; i < vsize ; i++ )
     {
-      shape_type index = *mci + start ;
+      nd_ic_type index = *mci + start ;
       ++mci ;
       if ( mci >= mci_end )
         mci = mci_start ;
@@ -606,12 +344,12 @@ public:
     }
   } ;
 
-  const nd_rc_v& operator*()
+  const nd_ic_v & operator*()
   {
     return c ;
   }
 
-  coordinate_iterator_v operator++()
+  coordinate_iterator_v & operator++()
   {
     c[0] += vsize ;        // increase level 0 values
     auto mask = ( c[0] >= end[0] ) ; // mask overflow
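The cyclical wrap-around described above, reduced to scalar indices in a self-contained sketch (the real iterator does this with vectorized n-dimensional coordinates):

  #include <vector>
  #include <iostream>

  int main ( )
  {
    const int vsize = 4 ;
    std::vector < int > indices { 0 , 1 , 2 , 3 , 4 , 5 } ; // six coordinates
    std::size_t mci = 0 ;

    for ( int pass = 0 ; pass < 2 ; pass++ )   // fetch two 'vectors'
    {
      std::cout << "vector:" ;
      for ( int e = 0 ; e < vsize ; e++ )
      {
        std::cout << " " << indices [ mci ] ;
        if ( ++mci >= indices.size() )
          mci = 0 ;              // wrap around: entries remain valid
      }
      std::cout << std::endl ;
    }
    // first fetch: 0 1 2 3 - second fetch: 4 5 0 1, revisiting 0 and 1,
    // which is harmless as long as the operation is not in-place
  }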
@@ -706,137 +444,552 @@ public:
 //   } ;
 // } ;
 
-/// this function provides a generalized geometric transformation
-/// In image transformations, often the source location at which interpolation
-/// of the source image is required is defined by a transformation of the location
-/// of the target coordinate. There are (at least) two approaches to handle this.
-/// The first is what is implemented in the previous implementation of remap: we
-/// calculate a 'warp' array full of transformed coordinates and process it en bloc.
-/// Alternatively, with the coordinate transformation at hand, we can write a remap
-/// variant which performs the coordinate transformation for each target coordinate
-/// as it goes along, saving the construction of the warp array.
+/// struct _fill contains the implementation of the 'engine' used for remap-like
+/// functions. The design logic is this: a remap will ultimately produce an array
+/// of results. This array is filled in standard scan order sequence by repeated
+/// calls to a functor containing all the logic to produce values in the required
+/// order. The functor is like a closure, being set up initially with all parameters
+/// needed for the task at hand (like with a warp array, a transformation, a genuine
+/// generator function etc.). Since the functor controls the path of the calculation
+/// from whatever starting point to the production of the final result, there are no
+/// intermediate containers for intermediate results. Since the remap process is
+/// mainly memory-bound, this strategy helps keeping memory use low. The data can
+/// be produced one by one, but the code has vectorized operation as well, which
+/// brings noticeable performance gain. With vectorized operation, instead of producing
+/// single values, the engine produces bunches of values. This operation is transparent
+/// to the caller, since the data are deposited in normal interleaved fashion. The
+/// extra effort for vectorized operation lies in the implementation of the generator
+/// functor and is reasonably straightforward. If only the standard remap functions
+/// are used, the user can remain ignorant of the vectorization.
 ///
-/// The transformation is passed in by using a 'transformation' object, which is a
-/// wrapper around the actual transformation routine(s). Note that this routine is
-/// also suitable for any type of interpolator satisfying the interface defined in
-/// class interpolator in interpolator.h.
-
-template < class interpolator_type , // type satisfying the interface in class interpolator
-           int dim_target ,          // dimension of incoming coordinates (== dim. of target array)
-           int dim_source >          // dimension of outgoing coordinates (== dim. of coefficient array)
-void st_tf_remap ( shape_range_type < dim_target > range ,
-                   const interpolator_type * const p_interpolator ,
-                   transformation < typename interpolator_type::rc_type ,
-                                    dim_target ,
-                                    dim_source ,
-                                    interpolator_type::vsize > tf ,
-                   vigra::MultiArrayView < dim_target ,
-                                           typename interpolator_type::value_type > * p_output ,
-                   bool use_vc = true
-                 )
+/// Why is _fill an object? It would be a function if partial specialization of functions
+/// were allowed. Since this is not so, we have to use a class and put the function in its
+/// operator(). Now we can partially specialize the object with the desired effect.
+///
+/// struct _fill's operator() takes an object of class generator_type. This object
+/// has to satisfy a few requirements:
+///
+/// - it has to have an overloaded operator() with two signatures: one taking
+///   a pointer to vsize value_type, one taking a reference to a single value_type.
+///   these arguments specify where to deposit the generator's output.
+///
+/// - it has to offer a bindOuter routine producing a subdimensional generator
+///   to supply values for a slice of output
+///
+/// - it has to offer a subrange routine, limiting output to a subarray
+///   of the 'whole' output
+
+/// _fill is used by st_fill. It's an object implementing a hierarchical fill
+/// of the target array where this makes sense: If the whole target array is
+/// unstrided, the best-case loop is executed straight away. Otherwise, we check
+/// if the target array is unstrided in dimension 0, in which case we iterate
+/// over subdimensional slices. Only if even dimension 0 is strided do we stop
+/// recursing and use the less efficient buffering loop, which at least avoids
+/// the recursion overhead. All in all this
+/// seems like a good compromise.
+
+template < typename generator_type  , // functor object yielding values
+           int dim_out >              // number of dimensions of output array
+struct _fill
 {
-  typedef typename interpolator_type::ele_type ele_type ;
-  typedef typename interpolator_type::rc_type rc_type ;
-  
-  auto output = p_output->subarray ( range[0] , range[1] ) ;
-  auto offset = range[0] ;
+  void operator() ( generator_type & gen ,
+                    MultiArrayView < dim_out , typename generator_type::value_type >
+                      & output ,
+                    bool use_vc = true
+                  )
+  {
+    typedef typename generator_type::value_type value_type ;
+    
+    auto target_it = output.begin() ;  
+    int leftover = output.elementCount() ;
 
-#ifdef USE_VC
+  #ifdef USE_VC
 
-  typedef typename interpolator_type::value_type value_type ;
-  typedef typename interpolator_type::ele_v ele_v ;
-  typedef typename interpolator_type::rc_v rc_v ;
-  typedef typename interpolator_type::mc_ele_v mc_ele_v ;
-  const int vsize = interpolator_type::vsize ;
+    const int vsize = generator_type::vsize ;
 
-   // incoming coordinates (into output, which has dim_target dimensions)
-  vigra::TinyVector < rc_v , dim_target > c_target ;
+    if ( use_vc )
+    {
+      int aggregates = leftover / vsize ;        // number of full vectors
+      leftover -= aggregates * vsize ;           // remaining leftover single values
+      value_type * destination = output.data() ; // pointer to target's memory
 
-  // transformed coordinates (into input, which has dim_source dimensions)
-  vigra::TinyVector < rc_v , dim_source > c_source ;
+      if ( output.isUnstrided() )
+      {
+        // best case: output array has consecutive memory
+        for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
+        {
+          gen ( destination ) ; // pass pointer to destination to generator
+        }
+        target_it += aggregates * vsize ; // advance target_it to remaining single values
+      }
 
-#endif
-  
-  auto target_it = createCoupledIterator ( output ) ;
-  int leftover = output.elementCount() ;
-  
-#ifdef USE_VC
+      else if ( output.isUnstrided(0) )
+      {
+        // at least dimension 0 is unstrided, so recursion makes sense
+        for ( int c = 0 ; c < output.shape ( dim_out - 1 ) ; c++ )
+        {
+          // recursively call _fill for each slice along the highest axis
+          auto sub_output = output.bindOuter ( c ) ;
+          auto sub_gen = gen.bindOuter ( c ) ;
+          _fill < decltype ( sub_gen ) , dim_out - 1 >() ( sub_gen , sub_output , use_vc ) ;
+        }
+        leftover = 0 ; // no leftovers in this case
+      }
 
-  if ( use_vc )
+      else
+      {
+        // strided even in dimension 0, no point in recursing
+        // fall back to buffering and storing individual result values
+        value_type target_buffer [ vsize ] ;
+        for ( int a = 0 ; a < aggregates ; a++ )
+        {
+          gen ( target_buffer ) ;        // pass pointer to buffer to generator
+          for ( int e = 0 ; e < vsize ; e++ )
+          {
+            *target_it = target_buffer[e] ;   // and deposit in target
+            ++target_it ;
+          }
+        }
+      }
+    }
+    
+  #endif // USE_VC
+
+    // process leftovers, if any - if vc isn't used, this loop does all the processing
+    while ( leftover-- )
+    {
+      // process leftovers with single-value evaluation
+      gen ( *target_it ) ;
+      ++target_it ;
+    }
+  }
+} ;
+
+/// partial specialization of struct _fill for dimension 1. Here we only have
+/// to distinguish two cases: either the array is unstrided, so we can use the efficient
+/// loop, or it is not, in which case we resort to buffering.
+
+template < typename generator_type > // functor object yielding values
+struct _fill < generator_type , 1 >
+{
+  void operator() ( generator_type & gen ,
+                    MultiArrayView < 1 , typename generator_type::value_type >
+                      & output ,
+                    bool use_vc = true
+                  )
   {
-    int aggregates = output.elementCount() / vsize ;        // number of full vectors
-    leftover = output.elementCount() - aggregates * vsize ; // any leftover single values
-    value_type * destination = output.data() ;              // pointer to target data
+    typedef typename generator_type::value_type value_type ;
     
-    // use utility class coordinate_iterator_v to produce vectorized integral coordinates:
-    coordinate_iterator_v < rc_v , dim_source >
-      civ ( offset , offset + output.shape() ) ;
+    auto target_it = output.begin() ;  
+    int leftover = output.elementCount() ;
+
+  #ifdef USE_VC
 
-    if ( output.isUnstrided() )
+    const int vsize = generator_type::vsize ;
+
+    if ( use_vc )
     {
-      for ( int a = 0 ; a < aggregates ; a++ )
+      int aggregates = leftover / vsize ;        // number of full vectors
+      leftover -= aggregates * vsize ;           // remaining leftover single values
+      value_type * destination = output.data() ; // pointer to target's memory
+
+      if ( output.isUnstrided() )
       {
-        c_target = *civ ;            // for clarity: vectorized coordinate into target
-        tf ( c_target , c_source ) ; // perform the (reverse) coordinate transformation
-        ++civ ;                      // advance civ
-        // use the interpolator to yield a value at coordinate c_source
-        p_interpolator->v_eval ( c_source , destination ) ;
-        destination += vsize ;
+        // best case: output array has consecutive memory
+        for ( int a = 0 ; a < aggregates ; a++ , destination += vsize )
+        {
+          gen ( destination ) ; // pass pointer to destination to generator
+        }
+        target_it += aggregates * vsize ; // advance target_it to remaining single values
       }
-      // update target_it which we haven't used
-      target_it += aggregates * vsize ;
-    }
-    else
-    {
-      // alternative evaluation if we can't write straight to destination:
-      // let the interpolator yield into target_buffer and then scatter target_buffer
-      // to successive target values, as directed by target_it
-      value_type target_buffer [ vsize ] ;
-      for ( int a = 0 ; a < aggregates ; a++ )
+
+      else
       {
-        c_target = *civ ;            // for clarity: vectorized coordinate into target
-        tf ( c_target , c_source ) ; // perform the (reverse) coordinate transformation
-        ++civ ;                      // advance civ
-        p_interpolator->v_eval ( c_source , target_buffer ) ;
-        for ( int e = 0 ; e < vsize ; e++ )
+        // fall back to buffering and storing individual result values
+        value_type target_buffer [ vsize ] ;
+        for ( int a = 0 ; a < aggregates ; a++ )
         {
-          target_it.template get<1>() = target_buffer[e] ;
-          ++target_it ;
+          gen ( target_buffer ) ;        // pass pointer to buffer to generator
+          for ( int e = 0 ; e < vsize ; e++ )
+          {
+            *target_it = target_buffer[e] ;   // and deposit in target
+            ++target_it ;
+          }
         }
       }
     }
+    
+  #endif // USE_VC
+
+    // process leftovers, if any - if Vc isn't used, this loop does all the processing
+    while ( leftover-- )
+    {
+      // process leftovers with single-value evaluation
+      gen ( *target_it ) ;
+      ++target_it ;
+    }
   }
+} ;
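+
+// a side note on the dispatch criterion used above: vigra's isUnstrided()
+// tests whether a view's memory is consecutive. A minimal sketch (assuming
+// standard VIGRA arrays; the names are illustrative):
+//
+//   vigra::MultiArray < 2 , float > a ( vigra::Shape2 ( 100 , 100 ) ) ;
+//   assert ( a.isUnstrided() ) ;    // the whole array: consecutive memory
+//   auto row = a.bindOuter ( 3 ) ;  // a row: still consecutive
+//   assert ( row.isUnstrided() ) ;
+//   auto col = a.bindInner ( 3 ) ;  // a column: stride of 100 floats
+//   assert ( ! col.isUnstrided() ) ;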
+
+/// single-threaded fill. This routine receives the range to process and the generator
+/// object capable of providing result values. The generator object is set up to provide
+/// values for the desired subrange and then passed to _fill, which handles the calls to
+/// the generator object and the depositing of the result values into the target array.
+
+template < typename generator_type  , // functor object yielding values
+           int dim_out >              // number of dimensions of output array
+void st_fill ( shape_range_type < dim_out >                  range ,
+               generator_type * const                        p_gen ,
+               MultiArrayView < dim_out , typename generator_type::value_type > * p_output ,
+               bool use_vc = true )
+{
+  // pick out output's subarray specified by 'range'
+
+  auto output = p_output->subarray ( range[0] , range[1] ) ;
   
-#endif // USE_VC
+  // limit the generator to cover the same range
+  
+  auto gen = p_gen->subrange ( range ) ;
+  
+  // have the results computed and put into the target
+
+  _fill < generator_type , dim_out >() ( gen , output , use_vc ) ;
+}
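+
+// for illustration: with a 2D output of shape (100,100), multithread() might
+// hand st_fill the range [ (0,50) , (100,100) ) - a hypothetical partition.
+// p_output->subarray() then picks out the corresponding half of the target
+// array, and p_gen->subrange() a generator yielding precisely its values.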
 
-  vigra::TinyVector < rc_type , dim_target > cs_target ;
-  vigra::TinyVector < rc_type , dim_source > cs_source ;
+/// multithreaded fill. This is the top-level fill routine. It takes a functor capable
+/// of delivering successive result values (in the target array's scan order), and calls
+/// this functor repeatedly until 'output' is full.
+/// This task is distributed to several worker threads by means of 'multithread',
+/// which in turn uses st_fill, the single-threaded fill routine.
+
+template < typename generator_type  , // functor object yielding values
+           int dim_target >              // number of dimensions of output array
+void fill ( generator_type & gen ,
+            MultiArrayView < dim_target , typename generator_type::value_type >
+              & output ,
+            bool use_vc = true ,
+            int nthreads = ncores
+          )
+{
+  // set up 'range' to cover a complete array of output's size
   
-  // process leftovers, if any - if vc wasn't used, this loop does all the processing
-  while ( leftover-- )
+  shape_range_type < dim_target > range ( shape_type < dim_target > () , output.shape() ) ;
+  
+  // call multithread(), specifying the single-threaded fill routine as the functor
+  // the threads will invoke, and passing in 'range', which multithread() partitions,
+  // followed by all other parameters the single-threaded fill needs - pretty much
+  // the set of parameters received here, with the subtle difference that we can't
+  // pass anything on by reference and use pointers instead.
+
+  multithread ( & st_fill < generator_type , dim_target > ,
+                nthreads * 8 , // heuristic. desired number of partitions
+                range ,        // 'full' range which is to be partitioned
+                &gen ,         // generator_type object
+                &output ,      // target array
+                use_vc ) ;     // flag to switch use of Vc on/off
+}
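+
+// as a usage illustration, here is a minimal hand-rolled generator satisfying
+// fill()'s interface - hypothetical code, scalar-only (vsize == 1, so the
+// 'vectorized' overload merely stores a single value). Since each value equals
+// its flat index, subrange() can simply restart at the subrange's origin:
+//
+//   struct iota_generator
+//   {
+//     typedef float value_type ;
+//     enum { vsize = 1 } ;
+//     float v ;
+//     void operator() ( float & target ) { target = v ; v += 1.0f ; }
+//     void operator() ( float * target ) { *target = v ; v += 1.0f ; }
+//     iota_generator subrange ( const shape_range_type < 1 > & r ) const
+//     { return iota_generator { float ( r[0][0] ) } ; }
+//   } ;
+//
+//   vigra::MultiArray < 1 , float > a ( 1000 ) ;
+//   iota_generator gen { 0.0f } ;
+//   fill < iota_generator , 1 > ( gen , a ) ;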
+
+/// struct warp_generator combines a warp array and an interpolator into a functor
+/// producing values to store in the output. This functor handles the whole chain of
+/// operations from picking the coordinates from the warp array to delivering target
+/// values. It can be fed to fill() to produce a remap function, see below.
+
+template < int dimension ,
+           typename interpolator_type ,
+           bool strided_warp = true >
+struct warp_generator
+{
+  typedef typename interpolator_type::nd_rc_type nd_rc_type ;
+  typedef typename interpolator_type::value_type value_type ;
+  
+  typedef MultiArrayView < dimension , nd_rc_type > warp_array_type ;
+  
+  const warp_array_type warp ; // must not use reference here!
+  typename warp_array_type::const_iterator witer ;
+  const nd_rc_type * warp_data ;
+  
+  const interpolator_type & itp ;
+
+  warp_generator
+    ( const warp_array_type & _warp ,
+      const interpolator_type & _itp )
+  : warp ( _warp ) ,          // hold a copy of the view
+    witer ( warp.begin() ) ,  // and iterate over that copy
+    warp_data ( warp.data() ) ,
+    itp ( _itp )
+  { }
+
+  void operator() ( value_type & target )
   {
-    // process leftovers with single-value evaluation of transformed coordinate
-    // get the coordinate into the target array:
-    cs_target = target_it.template get<0>() + offset ;
-    // transform it into a (source) coordinate (to feed the interpolator)
-    tf ( cs_target , cs_source ) ;
-    // interpolate at this coordinate, yielding into the target array as directed by target_it
-    p_interpolator->eval ( cs_source , target_it.template get<1>() ) ;
-    ++target_it ;
+    if ( strided_warp )
+    {
+      itp.eval ( *witer , target ) ;
+      ++witer ; // advance the warp iterator to the next coordinate
+    }
+    else
+    {
+      itp.eval ( *warp_data , target ) ;
+      ++warp_data ;
+    }
+  }
+
+#ifdef USE_VC
+
+  typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
+  enum { vsize = vector_traits < ele_type > :: vsize } ;
+
+#else
+  
+  enum { vsize = 1 } ;
+
+#endif
+
+
+  void operator() ( value_type * target )
+  {
+    if ( strided_warp )
+    {
+      nd_rc_type buffer [ vsize ] ;
+      for ( int e = 0 ; e < vsize ; e++ , witer++ )
+        buffer[e] = *witer ;
+      itp.v_eval ( buffer , target ) ;
+    }
+    else
+    {
+      itp.v_eval ( warp_data , target ) ;
+      warp_data += vsize ;
+    }
+  }
+
+  warp_generator < dimension , interpolator_type , strided_warp >
+    subrange ( const shape_range_type < dimension > & range ) const
+  {
+    return warp_generator < dimension , interpolator_type , strided_warp >
+             ( warp.subarray ( range[0] , range[1] ) , itp ) ;
+  }
+  
+  warp_generator < dimension - 1 , interpolator_type , strided_warp >
+    bindOuter ( const int & c ) const
+  {
+    return warp_generator < dimension - 1 , interpolator_type , strided_warp >
+             ( warp.bindOuter ( c ) , itp ) ;
+  }  
+} ;
+
+/// implementation of remap() by delegation to the more general fill() routine, passing
+/// in the warp array and the interpolator via a generator object. warp_generator
+/// has a fast variant which requires the whole warp array to be unstrided - a
+/// requirement which can usually be met, since warp arrays tend to be made just
+/// for the purpose of a remap. The appropriate variant is picked here.
+
+template < typename interpolator_type  , // functor object yielding values for coordinates
+           int dim_target >                 // number of dimensions of output array
+void remap ( interpolator_type & ev ,
+             const MultiArrayView < dim_target ,
+                                    typename interpolator_type::nd_rc_type > & warp ,
+             MultiArrayView < dim_target ,
+                              typename interpolator_type::value_type > & output ,
+             bool use_vc = true ,
+             int nthreads = ncores
+           )
+{
+  if ( warp.isUnstrided() )
+  {
+    // the warp array has consecutive memory: use the fast variant, which
+    // reads coordinates via a raw pointer (strided_warp == false)
+    typedef warp_generator < dim_target , interpolator_type , false > gen_t ;
+    gen_t gen ( warp , ev ) ;
+    fill < gen_t , dim_target > ( gen , output , use_vc , nthreads ) ;
+  }
+  else
+  {
+    // strided warp array: use the iterator-based variant (strided_warp == true)
+    typedef warp_generator < dim_target , interpolator_type , true > gen_t ;
+    gen_t gen ( warp , ev ) ;
+    fill < gen_t , dim_target > ( gen , output , use_vc , nthreads ) ;
+  }
 }
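+
+// a usage sketch for this remap variant (illustrative names; a 2D spline over
+// float data, evaluated at the coordinates held in 'warp'):
+//
+//   bspline < float , 2 > bsp ( image.shape() , 3 , bcv_type < 2 > ( MIRROR ) ) ;
+//   bsp.prefilter ( true , ncores * 8 , image ) ;
+//   typedef evaluator < 2 , float , float , int > ev_type ;
+//   ev_type ev ( bsp ) ;
+//   vigra::MultiArray < 2 , vigra::TinyVector < float , 2 > > warp ( target.shape() ) ;
+//   // ... fill warp with source coordinates ...
+//   remap < ev_type , 2 > ( ev , warp , target ) ;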
 
-/// multithreaded tf_remap routine, splitting the target array to chunks.
-/// This is just as for the warp-array-based remap. The target array is split into
-/// chunks, which are in turn processed in a thread each by the single-threaded routine.
+/// This is a variant of remap, which directly takes an array of values and remaps it,
+/// internally creating a b-spline of given order just for the purpose. This is used for
+/// one-shot remaps where the spline isn't reused.
+
+template < typename coordinate_type , // type of coordinates in the warp array
+           typename value_type ,      // type of values to produce
+           int dim_out >              // number of dimensions of output array
+int remap ( const MultiArrayView < coordinate_traits < coordinate_type > :: dimension ,
+                                   value_type > & input ,
+            const MultiArrayView < dim_out , coordinate_type > & warp ,
+            MultiArrayView < dim_out , value_type > & output ,
+            bcv_type < coordinate_traits < coordinate_type > :: dimension > bcv
+              = bcv_type < coordinate_traits < coordinate_type > :: dimension >
+                ( MIRROR ) ,
+            int degree = 3 ,
+            bool use_vc = true ,
+            int nthreads = ncores
+          )
+{
+  const int dim_in = coordinate_traits < coordinate_type > :: dimension ;
+  typedef typename coordinate_traits < coordinate_type > :: rc_type rc_type ;
+  typedef typename coordinate_traits < coordinate_type > :: ic_type ic_type ;
+
+  // check shape compatibility
+  
+  if ( output.shape() != warp.shape() )
+  {
+    throw shape_mismatch ( "remap: the shapes of the warp array and the output array do not match" ) ;
+  }
+
+  // create the bspline object
+  // TODO may want to specify tolerance here instead of using default
+  
+  bspline < value_type , dim_in > bsp ( input.shape() , degree , bcv ) ;
+  
+  // prefilter, taking data in 'input' as knot point data
+  
+  bsp.prefilter ( use_vc , nthreads * 8 , input ) ;
+
+  // create an evaluator over the bspline
+
+  typedef evaluator < dim_in , value_type , rc_type , ic_type > evaluator_type ;
+  evaluator_type ev ( bsp ) ;
+  
+  // and call the other remap variant,
+  // passing in the evaluator, the coordinate array and the target array
+  
+  remap < evaluator_type , dim_out > ( ev , warp , output , use_vc , nthreads ) ;
+    
+  return 0 ;
+}
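+
+// a usage sketch for the one-shot variant (illustrative names): input data,
+// warp array and target suffice, the b-spline is set up internally:
+//
+//   vigra::MultiArray < 2 , float > input ( in_shape ) ;
+//   vigra::MultiArray < 2 , vigra::TinyVector < float , 2 > > warp ( out_shape ) ;
+//   vigra::MultiArray < 2 , float > target ( out_shape ) ;
+//   // ... fill input and warp ...
+//   remap < vigra::TinyVector < float , 2 > , float , 2 > ( input , warp , target ) ;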
+
+/// struct transform_generator provides target values by performing a sequence of steps:
+///
+/// - start with a discrete coordinate into the target array
+///
+/// - transform it into a real-valued coordinate suitable for use with an interpolator
+///   by means of a vspline::transformation, which contains the desired geometric transformation
+///
+/// - feed the real-valued coordinate to the interpolator, yielding the result
+///
+/// transform_generator satisfies the interface needed by the fill type of routines,
+/// so it can be fed to fill(), just like a warp_generator. Doing so results in a
+/// 'transform-based remap', see tf_remap below.
+
+template < int dim_target , typename interpolator_type >
+struct transform_generator
+{
+  typedef typename interpolator_type::value_type value_type ;
+
+  enum { dim_source = interpolator_type::dimension } ;
+  
+#ifdef USE_VC
+
+  typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
+  enum { vsize = vector_traits < ele_type > :: vsize } ;
+
+#else
+  
+  enum { vsize = 1 } ;
+
+#endif
+  
+  typedef typename interpolator_type::ic_type ic_type ;
+  typedef typename interpolator_type::rc_type rc_type ;
+  typedef typename interpolator_type::nd_ic_type nd_ic_type ;
+  typedef typename interpolator_type::nd_rc_type nd_rc_type ;
+  
+  typedef transformation < rc_type , dim_target , dim_source , vsize > transform_type ;
+
+  const interpolator_type & itp ;
+  const transform_type & transform ;
+  const shape_range_type < dim_target > range ; // hold by value: subrange() and
+                                                // bindOuter() pass in temporaries
+  
+  vigra::MultiCoordinateIterator < dim_target > mci ; // iterates over discrete target coordinates
+  
+  transform_generator
+    ( const transform_type & _transform ,
+      const interpolator_type & _itp ,
+      const shape_range_type < dim_target > & _range
+    )
+  : transform ( _transform ) ,
+    itp ( _itp ) ,
+    range ( _range ) ,
+    mci ( _range[1] - _range[0] )
+#ifdef USE_VC
+    , civ ( _range[0] , _range[1] ) 
+#endif
+  {
+  }
+
+  void operator() ( value_type & target )
+  {
+    // mci yields coordinates relative to the range's origin; offsetting by
+    // range[0] produces the absolute discrete target coordinate
+    typename transform_type::target_nd_rc_type trg_coordinate = * ( mci + range[0] ) ;
+    typename transform_type::source_nd_rc_type src_coordinate ;
+    transform ( trg_coordinate , src_coordinate ) ;
+    itp.eval ( src_coordinate , target ) ;
+    ++mci ;
+  }
+
+#ifdef USE_VC
+  
+  typedef typename interpolator_type::rc_v rc_v ; 
+  coordinate_iterator_v < rc_type , ic_type , dim_source , vsize > civ ;
+
+  void operator() ( value_type * target )
+  {
+    typedef typename transform_type::target_nd_rc_v trg_coordinate_type ;
+    trg_coordinate_type trg_coordinate = trg_coordinate_type ( *civ ) ;
+    typename transform_type::source_nd_rc_v src_coordinate ;
+    transform ( trg_coordinate , src_coordinate ) ;
+    itp.v_eval ( src_coordinate , target ) ;
+    ++civ ;
+  }
+
+#endif
+
+  transform_generator < dim_target , interpolator_type >
+    subrange ( const shape_range_type < dim_target > & range ) const
+  {
+    return transform_generator < dim_target , interpolator_type >
+             ( transform , itp , range ) ;
+  }
+  
+  transform_generator < dim_target , interpolator_type >
+    bindOuter ( const int & c ) const
+  {
+    // restrict the range to slice c along the highest axis. c is relative to
+    // the current range, so it is added to the range's origin, and the slice
+    // has an extent of 1 (not 0) along this axis, so that it isn't empty.
+    auto slice_start = range[0] , slice_end = range[1] ;
+
+    slice_start [ dim_target - 1 ] += c ;
+    slice_end [ dim_target - 1 ] = slice_start [ dim_target - 1 ] + 1 ;
+    
+    return transform_generator < dim_target , interpolator_type >
+             ( transform ,
+               itp ,
+               shape_range_type < dim_target > ( slice_start , slice_end ) ) ;
+  }  
+} ;
+
+/// implementation of tf_remap() by delegation to the more general fill() routine, passing
+/// in the coordinate transform and the interpolator via a transform_generator object which
+/// handles the full chain of operations from picking the right discrete target coordinates
+/// to producing result values.
+/// Having the generator object makes the implementation of the remap routine trivial:
+/// just create the generator and pass it to fill().
+///
+/// This function provides a generalized geometric transformation.
+/// In image transformations, the source location at which to interpolate the
+/// source image is often defined by a transformation of the target coordinate.
+/// There are (at least) two approaches to handle this. The first is what the
+/// warp-array-based remap above implements: we calculate a 'warp' array full
+/// of transformed coordinates and process it en bloc.
+/// Alternatively, with a coordinate transformation at hand, we can write a remap
+/// variant which performs the coordinate transformation for each target coordinate
+/// as it goes along, saving the construction of the warp array.
+///
+/// The transformation is passed in by using a 'transformation' object, which is a
+/// wrapper around the actual transformation routine(s). Note that this routine is
+/// also suitable for any type of interpolator satisfying the interface defined in
+/// class interpolator in interpolator.h.
 
 template < class interpolator_type , // type satisfying the interface in class interpolator
-           int dim_target ,          // dimension of incoming coordinates (== dim. of target array)
-           int dim_source >          // dimension of outgoing coordinates (== dim. of coefficient array)
+           int dim_target >          // dimension of target array
 void tf_remap ( const interpolator_type & ev ,
                 transformation < typename interpolator_type::rc_type ,
                                  dim_target ,
-                                 dim_source ,
+                                 interpolator_type::dimension ,
                                  interpolator_type::vsize > tf ,
                 MultiArrayView < dim_target ,
                                  typename interpolator_type::value_type > & output ,
@@ -844,10 +997,13 @@ void tf_remap ( const interpolator_type & ev ,
                 int nthreads = ncores            
               )
 {
-  shape_range_type < dim_target > range ( shape_type < dim_target > () , output.shape() ) ;
-  
-  multithread ( st_tf_remap < interpolator_type , dim_target , dim_source > ,
-                nthreads * 8 , range , &ev , tf , &output , use_vc ) ;
+  typedef typename interpolator_type::value_type value_type ;
+  typedef TinyVector < int , dim_target > nd_ic_type ;
+  typedef transform_generator < dim_target , interpolator_type > gen_t ;
+
+  shape_range_type < dim_target > range ( nd_ic_type() , output.shape() ) ;  
+  gen_t gen ( tf , ev , range ) ;  
+  fill < gen_t , dim_target > ( gen , output , use_vc , nthreads ) ;
 }
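+
+// a usage sketch (illustrative; construction of the 'transformation' object
+// is elided - see class transformation for the precise interface):
+//
+//   typedef evaluator < 2 , float , float , int > ev_type ;
+//   ev_type ev ( bsp ) ;
+//   transformation < float , 2 , 2 , ev_type::vsize > tf ( /* ... */ ) ;
+//   tf_remap < ev_type , 2 > ( ev , tf , target ) ;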
 
 /// this highest-level transform-based remap takes an input array and creates
@@ -876,11 +1032,11 @@ void tf_remap ( MultiArrayView < dim_source ,
   // create the bspline object
   bspline < input_value_type , dim_source > bsp ( input.shape() , degree , bcv ) ;
   // copy in the data
-  bsp.prefilter ( use_vc , nthreads , input ) ;
+  bsp.prefilter ( use_vc , nthreads * 8 , input ) ;
   // create an evaluator for this spline
   typedef evaluator < dim_source , input_value_type , rc_type , int > evaluator_type ;
   evaluator_type ev ( bsp ) ;
-  tf_remap < evaluator_type , dim_target , dim_source >
+  tf_remap < evaluator_type , dim_target >
            ( ev , tf , output , use_vc , nthreads ) ;
 }
 
@@ -891,6 +1047,8 @@ void tf_remap ( MultiArrayView < dim_source ,
 /// This is the single-threaded routine; further below we will use divide_and_conquer
 /// to apply this routine to several chunks of data in parallel.
 
+// TODO rewrite using fill()
+
 template < class value_type ,      // like, float, double, complex<>, pixels, TinyVectors etc.
            class rc_type ,         // float or double, for coordinates
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
@@ -946,7 +1104,8 @@ void st_make_warp_array ( shape_range_type < dim_out > range ,
     nd_rc_out_v target_buffer ;                                 // vectorized warp coordinates
 
     // use utility class coordinate_iterator_v to produce vectorized target coordinates:
-    coordinate_iterator_v < rc_v , dim_out > civ ( offset , offset + warp_array.shape() ) ;
+    coordinate_iterator_v < rc_type , int , dim_out , vsize >
+      civ ( offset , offset + warp_array.shape() ) ;
 
     for ( int a = 0 ; a < aggregates ; a++ )
     {
@@ -1256,12 +1415,6 @@ void st_grid_eval ( shape_range_type < target_type::actual_dimension > range ,
 /// data, but when scaling down, aliasing will occur and the data should be
 /// low-pass-filtered adequately before processing.
 
-// TODO: I intend to implement a process where evaluation to a smaller target
-// is performed on unfiltered data with a b-spline evaluator of suitable degree,
-// resulting in a suitably filtered scaled-down version. This process is then 
-// repeated several times, resulting in a pyramid which can be used to obtain
-// antialiased interpolates at all scales.
-
 template < typename interpolator_type , // type offering eval()
            typename target_type ,
            typename weight_type ,       // singular weight data type
diff --git a/thread_pool.h b/thread_pool.h
index 4b4318e..d65abef 100644
--- a/thread_pool.h
+++ b/thread_pool.h
@@ -33,6 +33,15 @@
 ///
 /// \brief provides a thread pool for vspline's multithread() routine
 ///
+/// class thread_pool aims to provide a simple and straightforward implementation
+/// of a thread pool for multithread() in multithread.h, but the class might find
+/// use elsewhere. The operation is simple:
+///
+/// a set of worker threads is launched which wait for 'tasks' - objects of type
+/// std::function<void()> - taken from a queue. When woken, a worker thread tries to obtain
+/// a task. If it succeeds, the task is executed, and the worker thread tries to get
+/// another task. If none is to be had, it goes to sleep, waiting to be woken once
+/// there are new tasks.
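+///
+/// a minimal sketch of how client code (like multithread()) might enqueue a
+/// task, using the public members declared below ('p' is a thread_pool):
+///
+///   {
+///     std::lock_guard < std::mutex > lock ( p.task_mutex ) ;
+///     p.task_queue.push ( [] { /* ... the work ... */ } ) ;
+///   }
+///   p.task_cv.notify_one() ;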
 
 #include <thread>
 #include <mutex>
@@ -53,10 +62,6 @@ class thread_pool
 
   std::vector < std::thread * > pool ;
   
-  /// the threads are started, stopped and destroyed by this object
-
-  int nthreads ;
-
 public:
 
   // mutex and condition variable
@@ -64,6 +69,9 @@ public:
 
   std::mutex task_mutex ;
   std::condition_variable task_cv ;
+  
+  // queue to hold tasks
+
   std::queue < std::function < void() > > task_queue ;
 
 private:
@@ -72,30 +80,27 @@ private:
   /// We use a thread pool of worker threads. These threads have a very 
   /// simple cycle: They try and obtain a task (std::function<void()>). 
   /// If there is one to be had, it is invoked, otherwise they wait on
-  /// task_cv.
+  /// task_cv. When woken up, the flag stay_alive is checked, and if it
+  /// is found to be false, the worker thread ends.
   
   void worker_thread()
   {
-    std::function < void() > task ;
-
     while ( true )
     {
-      // under task_mutex, try and obtain a task
+      // under task_mutex, check stay_alive and try to obtain a task
       std::unique_lock<std::mutex> task_lock ( task_mutex ) ;
 
       if ( ! stay_alive )
-        break ;
-
-      int sz = task_queue.size() ;
-      if ( sz )
       {
-        // there are tasks in the queue, take one
-        task = task_queue.front() ;
-        task_queue.pop() ;
+        task_lock.unlock() ;
+        break ; // die
       }
 
-      if ( sz )
+      if ( ! task_queue.empty() )
       {
+        // there are tasks in the queue, take one
+        auto task = task_queue.front() ;
+        task_queue.pop() ;
         task_lock.unlock() ;
         // got a task, perform it, then try for another one
         task() ;
@@ -112,12 +117,14 @@ private:
 
 public:
   
-  thread_pool ( int _nthreads = 2 * std::thread::hardware_concurrency() )
-  : nthreads ( _nthreads )
+  thread_pool ( int nthreads = 2 * std::thread::hardware_concurrency() )
   {
-    std::function < void() > wf = std::bind ( &thread_pool::worker_thread , this ) ; 
+    // to launch a thread with a method, we need to bind it to the object:
+    std::function < void() > wf = std::bind ( &thread_pool::worker_thread , this ) ;
+    
+    // now we can fill the pool with worker threads
     for ( int t = 0 ; t < nthreads ; t++ )
-    pool.push_back ( new std::thread ( wf ) ) ;
+      pool.push_back ( new std::thread ( wf ) ) ;
   }
     
   ~thread_pool()
@@ -147,6 +154,5 @@ public:
   }
 } ;
 
-
 } ; // end of namespace vspline
 
