[vspline] 37/72: coded unary_functor.h as replacement for interpolator.h The type system and template args in interpolator.h still showed its origin in eval.h, but what was encoded in interpolator.h is a broader concept of a unary functor, taking an n-dimensional input and producing an m-dimensional output, plus the vectorization and load/store methods. So I rewrote the code reflecting the workings of a unary functor and made the corresponding changes to other code. In some places I still have the 'old' type names like nd_rc_type etc., hopefully mainly where they make sense (like in eval.h). It'll probably take a while until everything gels as intended, but here is a working commit.

Kay F. Jahnke kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:40 UTC 2017


This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit e55930be67b65164a8278f9f14b28fc87ea92f5e
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Sat Mar 18 11:19:15 2017 +0100

    coded unary_functor.h as replacement for interpolator.h
    The type system and template args in interpolator.h still showed its origin
    in eval.h, but what was encoded in interpolator.h is a broader concept of a
    unary functor, taking an n-dimensional input and producing an m-dimensional
    output, plus the vectorization and load/store methods. So I rewrote the code
    reflecting the workings of a unary functor and made the corresponding changes
    to other code. In some places I still have the 'old' type names like
    nd_rc_type etc., hopefully mainly where they make sense (like in eval.h).
    It'll probably take a while until everything gels as intended, but here is a
    working commit.
---
 bspline.h                   |  12 +-
 common.h                    |  53 ++-
 coordinate.h                |  80 +----
 doxy.h                      |   2 +-
 eval.h                      | 799 +++++++++++---------------------------------
 example/impulse_response.cc |  60 ++--
 example/pano_extract.cc     | 247 +++++++++-----
 example/roundtrip.cc        | 223 +++----------
 filter.h                    |  36 +-
 interpolator.h              | 635 +++++++++++++++++++----------------
 mapping.h                   | 124 +++----
 multithread.h               |  12 +-
 prefilter.h                 |   6 +-
 remap.h                     | 273 +++++++++------
 unary_functor.h             | 309 +++++++++++++++++
 15 files changed, 1377 insertions(+), 1494 deletions(-)

diff --git a/bspline.h b/bspline.h
index 4e893ac..d0c1f91 100644
--- a/bspline.h
+++ b/bspline.h
@@ -411,8 +411,8 @@ public:
   /// These data will then be used in place of any data present in the
   /// bspline object.
 
-  void prefilter ( bool use_vc = true ,     ///< use Vc by default
-                   int nthreads = ncores * 8 , ///< number of tasks to use
+  void prefilter ( bool use_vc = true ,         ///< use Vc by default
+                   int njobs = default_njobs ,  ///< intended number of jobs to use
                    view_type data = view_type() ) ///< view to knot point data to use instead of 'core'
   {
     // if the user should have modified 'bcv' since the bspline object's creation,
@@ -472,7 +472,7 @@ public:
                 tolerance ,
                 smoothing ,
                 use_vc ,
-                nthreads
+                njobs
               ) ;
         break ;
       case BRACED:
@@ -486,7 +486,7 @@ public:
                 tolerance ,
                 smoothing ,
                 use_vc ,
-                nthreads
+                njobs
               ) ;
         // using the more general code here now, since the frame may be larger
         // than strictly necessary for the given spline degree due to a request
@@ -516,7 +516,7 @@ public:
                 tolerance ,
                 smoothing ,
                 use_vc ,
-                nthreads
+                njobs
               ) ;
         break ;
       case MANUAL:
@@ -534,7 +534,7 @@ public:
                 tolerance ,
                 smoothing ,
                 use_vc ,
-                nthreads
+                njobs
               ) ;
         break ;
     }
diff --git a/common.h b/common.h
index 6cfe19d..3227503 100644
--- a/common.h
+++ b/common.h
@@ -79,38 +79,31 @@ const int ncores = std::thread::hardware_concurrency() ;
 /// Nothe how we use a default vector width twice the size of a standard Vc vector for
 /// the given T. In my tests, this made the code faster. TODO test more, and on other systems
 
-template < class T , int _vsize = 2 * Vc::Vector < T > :: Size >
+/// using definition for the 'elementary type' of a type via vigra's
+/// ExpandElementResult mechanism. The elementary type is used to determine
+/// the default vector size, which is used as default for the vector size
+/// template argument _vsize. This default should only be used once for a
+/// given domain of code sharing simdized types, and the resulting vector
+/// size used explicitly for other T - if, for example, most vectorized maths
+/// is done in float, one would use #define VSIZE vector_traits<float>::size,
+/// and for other T, code vector_traits<T,VSIZE>
+
+template < class T >
+using ET = typename vigra::ExpandElementResult < T > :: type ;
+
+template < class T ,
+           int _vsize = 2 * Vc::Vector < ET<T> > :: Size >
 struct vector_traits
 {
-  enum { vsize = _vsize } ;
+  enum { size = _vsize } ;
 
-  // if vsize given is precisely the Size of a Vc::Vector<T>, make simdized_type
-  // such a Vc::Vector, otherwise make it a Vc::SimdArray<T,vsize>
-  
-//   typedef typename std::conditional < vsize == Vc::Vector < T > :: Size ,
-//                                       Vc::Vector < T > ,
-//                                       Vc::SimdArray < T , vsize > > :: type type ;
-
-// to always use Simdarrays, use this definition instead:
+  enum { dimension = vigra::ExpandElementResult < T > :: size } ;
 
-   typedef Vc::SimdArray < T , vsize > type ;
-} ;
+  typedef ET<T> ele_type ;
 
-#else
-
-template < class T , int _vsize = 0 >
-struct vector_traits
-{
-  enum { vsize = 1 } ;
-
-  // if vsize given is precisely the Size of a Vc::Vector<T>, make simdized_type
-  // such a Vc::Vector, otherwise make it a Vc::SimdArray<T,vsize>
+  typedef Vc::SimdArray < ele_type , size > ele_v ;
   
-  typedef T type ;
-
-// to always use Simdarrays, use this definition instead:
-
-//   typedef Vc::SimdArray < T , vsize > simdized_type ;
+  typedef vigra::TinyVector < ele_v , dimension > type ;
 } ;
 
 #endif
@@ -196,7 +189,7 @@ typedef enum { PERIODIC,   ///< periodic boundary conditions / periodic mapping
                ZEROPAD ,   ///< used for boundary condition, bracing
                IGNORE ,    ///< used for boundary condition, bracing
                IDENTITY ,  ///< used as solver argument, mostly internal use
-               GUESS ,
+               GUESS ,     ///< used with EXPLICIT scheme to keep margin errors low
                SPHERICAL , ///< intended use for spherical panoramas, needs work
                REJECT ,    ///< mapping mode, throws out_of_bounds for out-of-bounds coordinates
                REJECTP ,   ///< mapping mode, like REJECT, but ceiling 1 higher
@@ -205,8 +198,7 @@ typedef enum { PERIODIC,   ///< periodic boundary conditions / periodic mapping
                TAG ,       ///< mapping mode, out-of-bounds coordinates produce delta -2.0
                TAGP ,      ///< mapping mode, like TAG, but ceiling 1 higher
                RTAG ,      ///< mapping mode, like TAG, but for widened brace
-               RAW ,       ///< mapping mode, processes coordinates unchecked, may crash the program
-               RRAW
+               RAW         ///< mapping mode, processes coordinates unchecked, may crash the program
 } bc_code;
 
 /// This enumeration is used by the convenience class 'bspline' to determine the prefiltering
@@ -240,8 +232,7 @@ const std::string bc_name[] =
   "TAG" ,
   "TAGP" ,
   "RTAG" ,
-  "RAW" ,
-  "RRAW"
+  "RAW"
 } ;
 
 } ; // end of namespace vspline
diff --git a/coordinate.h b/coordinate.h
index 6093636..dcb24aa 100644
--- a/coordinate.h
+++ b/coordinate.h
@@ -34,15 +34,15 @@
     \brief a traits class for coordinates
 
     Throughout the evaluation part of vspline, we deal with coordinates.
-    In the context of b-spline evaluation we have to handle coordinate
-    types beyond simple TinyVectors of some type: we also deal with
-    'split' types, coordinates split into integral part and remainder.
-    Additionally, there is 'compact_split_type' consisting of an offset
-    and an nD fractional part.
     To handle these types their components consistently, we define a
     traits class 'coordinate_traits'.
+    
 */
 
+// TODO: maybe this is overspecification... I was using presplit coordinates
+// earlier, so it made sense to have this traits class, but I threw out that
+// part of the code, so now it seems a bit verbose to go this way.
+
 #ifndef VSPLINE_COORDINATE_H
 #define VSPLINE_COORDINATE_H
 
@@ -60,50 +60,9 @@ template < int dimension ,
            typename rc_type >
 using nd_rc_type = vigra::TinyVector < rc_type , dimension > ;
 
-/// We start out with the definition of split coordinates:
-/// class split_type contains n-dimensional 'split coordinates', consisting of the
-/// integral and fracional part of the 'original' real coordinates, separated so that
-/// they can be processed by the evaluation routine.
-
-template < int _dimension ,
-           typename _ic_type ,
-           typename _rc_type >
-struct split_type
-{
-  typedef _ic_type ic_type ;
-  typedef _rc_type rc_type ;
-  enum { dimension = _dimension } ;
-
-  typedef vigra::TinyVector < ic_type , dimension > nd_ic_type ;
-  typedef vigra::TinyVector < rc_type , dimension > nd_rc_type ;
-
-  nd_ic_type select ; ///< named select because it selects a specific location
-  nd_rc_type tune ;   ///< named tune because it provides additional 'fine-tuning'
-} ;
-
-/// compact_split_type is similar to split_type but instead of the nD integral
-/// coordinate it has a single offset in 'select' which contains the same information
-/// in 'compacted' form: it's the sum of all individual components of select, multiplied
-/// with the strides of the container select indexes. 
-
-template < int _dimension ,
-           typename _ic_type ,
-           typename _rc_type >
-struct compact_split_type
-{
-  typedef _ic_type ic_type ;
-  typedef _rc_type rc_type ;
-  enum { dimension = _dimension } ;
-
-  typedef vigra::TinyVector < rc_type , dimension > nd_rc_type ;
-
-  ic_type select ;
-  nd_rc_type tune ;
-} ;
-
 /// now for the traits class. we want it to provide the following types:
 /// ic_type, rc_type, nd_ic_type and nd_rc_type. We also want an enum 'dimension'
-/// giving the dimesnion of the coordinate.
+/// giving the dimension of the coordinate.
 
 template < class coordinate_type >
 struct coordinate_traits
@@ -131,33 +90,6 @@ struct coordinate_traits < vigra::TinyVector < _rc_type , _dimension > >
   typedef vigra::TinyVector < ic_type , dimension > nd_ic_type ;
 } ;
 
-/// since remap can also operate with pre-split coordinates, we need another partial
-/// specialization of coordinate_traits for split_type, which also defines a value_type
-
-template < typename _rc_type , typename _ic_type , int _dimension >
-struct coordinate_traits < split_type < _dimension , _ic_type , _rc_type > >
-{
-  enum { dimension = _dimension } ;
-  typedef _rc_type rc_type ;
-  typedef _ic_type ic_type ;
-
-  typedef vigra::TinyVector < rc_type , dimension > nd_rc_type ;
-  typedef vigra::TinyVector < ic_type , dimension > nd_ic_type ;
-} ;
-
-/// the same for compact_split_type
-
-template < typename _rc_type , typename _ic_type , int _dimension >
-struct coordinate_traits < compact_split_type < _dimension , _ic_type , _rc_type > >
-{
-  enum { dimension = _dimension } ;
-  typedef _rc_type rc_type ;
-  typedef _ic_type ic_type ;
-
-  typedef vigra::TinyVector < rc_type , dimension > nd_rc_type ;
-  typedef vigra::TinyVector < ic_type , dimension > nd_ic_type ;
-} ;
-
 } ; // end of namespace vspline
 
 #endif // VSPLINE_COORDINATE_H
diff --git a/doxy.h b/doxy.h
index 56c77dd..4bee643 100644
--- a/doxy.h
+++ b/doxy.h
@@ -241,7 +241,7 @@ Using double precision arithmetics, vectorization doesn't help so much, and pref
  
  You can probably do everything vspline does with other software - there are several freely available implementations of b-spline interpolation and remap routines. What I wanted to create was an implementation which was as general as possible and at the same time as fast as possible, and, on top of that, comprehensive.
 
- These demands are not easy to satisfy at the same time, but I feel that my design comes  close. While generality is achieved by generic programming, speed needs exploitation of hardware features, and merely relying on the compiler is not enough. The largest speedup I saw was simply multithreading the code. This may seem like a trivial observation, but my design is influenced by it: in order to efficiently multithread, the problem has to be partitioned so that it can be processed by inde [...]
+ These demands are not easy to satisfy at the same time, but I feel that my design comes  close. While generality is achieved by generic programming, speed needs exploitation of hardware features, and merely relying on the compiler is not enough. The largest speedup I saw was simply multithreading the code. This may seem like a trivial observation, but my design is influenced by it: in order to efficiently multithread, the problem has to be partitioned so that it can be processed by inde [...]
  
  Another speedup method is data-parallel processing. This is often thought to be the domain of GPUs, but modern CPUs also offer it in the form of vector units. I chose implementing data-parallel processing in the CPU, as it offers tight integration with unvectorized CPU code. It's almost familiar terrain, and the way from writing conventional CPU code to vector unit code is not too far, when using tools like Vc, which abstract the hardware away. Using horizontal vectorization does requir [...]
 
diff --git a/eval.h b/eval.h
index d5fdc63..fe44ce9 100644
--- a/eval.h
+++ b/eval.h
@@ -3,7 +3,7 @@
 /*    vspline - a set of generic tools for creation and evaluation      */
 /*              of uniform b-splines                                    */
 /*                                                                      */
-/*            Copyright 2015, 2016 by Kay F. Jahnke                     */
+/*            Copyright 2015 - 2017 by Kay F. Jahnke                    */
 /*                                                                      */
 /*    Permission is hereby granted, free of charge, to any person       */
 /*    obtaining a copy of this software and associated documentation    */
@@ -53,13 +53,34 @@
     Evaluation of a b-spline is, compared to prefiltering, more computationally intensive
     and less memory-bound, so the profit from vectorization, especially for float data,
     is more pronounced here than for the prefiltering. On my system, I found single-precision
-    operation was speeded up about four times (AVX2).
+    operation was about three to four times as fast as unvectorized code (AVX2).
     
     The central class of this file is class evaluator. evaluator objects are set up to
     provide evaluation of a specific b-spline. Once they are set up they don't change and
-    effectively become functors with several overloaded evaluation methods for different
+    effectively become pure functors with several overloaded evaluation methods for different
     constellations of parameters. The evaluation methods typically take their arguments per
-    reference.
+    reference. The details of the evaluation variants, together with explanations of
+    specializations used for extra speed, can be found with the individual evaluation
+    routines.
+    
+    What do I mean by the term 'pure' functor? It's a concept from functional programming.
+    It means that calling the functor will not have any effect on the functor itself - it
+    can't change once it has been constructed. This has several nice effects: it can
+    potentially be optimized very well, it is thread-safe, and it will play well with
+    functioanl programming concepts - and it's conceptionally appealing.
+    
+    Code using class evaluator will probably use it at some core place where it is
+    part of some processing chain. An example would be an image processing program:
+    one might have some outer loop generating arguments (typically SIMDIZED types)
+    which are processed one after the other to yield a result. The processing will
+    typically have several stages, like coordinate generation and transformations,
+    then use class evaluator to pick an interpolated intermediate result, which is
+    further processed by, say, colour or data type manipulations before finally
+    being stored in some target container. The whole processing chain can be
+    coded to become a single functor, with one of class evaluator's eval-type
+    routines embedded somewhere in the middle, and all that's left is code to
+    efficiently handle the source and destination to provide arguments to the
+    processing chain - like the code in remap.h.
 */
 
 #ifndef VSPLINE_EVAL_H
@@ -67,7 +88,7 @@
 
 #include "mapping.h"
 #include "bspline.h"
-// #include "interpolator.h" // currently unused
+#include "unary_functor.h"
 
 namespace vspline {
 
@@ -166,8 +187,8 @@ struct weight_functor_base
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < ele_type > :: type ele_v ;
-  typedef typename vector_traits < rc_type , ele_v::Size > :: type rc_v ;
+  typedef typename vector_traits < ele_type > :: ele_v ele_v ;
+  typedef typename vector_traits < rc_type , ele_v::Size > :: ele_v rc_v ;
 
   virtual void operator() ( ele_v* result , const rc_v& delta ) = 0 ;
 
@@ -292,7 +313,7 @@ struct bspline_derivative_weight_functor
 // Note here that one important property of the weights is that they constitute
 // a partition of unity. Both the (derivative) b-spline weights and this simple
 // weight functor share this property.
-// currently unused, but left in for now
+// currently unused, but left in for demonstration purposes
 
 /*
 
@@ -324,105 +345,6 @@ struct average_weight_functor
   }  
 } ;
 
-// here's another example, using a gaussian to approximate the b-spline basis function
-
-template < typename rc_type >
-struct gaussian_weight_functor
-: public weight_functor_base < rc_type >
-{
-  typedef weight_functor_base < rc_type > base_class ;
-
-  const int degree ;
-  const int order ;
-  
-  gaussian_weight_functor ( int _degree , int d = 0 )
-  : degree ( _degree ) ,
-    order ( _degree + 1 )
-  { } ;
-  
-  virtual void operator() ( rc_type* result , const rc_type& delta )
-  {
-    rc_type x = - degree / 2 - delta ; // only odd splines for now
-    for ( int e = 0 ; e < order ; e++ , x += rc_type(1.0) )
-      result[e] = gaussian_bspline_basis_approximation<rc_type> ( x , degree ) ;
-  }
-
-#ifdef USE_VC
-
-  using typename base_class::rc_type_v ;
-  virtual void operator() ( rc_type_v* result , const rc_type_v& delta )
-  {
-    rc_type_v x = rc_type_v ( - degree / 2 ) - delta ; // only odd splines for now
-    for ( int e = 0 ; e < order ; e++ , x += rc_type(1.0) )
-      result[e] = gaussian_bspline_basis_approximation<rc_type_v> ( x , degree ) ;
-  }
-
-#endif
-} ;
-
-*/
-
-/*
-// for speed tests we duplicate above weight generation functor definitions
-// to see if calculating weights for all dimensions at once is faster
-
-template < typename rc_type , typename rc_type_v = rc_type >
-struct alt_weight_functor_base
-{
-  // we define two pure virtual overloads for operator(), one for unvectorized
-  // and one for vectorized operation. In case the scope of evaluation is extended
-  // to other types of values, we'll have to add the corresponding signatures here.
-  
-  virtual void operator() ( rc_type* result , const rc_type& delta ) = 0 ;
-  
-#ifdef USE_VC
-
-  virtual void operator() ( rc_type_v* result , const rc_type_v& delta ) = 0 ;
-
-#endif
-} ;
-
-/// we derive from the weight functor base class to obtain a (multi-) functor
-/// specifically for (derivatives of) a b-spline :
-
-template < typename rc_type , typename rc_type_v = rc_type >
-struct alt_bspline_derivative_weight_functor
-: public alt_weight_functor_base < rc_type , rc_type_v >
-{
-  typedef alt_weight_functor_base < rc_type , rc_type_v > base_class ;
-
-  // set up the fully specialized functors to which operator() delegates:
-
-  bspline_derivative_weights < rc_type , rc_type >  weights ;
-
-#ifdef USE_VC
-
-  bspline_derivative_weights < rc_type_v , rc_type >  weights_v ; 
-
-#endif
-
-  alt_bspline_derivative_weight_functor ( int degree , int d = 0 )
-  : weights ( degree , d )
-#ifdef USE_VC
-  , weights_v ( degree , d )
-#endif
-  {
-  }
-  
-  // handle the weight calculation by delegation to the functors set up at construction
-  
-  virtual void operator() ( rc_type* result , const rc_type& delta )
-  {
-    weights ( result , delta ) ;
-  }
-
-#ifdef USE_VC
-  virtual void operator() ( rc_type_v* result , const rc_type_v& delta )
-  {
-    weights_v ( result , delta ) ;
-  }
-#endif
-} ;
 */
 
 /// class evaluator encodes evaluation of a B-spline. This is coded so that the spline's dimension,
@@ -444,14 +366,15 @@ struct alt_bspline_derivative_weight_functor
 /// The code in class evaluator begins with a sizeable constructor which sets up as much
 /// as possible to support the evaluation code to do it's job as fast as possible. Next follows
 /// the unvectorized code, finally the vectorized code.
-/// There is a great variety of overloads of class evaluator's eval() method to make it as versatile
-/// as possible. The strategy is to have all dependencies of the evaluation except for the actual
+/// There is a great variety of overloads of class evaluator's eval() method available, because
+/// class evaluator inherits from vspline::unary_functor.
+/// The evaluation strategy is to have all dependencies of the evaluation except for the actual
 /// coordinates taken care of by the constructor - and immutable for the evaluator's lifetime.
 /// The resulting object has no state which is modified after construction, making it thread-safe.
 /// It also constitutes a 'pure' function in a functional-programming sense, because it has
-/// no mutable state and no side-effects.
-/// The eval() variants take only coordinates (in their various guises) as arguments, relying
-/// on the state of the evaluator fixed at it's construction. The eval() overloads also
+/// no mutable state and no side-effects, as can be seen by the fact that the eval methods
+/// are all marked const.
+/// The eval() overloads also
 /// form a hierarchy, as evaluation progresses from accepting unsplit real coordinates to
 /// split coordinates and finally offsets and weights. This allows calling code to handle
 /// parts of the delegation hierarchy itself, only using class evaluator at a specific level.
@@ -469,7 +392,7 @@ struct alt_bspline_derivative_weight_functor
 /// neighbour interpolation, which has the same effect as simply running with degree 0 but avoids
 /// code which isn't needed for nearest neighbour interpolation (like the application of weights,
 /// which is futile under the circumstances, the weight always being 1.0).
-/// specialize can also be set to 1 for explicit bilinear interpolation. Any value but 0 or 1 will
+/// specialize can also be set to 1 for explicit n-linear interpolation. Any value but 0 or 1 will
 /// result in the general-purpose code being used.
 ///
 /// - template argument 'evenness' can be passed in as 1 to enforce raw-mode coordinate splitting
@@ -478,109 +401,87 @@ struct alt_bspline_derivative_weight_functor
 /// for odd splines, enforcing a raw mapping. Any other value for 'evenness' will result in the use of
 /// the mmap object passed in at the evaluator's creation, which provides cordinate handling services
 /// with the routines defined in mapping.h.
-
-template < int _dimension ,         ///< dimension of the spline
-           class _value_type ,      ///< type of spline coefficients/result (pixels etc.)
-           class _rc_type ,         ///< singular real coordinate, float or double
-           class _ic_type ,         ///< singular integral coordinate, currently only int
-//            bool use_tag = false ,   ///< flag switching tag checking on/off
-           int specialize = -1 ,
-           int evenness = -1 >
+///
+/// class evaluator inherits from class unary_functor, which implements the broader concept.
+/// class evaluator has more methods to help with special cases, but apart from that it is
+/// a standard vspline::unary_functor.
+///
+/// Note how the number of vector elements is fixed here by picking the number of ele_type
+/// which vspline::vector_traits considers appropriate. There should rarely be a need to
+/// choose a different number of vector elements: evaluation will often be the most
+/// computationally intensive part of a processing chain, and therefore this choice is
+/// sensible. But it's not mandatory.
+
+template < int _dimension ,         // dimension of the spline
+           class _value_type ,      // type of spline coefficients/result (pixels etc.)
+           class _rc_type ,         // singular real coordinate, float or double
+           class _ic_type ,         // singular integral coordinate, currently only int
+           int specialize = -1 ,    // specialize for degree 0 or 1
+           int evenness = -1 ,      // specialize for raw mapping, even or odd spline
+#ifdef USE_VC
+           int _vsize = simdized_ele_type < _value_type > :: Size
+#else
+           int _vsize = 1
+#endif
+         > // nr. of vector elements
 class evaluator
-// we could inherit from class interpolator, since class evaluator satisfies the
-// interpolator interface, but there's no advantage in doing so.
-// : public interpolator < _dimension , _value_type , _rc_type , _ic_type >
+: public unary_functor
+         < vigra::TinyVector < _rc_type , _dimension > ,
+           _value_type ,
+           _vsize
+         >
 {
 public:
+
+ // we want to access facilites of the base class (vspline::unary_functor<...>)
+ // so we use a typedef for the base class.
+
+ typedef unary_functor
+         < vigra::TinyVector < _rc_type , _dimension > ,
+           _value_type ,
+           _vsize
+         >
+         base_type ;
+
+  // pull in standard vspline::unary_functor type system with this macro:
+
+  using_unary_functor_types ( base_type )
   
-  // fix the types we're dealing with
   typedef _ic_type ic_type ;
   typedef _rc_type rc_type ;
-
-  enum { dimension = _dimension } ;
-  enum { level = dimension - 1 } ;
-
-  typedef vigra::TinyVector < rc_type , dimension > nd_rc_type ;
-  typedef vigra::TinyVector < ic_type , dimension > nd_ic_type ;
+  typedef _value_type value_type ;
   
-  typedef _value_type value_type ;  
   typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
-
+  enum { dimension = _dimension }  ;
+  enum { level = _dimension - 1 }  ;
   enum { channels = vigra::ExpandElementResult < value_type > :: size } ;
 
-  /// array_type is used for the coefficient array TODO change to 'view_type'
+  typedef vigra::TinyVector < rc_type , dimension > nd_rc_type ;
+  typedef vigra::TinyVector < ic_type , dimension > nd_ic_type ;
 
-  typedef MultiArrayView < dimension , value_type >                array_type ;
+  /// view_type is used for a view to the coefficient array
+
+  typedef MultiArrayView < dimension , value_type >                view_type ;
 
   /// type used for nD array coordinates, array shapes
 
-  typedef typename array_type::difference_type                     shape_type ;
+  typedef typename view_type::difference_type                      shape_type ;
 
   typedef vigra::TinyVector < int , dimension >                    derivative_spec_type ;
 
-  /// type for a 'split' coordinate, mainly used internally in the evaluator, but
-  /// also in 'warp arrays' used for remapping
-
-  typedef split_type < dimension , ic_type , rc_type > 
-    split_coordinate_type ;
-
-  typedef compact_split_type < dimension , ic_type , rc_type >
-    compact_split_coordinate_type ;
-
-  /// the evaluator can handle raw coordinates, but it needs a mapping to do so.
- 
   typedef typename MultiArrayView < 1 , ic_type > :: const_iterator offset_iterator ;
-  typedef MultiArrayView < dimension + 1 , ele_type >              component_view_type ;
-  typedef typename component_view_type::difference_type            expanded_shape_type ;
-  typedef vigra::TinyVector < ele_type* , channels >               component_base_type ;
-
-#ifdef USE_VC
-
-  // for vectorized operation, we need a few extra typedefs
-  // I use a _v suffix to indicate vectorized types and the prefixes
-  // mc_ for multichannel and nd_ for multidimensional
-
-  /// a simdized type of the elementary type of value_type,
-  /// which is used for coefficients and results. this is fixed via
-  /// the traits class vector_traits (in common.h)
-  
-  typedef typename vector_traits < ele_type > :: type ele_v ;
   
-  /// element count of Simd data types.
-  
-  enum { vsize = ele_v::Size } ;
-
-  /// compatible-sized simdized type of vsize ic_type
-
-  typedef typename vector_traits < ic_type , vsize > :: type ic_v ;
-
-  /// compatible-sized simdized type of vsize rc_type
-
-  typedef typename vector_traits < rc_type , vsize > :: type rc_v ;
-
-  /// multichannel value as SoA, for pixels etc.
-
-  typedef vigra::TinyVector < ele_v , channels > mc_ele_v ;
-
-  /// SoA for nD coordinates/components
-
-  typedef vigra::TinyVector < rc_v , dimension > nd_rc_v ;
-
-  /// SoA for nD shapes (or multidimensional indices)
-
-  typedef vigra::TinyVector < ic_v , dimension > nd_ic_v ;
-
-  /// SoA for multichannel indices (used for gather/scatter operations)
-
-  typedef vigra::TinyVector < ic_v , channels >  mc_ic_v ;
-
-#else
+  typedef vigra::MultiCoordinateIterator<dimension>                nd_offset_iterator ;
   
-  enum { vsize = 1 } ;
+  typedef MultiArrayView < dimension + 1 , ele_type >              component_view_type ;
   
-#endif // USE_VC
+  typedef typename component_view_type::difference_type            expanded_shape_type ;
   
+  typedef vigra::TinyVector < ele_type* , channels >               component_base_type ;
+
   /// object used to map arbitrary incoming coordinates into the defined range
+  /// This is used per default if 'evenness' isn't 0 or 1, in which case raw mapping
+  /// would be used (incoming coordinates are simply split)
 
   typedef nd_mapping < ic_type , rc_type , dimension , vsize > nd_mapping_type ;
 
@@ -610,7 +511,7 @@ public:
 private:
   
   nd_weight_functor fweight ;       ///< set of pointers to weight functors, one per dimension
-  const array_type coefficients ;   ///< b-spline coefficient array
+  const view_type & coefficients ;   ///< b-spline coefficient array
   const shape_type expanded_stride ;                 ///< strides in terms of expanded value_type
   MultiArray < 1 , ic_type > offsets ;               ///< offsets in terms of value_type
   MultiArray < 1 , ic_type > component_offsets ;     ///< offsets in terms of ele_type, for vc op
@@ -647,7 +548,7 @@ public:
   // more involved initialization than the one-liners after the colon... so I'm a bit sloppy here.
   // in the calling code, it's fine to use a const evaluator, that sorts it as well.
 
-  evaluator ( const array_type& _coefficients ,
+  evaluator ( const view_type & _coefficients ,
               nd_mapping_type _mmap ,
               int _spline_degree ,
               derivative_spec_type _derivative = derivative_spec_type ( 0 ) )
@@ -814,13 +715,6 @@ public:
   ///  
   /// I'd write a plain function template and partially specialize it for 'level', but that's
   /// not allowed, so I use a functor instead.
-  // TODO might add code here which receives a set of weights, one per value in the window,
-  // to apply in sequence (zip with window values). The set of weights would be the tensor
-  // product of the weights along each axis. Even more involved code is thinkable where some
-  // of the tensor product is produced and some remains as residual tensor(s). The 'done'
-  // part of the product would then be applied to as many windowed values as it holds and
-  // the weighted sum fed upwards into a recursion where it's treated with a factor from
-  // the tensor for 'one level up'.
   
   template < int level , class dtype >
   struct _eval
@@ -896,54 +790,6 @@ public:
     }
   } ;
 
-  /// _xeval takes an ele_type** 'weight', pointing to level ele_type*, which each
-  /// point to ORDER weights. This variant is used when the per-axis components
-  /// of the weights appear repeatedly,like in mesh grid processing.
-  // TODO it has to be seen if this is actually faster than just copying together
-  // the weights into the 2D MultiArrayView used by the routines above
-  // ... This is hard to vectorize and introduces lots of new code, so I'm not
-  // using it, but it's left in for now
-
-  // tentatively I coded:
-  
-//   template < int level , class dtype >
-//   struct _xeval
-//   {
-//     dtype operator() ( const dtype* & pdata ,
-//                        offset_iterator & ofs ,
-//                        const ele_type** const weight ,
-//                        const int & ORDER
-//                      ) const
-//     {
-//       dtype result = dtype() ;
-//       for ( int i = 0 ; i < ORDER ; i++ )
-//       {
-//         result +=   weight [ level ] [ i ]
-//                   * _eval < level - 1 , dtype >() ( pdata , ofs , weight ) ;
-//       }
-//       return result ;
-//     }
-//   } ;
-// 
-//   template < class dtype >
-//   struct _xeval < 0 , dtype >
-//   {
-//     dtype operator() ( const dtype* & pdata ,
-//                        offset_iterator & ofs ,
-//                        const ele_type** const const weight ,
-//                        const int & ORDER
-//                      ) const
-//     {
-//       dtype result = dtype() ;
-//       for ( int i = 0 ; i < ORDER ; i++ )
-//       {
-//         result += pdata [ *ofs ] * weight [ 0 ] [ i ] ;
-//         ++ofs ;
-//       }
-//       return result ;
-//     }
-//   } ;
-
   // next are evaluation routines. there are quite a few, since I've coded for operation
   // from different starting points and both for vectorized and nonvectorized operation.
   
@@ -963,19 +809,6 @@ public:
     result = _eval<level,value_type>() ( base , ofs , weight ) ;
   }
 
-//   /// x... variant, taking component weights by adress.
-//   /// this variant is used for mesh grid processing
-//   
-//   void xeval ( const ic_type & select ,
-//                const ele_type** const const weight ,
-//                const int & ORDER ,
-//                value_type & result ) const
-//   {
-//     const value_type * base = coefficients.data() + select ;
-//     offset_iterator ofs = offsets.begin() ;
-//     result = _xeval<level,value_type>() ( base , ofs , weight , ORDER ) ;
-//   }
-
   /// 'penultimate' delegate, taking a multidimensional index 'select' to the beginning of
   /// the coefficient window to process. Here the multidimensional index is translated
   /// into an offset from the coefficient array's base adress. Carrying the information as
@@ -995,8 +828,7 @@ public:
   /// window. Note that the weights aren't as many values as the window has coefficients, but
   /// only ORDER weights per dimension; the 'final' weights would be the tensor product of the
   /// sets of weights for each dimension, which we don't explicitly use here. Also note that
-  /// we limit the code here to have the same number of weights for each axis, which might be
-  /// changed to accept differently sized tensors.
+  /// we limit the code here to have the same number of weights for each axis.
   
   void eval ( const nd_ic_type& select ,
               const nd_rc_type& tune ,
@@ -1037,19 +869,6 @@ public:
     }
   }
 
-  /// this variant of eval() takes a split coordinate. This method isn't used
-  /// by other methods in class evaluator but serves as an alternative entry
-  /// point into the evaluation sequence if the calling code already has split
-  /// coordinates. It takes split_coordinate_type as a template argument, taking
-  /// split_type and compact_split_type.
-
-  template < class split_coordinate_type >
-  void eval ( const split_coordinate_type & sc , // presplit coordinate
-              value_type & result ) const
-  {
-    eval ( sc.select , sc.tune , result ) ;
-  }
-
   /// this variant take a coordinate which hasn't been mapped yet, so it uses
   /// the nd_mapping, mmap, and applies it. It's the most convenient form but
   /// also the slowest, due to the mapping, which is quite expensive computationally.
@@ -1106,6 +925,9 @@ public:
     eval ( cc , result ) ;
   }
 
+  /// variant of the above routine returning the result instead of depositing it
+  /// in a reference
+  
   value_type eval ( const rc_type& c ) const
   {
     static_assert ( dimension == 1 ,
@@ -1123,7 +945,7 @@ public:
   /// evaluation using the tensor product of the weight component vectors. This is simple,
   /// no need for recursion here, it's simply the sum of all values in the window, weighted
   /// with their corresponding weights. note that there are no checks; if there aren't enough
-  /// weights th code may crash. Note that this code is formulated as a template and can be
+  /// weights the code may crash. Note that this code is formulated as a template and can be
   /// used both for vectorized and unvectorized operation.
 
   template < class dtype , class weight_type >
@@ -1208,7 +1030,7 @@ public:
                    const MultiArrayView < 2 , ele_type > & weight ,
                    value_type & result ) const
   {
-    // gat a pointer to the coefficient window's beginning
+    // get a pointer to the coefficient window's beginning
     const value_type * base = coefficients.data() + select ;
     // prepare space for the multiplied-out weights
     ele_type flat_weight [ window_size ] ;
@@ -1225,22 +1047,23 @@ public:
 
   /// vectorized version of _eval. This works just about the same way as _eval, with the
   /// only difference being the innner loop over the channels, which is necessary because
-  /// in the vector code we can't code for vectors of, say, pixels.
+  /// in the vector code we can't code for vectors of, say, pixels, but only for vectors of
+  /// elementary types, like float.
   /// to operate with vsize values synchronously, we need a bit more indexing than in the
   /// non-vectorized version. the second parameter, origin, constitutes a gather operand
   /// which, applied to the base adress, handles a set of windows to be processed in parallel.
   /// if the gather operation is repeated with offsetted base addresses, the result vector is
   /// built in the same way as the single result value in the unvectorized code above.
-  /// note that the vectorized routine can't function like this when it comes to
-  /// evaluating unbraced splines: it relies on the bracing and can't do without it, because
+  /// note that the vectorized routine couldn't function like this if it were to
+  /// evaluate unbraced splines: it relies on the bracing and can't do without it, because
   /// it works with a fixed sequence of offsets, whereas the evaluation of an unbraced spline
-  /// might use a different offset sequence for values affected by the boundary condition.
-  /// Nevertheless I have chosen this implementation, as the speed gain by vectorization is
-  /// so large that the extra memory needed for the bracing seems irrelevant. I have to concede,
-  /// though, that this rules out in-place spline generation on a packed array of knot point values -
-  /// either the knot point data come in with bracing space already present, or the operation has
-  /// to use a separate target array.
+  /// would use a different offset sequence for values affected by the boundary condition.
 
+  typedef typename base_type::out_ele_v ele_v ;
+  typedef typename base_type::out_v mc_ele_v ;
+  typedef typename vector_traits < nd_ic_type , vsize > :: type nd_ic_v ;
+  typedef typename vector_traits < nd_rc_type , vsize > :: type nd_rc_v ;
+  
   template < class dtype , int level >
   struct _v_eval
   {
@@ -1296,9 +1119,9 @@ public:
   struct _v_eval_linear
   {
     dtype operator() ( const component_base_type& base , // base adresses of components
-                       const ic_v& origin ,              // offsets to evaluation window origins
-                       offset_iterator & ofs ,           // offsets to coefficients inside this window
-                       const nd_rc_v& tune ) const       // weights to apply
+                       const ic_v& origin ,        // offsets to evaluation window origins
+                       offset_iterator & ofs ,     // offsets to coefficients inside this window
+                       const in_v& tune ) const       // weights to apply
     {
       dtype sum ;    ///< to accumulate the result
       dtype subsum ;
@@ -1327,7 +1150,7 @@ public:
     dtype operator() ( const component_base_type& base , // base adresses of components
                        const ic_v& origin ,              // offsets to evaluation window origins
                        offset_iterator & ofs ,           // offsets to coefficients in this window
-                       const nd_rc_v& tune ) const       // weights to apply
+                       const in_v& tune ) const       // weights to apply
     {
       dtype sum ;
       auto o1 = *ofs ;
@@ -1349,55 +1172,6 @@ public:
     }  
   } ;
 
-//   template < class dtype , int level >
-//   struct _v_xeval
-//   {
-//     dtype operator() ( const component_base_type& base , ///< base adresses of components
-//                        const ic_v& origin ,              ///< offsets to evaluation window origins
-//                        offset_iterator & ofs ,           ///< offsets to coefficients inside this window
-//                        const ele_v ** const weight ,
-//                        const int & ORDER ) const
-//     {
-//       dtype sum = dtype() ;    ///< to accumulate the result
-//       dtype subsum ; ///< to pick up the result of the recursive call
-// 
-//       for ( int i = 0 ; i < ORDER ; i++ )
-//       {
-//         subsum = _v_xeval < dtype , level - 1 >() ( base , origin , ofs , weight , ORDER );
-//         for ( int ch = 0 ; ch < channels ; ch++ )
-//         {
-//           sum[ch] += weight [ level ] [ i ] * subsum[ch] ;
-//         }
-//       }
-//       return sum ;
-//     }  
-//   } ;
-// 
-//   /// the level 0 routine terminates the recursion
-//   
-//   template < class dtype >
-//   struct _v_xeval < dtype , 0 >
-//   {
-//     dtype operator() ( const component_base_type& base , ///< base adresses of components
-//                        const ic_v& origin ,              ///< offsets to evaluation window origins
-//                        offset_iterator & ofs ,           ///< offsets to coefficients in this window
-//                        const ele_v ** const weight ,
-//                        const int & ORDER ) const         ///< weights to apply
-//     {
-//       dtype sum = dtype() ;
-// 
-//       for ( int i = 0 ; i < ORDER ; i++ )
-//       {
-//         for ( int ch = 0 ; ch < channels ; ch++ )
-//         {
-//           sum[ch] += weight [ 0 ] [ i ] * ele_v ( base[ch] , origin + *ofs ) ;
-//         }
-//         ++ofs ;
-//       }
-//       return sum ;
-//     }  
-//   } ;
-
   // vectorized variants of the evaluation routines:
   
   /// this is the vectorized version of the final delegate, calling _v_eval. The first
@@ -1406,9 +1180,9 @@ public:
   /// The second argument is a 2D array of vecorized weights, the third a reference to
   /// the result.
   
-  void v_eval ( const ic_v& select ,  // offsets to lower corners of the subarrays
-                const MultiArrayView < 2 , ele_v > & weight , // vectorized weights
-                mc_ele_v & result ) const
+  void eval ( const ic_v& select ,  // offsets to lower corners of the subarrays
+              const MultiArrayView < 2 , ele_v > & weight , // vectorized weights
+              out_v & result ) const
   {
     // we need an instance of this iterator because it's passed into _v_eval by reference
     // and manipulated by the code there:
@@ -1417,28 +1191,25 @@ public:
     
     // now we can call the recursive _v_eval routine yielding the result
     
-    result = _v_eval < mc_ele_v , level >() ( component_base , select , ofs , weight ) ;
+    result = _v_eval < out_v , level >() ( component_base , select , ofs , weight ) ;
   }
 
-//   /// x... variant, taking componnt weights by adress
-//   /// used for mesh grid processing
-//   
-//   void v_xeval ( const ic_v& select ,
-//                  const ele_v ** const weight ,
-//                  const int & ORDER ,
-//                  mc_ele_v & result ) const
-//   {
-//     offset_iterator ofs = component_offsets.begin() ;
-//     result = _v_xeval < mc_ele_v , level >() ( component_base , select , ofs , weight , ORDER ) ;
-//   }
-// 
-//   /// in the penultimate delegate, we calculate the weights. We also use this
-//   /// method 'further down' when we operate on 'compact split coordinates',
-//   /// which contain offsets and fractional parts.
-
-  void v_eval ( const ic_v& select ,       // offsets to coefficient windows
-                const nd_rc_v& tune ,      // fractional parts of the coordinates
-                mc_ele_v & result ) const  // target
+  /// this variant starts from a set of offsets to coefficient windows, so here
+  /// the nD integral indices to the coefficient windows have already been 'condensed'
+  /// into 1D offsets into the coefficient array's memory.
+  /// Here we have the specializations affected by the template argument 'specialize'
+  /// which activates more efficient code for degree 0 (nearest neighbour) and degree 1
+  /// (linear interpolation) splines. I draw the line here; one might add further
+  /// specializations, but from degree 2 onwards the weights are reused several times
+  /// so looking them up in a small table (as the general-purpose code for unspecialized
+  /// operation does) should be more efficient (TODO test).
+  /// Passing any number apart from 0 or 1 as 'specialize' template argument results
+  /// in the use of the general-purpose code, which can also handle degree 0 and 1
+/// splines, albeit less efficiently.
+  
+  void eval ( const ic_v& select ,       // offsets to coefficient windows
+                const in_v& tune ,      // fractional parts of the coordinates
+                out_v & result ) const  // target
   {
     if ( specialize == 0 )
     {
@@ -1455,7 +1226,7 @@ public:
 
       offset_iterator ofs = component_offsets.begin() ;
     
-      result = _v_eval_linear < mc_ele_v , level >()
+      result = _v_eval_linear < out_v , level >()
                ( component_base , select , ofs , tune ) ;
     }
     else
@@ -1484,176 +1255,68 @@ public:
       
       // having obtained the weights, we delegate to the final delegate.
       
-      v_eval ( select , weight , result ) ;
+      eval ( select , weight , result ) ;
     }
-
-// use_tag is no longer used.
-//     if ( use_tag )
-//     {
-//       // this bit of code is optional, it's only needed if mapping mode TAG is used.
-//       // If use_tag is false and mapping mode TAG is used, out-of-range
-//       // coordinates produce undefined results but the program will not crash.
-// 
-//       for ( int d = 0 ; d < dimension ; d++ )
-//       {
-//         auto mask = ( tune[d] == rc_type ( rc_out_of_bounds ) ) ; // test for oob value
-//         if ( any_of ( mask ) )
-//         {
-//           for ( int ch = 0 ; ch < channels ; ch++ ) // coordinate is oob, set to 0
-//             result[ch] ( mask ) = ele_type ( rv_out_of_bounds ) ;
-//         }
-//       }
-//     }
   }
 
   /// here we transform incoming nD coordinates into offsets into the coefficient
-  /// array's memory.
+  /// array's memory. In my experiments I found that switching from nD indices
+/// to offsets is best done sooner rather than later, even though one might suspect
+  /// that simply passing on the reference to the nD index and converting it 'further down'
+  /// shouldn't make much difference. Probably it's easier for the optimizer to
+  /// see the conversion closer to the emergence of the nD index from the nD real
+  /// coordinate coming in.
   /// note that we use both the strides and offsets appropriate for an expanded array,
-  /// and component_base has pointers to the element type.
+  /// and component_base has pointers to the elementary type.
 
-  void v_eval ( const nd_ic_v& select ,  // nD coordinates to coefficient windows
-                const nd_rc_v& tune ,    // fractional parts of the coordinates
-                mc_ele_v & result ) const
+  void eval ( const nd_ic_v & select , // nD coordinates to coefficient windows
+              const nd_rc_v & tune ,     // fractional parts of the coordinates
+              out_v & result ) const
   {
+    // condense the nD index into an offset
     ic_v origin = select[0] * ic_type ( expanded_stride [ 0 ] ) ;
     for ( int d = 1 ; d < dimension ; d++ )
       origin += select[d] * ic_type ( expanded_stride [ d ] ) ;
     
-    v_eval ( origin , tune , result ) ;
+    // pass on to overload taking the offset
+    eval ( origin , tune , result ) ;
   }
 
-  /// here we take the approach to require the calling function to present pointers to
-  /// vsize input and vsize output values, stored contiguously, so that we can use
-  /// 'standard' gather/scatter operations here with precomputed indexes. Performing the
-  /// (de)interleaving here simplifies matters for the calling code if it has the data
-  /// in contiguous memory. But this is not always the case, for example when the
-  /// data are strided. Anyway, it doesn't hurt to have the option.
-  
-  void v_eval ( const split_coordinate_type * const psc , // pointer to vsize split coordinates
-                value_type * result )  const              // pointer to vsize result values
-  {
-    nd_ic_v select ;
-    nd_rc_v tune ;
-
-    // this is awkward, but if we get split coordinates interleaved like this,
-    // we have to manually deinterleave them unless we make potentially unsafe
-    // assumptions about the size of the components (if they are the same and packed,
-    // we might gather instead...)
-    
-    for ( int vi = 0 ; vi < vsize ; vi++ )
-    {
-      for ( int d = 0 ; d < dimension ; d++ )
-      {
-        select[d][vi] = int ( psc[vi].select[d] ) ;
-        tune[d][vi] = rc_type ( psc[vi].tune[d] ) ;
-      }
-    }
-
-    mc_ele_v v_result ;
-    v_eval ( select , tune , v_result ) ;
-
-    // and deposit it in the memory the caller provides
-    for ( int ch = 0 ; ch < channels ; ch++ )
-      v_result[ch].scatter ( (ele_type*)result , mc_interleave(ch) ) ;
-  }
-
-  /// same for 'compact' split coordinates.
-
-  void v_eval ( const compact_split_coordinate_type* const psc ,
-                value_type * result )  const
-  {
-    ic_v select ;
-    nd_rc_v tune ;
-
-    // this is awkward, but if we get split coordinates interleaved like this,
-    // we have to manually deinterleave them unless we make potentially unsafe
-    // assumptions about the size of the components (if they are the same and packed,
-    // we might gather instead...)
-    
-    for ( int vi = 0 ; vi < vsize ; vi++ )
-    {
-      select[vi] = int ( psc[vi].select ) ;
-      for ( int d = 0 ; d < dimension ; d++ )
-      {
-        tune[d][vi] = rc_type ( psc[vi].tune[d] ) ;
-      }
-    }
-
-    mc_ele_v v_result ;
-    v_eval ( select , tune , v_result ) ;
-
-    // and deposit it in the memory the caller provides
-    for ( int ch = 0 ; ch < channels ; ch++ )
-      v_result[ch].scatter ( (ele_type*)result , mc_interleave(ch) ) ;
-  }
-
-  /// this variant is roughly the same as the one above, but it receives separate
-  /// pointers for the integral and real parts of the coordinates. This is for the
-  /// case where these aspects are stored in different containers. This type of access
-  /// is more efficient, since we can use gather operations
-  
-  void v_eval ( const nd_ic_type * const pi ,  // pointer to integral parts of coordinates
-                const nd_rc_type * const pr ,  // pointer to real parts fo coordinates
-                value_type * result ) const    // pointer to vsize result values
-  {
-    nd_ic_v select ;
-    nd_rc_v tune ;
-
-    // gather the integral and real coordinate parts from the pointer passed in
-    
-    for ( int d = 0 ; d < dimension ; d++ )
-    {
-      select[d].gather ( (ic_type*)pi , nd_interleave(d) )  ;
-      tune[d].gather ( (rc_type*)pr , nd_interleave(d) )  ;
-    }
-
-    mc_ele_v v_result ;
-    v_eval ( select , tune , v_result ) ;
-
-    // and deposit it in the memory the caller provides
-    
-    for ( int ch = 0 ; ch < channels ; ch++ )
-      v_result[ch].scatter ( (ele_type*)result , mc_interleave(ch) ) ;
-  }
-
-  /// same for 'compact' split coordinates
-  
-  void v_eval_c ( const ic_type * const pi ,    // pointer to offsets
-                  const nd_rc_type * const pr , // pointer to real parts fo coordinates
-                  value_type * result ) const   // pointer to vsize result values
-  {
-    ic_v select ;
-    nd_rc_v tune ;
-
-    select.load ( pi )  ;
-      
-    // gather the real coordinate parts from the pointer passed in
-    
-    for ( int d = 0 ; d < dimension ; d++ )
-    {
-      tune[d].gather ( pr , nd_interleave(d) )  ;
-    }
-
-    mc_ele_v v_result ;
-    v_eval ( select , tune , v_result ) ;
-
-    // and deposit it in the memory the caller provides
-    
-    for ( int ch = 0 ; ch < channels ; ch++ )
-      v_result[ch].scatter ( (ele_type*)result , mc_interleave(ch) ) ;
-  }
-
-  /// This variant of v_eval() works directly on vector data (of unsplit coordinates)
+  /// This variant of eval() works directly on vector data (of unsplit coordinates)
   /// This burdens the calling code with (de)interleaving the data. But often the calling
   /// code performs a traversal of a large body of data and is therefore in a better position
   /// to perform the (de)interleaving e.g. by a gather/scatter operation, or already receives
-  /// the data in simdized form.
+  /// the data in simdized form. This latter case is actually quite common, because 'real'
+  /// applications will rarely use class evaluator directly. Instead, the evaluator will usually
+  /// be used as some 'inner' component of another functor, which is precisely why class
+  /// evaluator is implemented as a 'pure' functor, containing only state fixed at construction.
+  /// So whatever deinterleaving and preprocessing and postprocessing and reinterleaving
+  /// may be performed 'outside' is irrelevant here where we receive SIMD data only.
+  /// In this routine, we make the move from incoming real coordinates to separate
+  /// nD integral indices and fractional parts. While this seems like a trivial operation,
+  /// it turns out to be potentially quite complex: depending on the 'meaning' of the
+  /// spline coefficients and on the 'evenness' of the spline's degree, there are several
+  /// alternative paths. The simple 'splitting' of the incoming real coordinate without
+  /// checks or mappings has two variants, one for even splines and one for odd splines.
+  /// If this is all that is required, the evaluator can be used with the template argument
+  /// 'evenness' set to 1 (for even splines) or 0 (for odd splines). This will directly make
+  /// use of the raw mapping routines in mapping.h.
+  /// When this template argument takes any other value, mapping and splitting of the incoming
+  /// coordinates will be done via the 'mmap' object containing pointers to mapping routines.
+/// This looks complicated, but usually an evaluator will be constructed by passing its
+  /// constructor a bspline object, and the bspline object has information about the boundary
+  /// conditions it was created with. What happens then is that a mapping which 'folds'
+  /// the incoming variables into the defined range is used - automatically. So if the bspline
+  /// was set up with, say, MIRROR boundary conditions, the incoming coordinates will be mirrored
+  /// at the bounds. This is a good default to fall back to or start out with; if the calling
+/// code evolves to perform the mapping itself (or does not require any) the 'evenness'
+  /// specialization can be used to avoid the (slightly less efficient) use of the mmap object.
   
-  void v_eval ( const nd_rc_v & input ,    // number of dimensions * coordinate vectors
-                mc_ele_v & result )  const // number of channels * value vectors
+  void eval ( const in_v & input ,    // number of dimensions * coordinate vectors
+                out_v & result )  const // number of channels * value vectors
   {
     nd_ic_v select ;
-    nd_rc_v tune ;
+    in_v tune ;
 
     // map the incoming coordinates to the spline's range
     // here we have another specialization: 'evenness' is an
@@ -1690,89 +1353,32 @@ public:
     else
     {
       // no specialization; fall back to calling mmap
-      std::cerr << "warning: using fallback to old mapping code" << std::endl ;
       mmap ( input , select , tune ) ;
     }
 
-    // delegate to v_eval with split coordinates
+    // delegate to eval with split coordinates
 
-    v_eval ( select , tune , result ) ;
+    eval ( select , tune , result ) ;
   }
 
-//   void new_v_eval ( const nd_rc_v & input ,    // number of dimensions * coordinate vectors
-//                 mc_ele_v & result )  const // number of channels * value vectors
-//   {
-//     ic_v select ;
-//     nd_rc_v tune ;
-// 
-//     // map the incoming coordinates to the spline's range
-// 
-//     split_cordinate_v ( input , select , tune ) ;
-// 
-//     // delegate
-// 
-//     v_eval ( select , tune , result ) ;
-//   }
-
-  /// This variant also operates on unsplit coordinates. Here again we require the calling function
-  /// to pass pointers to vsize input and output values in contiguous memory. The data are
-  /// (de)interleved in this routine, the actual evaluation is delegated to the variant working
-  /// on vectorized data. If dimension == 1, the coordinate data are obtained with a load instruction
-  /// instead of a gather, which should be faster (TODO check) - ditto for channels and storing.
-  // TODO might try v_eval variants taking rc_v& / ele_v& for 1D / 1 channel case instead of
-  // relying on the optimizer to shed the unnecessary TinyVector<T,1> container
-  
-  void v_eval ( const nd_rc_type * const pmc ,  // pointer to vsize muli_coordinates
-                value_type * result ) const     // pointer to vsize result values
-  {
-    nd_rc_v input ;
-    mc_ele_v v_result ;
-
-    // gather the incoming (interleaved) coordinates
-
-    if ( dimension == 1 )
-    {
-      input[0].load ( (const rc_type* const)pmc ) ;
-    }
-    else
-    {
-      for ( int d = 0 ; d < dimension ; d++ )
-        input[d].gather ( (const rc_type* const)pmc , nd_interleave(d) ) ;
-    }
-
-    // call v_eval() for vectorized data
-    
-    v_eval ( input , v_result ) ;
-
-    // and deposit it in the memory the caller provides
-    
-    if ( channels == 1 )
-    {
-      v_result[0].store ( (ele_type*)result ) ;
-    }
-    else
-    {
-      for ( int ch = 0 ; ch < channels ; ch++ )
-        v_result[ch].scatter ( (ele_type*)result , mc_interleave(ch) ) ;
-    }
-  }
+  /// special handling of 1D splines
 
-  void v_eval ( const rc_type * const pmc ,  // pointer to vsize 1D coordinates
-                value_type * result ) const        // pointer to vsize result values
+  void eval ( const rc_type * const pmc ,  // pointer to vsize 1D coordinates
+              value_type * result ) const  // pointer to vsize result values
   {
     static_assert ( dimension == 1 ,
-                    "this v_eval variant is intended for 1D splines only" ) ;
+                    "this eval variant is intended for 1D splines only" ) ;
 
-    nd_rc_v input ;
-    mc_ele_v v_result ;
+    in_v input ;
+    out_v v_result ;
 
     // gather the incoming (interleaved) coordinates
     
     input[0].load ( (const rc_type* const)pmc ) ;
 
-    // call v_eval() for vectorized data
+    // call eval() for vectorized data
     
-    v_eval ( input , v_result ) ;
+    eval ( input , v_result ) ;
 
     // and deposit it in the memory the caller provides
     
@@ -1787,31 +1393,6 @@ public:
     }
   }
 
-  /// mixed form, where input is a vectorized coordinate
-  /// and output goes to interleaved memory
-
-  void v_eval ( const nd_rc_v & input ,  // number of dimensions * coordinate vectors
-                value_type * result ) const    // pointer to vsize result values
-  {
-    mc_ele_v v_result ;
-
-    // call v_eval() for vectorized data
-
-    v_eval ( input , v_result ) ;
-
-    // and deposit result in the memory the caller provides
-
-    if ( channels == 1 )
-    {
-      v_result[0].store ( (ele_type*)result ) ;
-    }
-    else
-    {
-      for ( int ch = 0 ; ch < channels ; ch++ )
-        v_result[ch].scatter ( (ele_type*)result , mc_interleave(ch) ) ;
-    }
-  }
-
 #endif // USE_VC
 
   ~evaluator()
diff --git a/example/impulse_response.cc b/example/impulse_response.cc
index f9e5387..8174dfa 100644
--- a/example/impulse_response.cc
+++ b/example/impulse_response.cc
@@ -68,6 +68,10 @@
 ///
 /// note how three different ways of getting the result are given, the variants
 /// using lower-level access to the filter are commented out.
+///
+/// The array size used here may seem overly large, but this program also serves as
+/// a test for prefiltering 1D arrays with 'fake 2D processing' which only occurs
+/// with large 1D arrays, see filter.h for more on the topic.
 
 // when playing with the filter a bit, I noticed that feeding the filter with the
 // absolute values of the poles produces an impulse response which looks remotely like
@@ -95,44 +99,52 @@ int main ( int argc , char * argv[] )
 {
   int degree = std::atoi ( argv[1] ) ;
   double cutoff = std::atof ( argv[2] ) ;
-  assert ( degree >= 0 && degree < 25 ) ;
   
+  assert ( degree >= 0 && degree < 25 ) ;
   
+  int npoles = degree / 2 ;
+  const double * poles = vspline_constants::precomputed_poles [ degree ] ;
+
 // using the highest-level access to prefiltering, we code:
 
-//   vspline::bspline < double , 1 > bsp ( 10001 , degree , vspline::PERIODIC ) ; // , vspline::EXPLICIT ) ;
-//   auto v1 = bsp.core ;
-//   v1 [ 5000 ] = 1.0 ;
-//   bsp.prefilter() ;
+  vspline::bspline < double , 1 > bsp ( 100001 , degree , vspline::MIRROR ) ;
+  auto v1 = bsp.core ;
+  v1 [ 50000 ] = 1.0 ;
+  bsp.prefilter() ;
 
 // using slightly lower-level access to the prefiltering code, we could achieve
 // the same result with:
-
-//   vigra::MultiArray < 1 , double > v1 ( 10001 ) ;
-//   v1[5000] = 1.0 ;
-//   vspline::solve_vc ( v1 , v1 , vspline::MIRROR , degree , 0.00001 ) ;
+// 
+//   typedef vigra::MultiArray < 1 , double > array_t ;
+//   vigra::TinyVector < vspline::bc_code , 1 > bcv ;
+//   bcv[0] = vspline::MIRROR ;
+//   array_t v1 ( 100001 ) ;
+//   v1[50000] = 1.0 ;
+//   vspline::solve < array_t , array_t , double >
+//     ( v1 , v1 , bcv , degree , 0.000000000001 ) ;
   
 // and, going yet one level lower, this code also produces the same result:
   
-  vigra::MultiArray < 1 , double > v1 ( 10001 ) ;
-  v1[5000] = 1.0 ;
-  typedef decltype ( v1.begin() ) iter_type ;
-  vspline::filter < iter_type , iter_type , double >
-    f ( v1.size() ,
-        vspline::overall_gain ( 1 , precomputed_poles [ degree ] ) ,
-        vspline::MIRROR ,
-        1 ,
-        precomputed_poles [ degree ] ,
-        0.00001 ) ;
-  f.solve ( v1.begin() ) ;
+//   vigra::MultiArray < 1 , double > v1 ( 100001 ) ;
+//   v1[50000] = 1.0 ;
+//   typedef decltype ( v1.begin() ) iter_type ;
+//   vspline::filter < iter_type , iter_type , double >
+//     f ( v1.size() ,
+//         vspline::overall_gain ( npoles , poles ) ,
+//         vspline::MIRROR ,
+//         npoles ,
+//         poles ,
+//         0.000000000001 ) ;
+//   f.solve ( v1.begin() ) ;
         
   std::cout << "double ir_" << degree << "[] = {" << std::endl ;
   std::cout << std::fixed << std::showpos << std::showpoint << std::setprecision(16);
-  int k ;
-  for ( k = 0 ; k < 10001 ; k++ )
+
+  for ( int k = 0 ; k < 100001 ; k++ )
   {
     if ( fabs ( v1[k] ) > cutoff )
+    {
       std::cout << v1[k] << " ," << std::endl ;
+    }
   }
-  std::cout << " } ;" << std::endl ;
-}
\ No newline at end of file
+}
diff --git a/example/pano_extract.cc b/example/pano_extract.cc
index a6c7028..050557f 100644
--- a/example/pano_extract.cc
+++ b/example/pano_extract.cc
@@ -42,6 +42,9 @@ using namespace vspline ;
 #include <vigra/impex.hxx>
 #include <vigra/impexalpha.hxx>
 
+#include <unistd.h>
+#include <stdlib.h>
+
 #define PRINT_ELAPSED
 
 #ifdef PRINT_ELAPSED
@@ -51,6 +54,14 @@ using namespace vspline ;
 
 #include <vigra/quaternion.hxx>
 
+typedef vigra::RGBValue<float,0,1,2> pixel_type; 
+typedef vigra::RGBValue<UInt16,0,1,2> pixel_type_ui16 ; 
+typedef vigra::MultiArray<2, pixel_type> array_type ;
+typedef vigra::MultiArrayView<2, pixel_type> view_type ;
+typedef vigra::TinyVector < float , 2 > coordinate_type ;
+
+#define VSIZE (vspline::vector_traits<float>::size)
+
 /// concatenation of roll, pitch and yaw into a single quaternion
 /// we use a right-handed coordinate system: the positive x axis points towards us,
 /// the positive y axis to the right and the positive z axis up. the subjective
@@ -112,15 +123,16 @@ rotate_q ( const TinyVector < rc_type , 3 > & v , Quaternion<rc_type> q )
 /// tf_spherical_rectilinear transforms incoming (target) image coordinates to corresponding
 /// spherical coordinates into the sherical panoramic image. The main coding effort
 /// for the intended image transformation is in this functor. The strategy is this:
-/// with the transformation-based remap, we receive 2D coordinate into the target image.
+/// with the index-based remap, we receive 2D discrete coordinates into the target image.
 /// We try and precalculate everything we can from the constructor parameters and 'load'
 /// the functor with the precalculated values, leaving only the steps which depend on
 /// the incoming cordinates to the operator() of the functor. Instead of rotating
 /// every incoming coordinate wit yaw, pitch and roll, we only rotate three points:
 /// the top left, lower left and upper right of the target image. While we are using
 /// cartesian coordinates, we can now simply generate the transformed cartesian coordinates
-/// by following straight lines with dx, dy and dz derived from the three rotated points.
-/// Once we have the cartesian coordinates, we only have to transform them to spherical
+/// by following straight lines with dx, dy and dz derived from the three rotated points,
+/// which is easily done by scaling and translating the incoming discrete coordinates.
+/// Once we have the cartesian 3D coordinates, we only have to transform them to spherical
 /// coordinates and do a bit of scaling and shifting to receive coordinates into the
 /// panoramic source image. Note that in this code we model pixels as small rectangular
 /// shapes. If an image's width is w, it has w pixels along the x axis. The 'position'
@@ -202,7 +214,7 @@ public:
   /// The intermediate 3D processing is encapsuled in the functor.
 
   void operator() ( const coordinate_type & c_in ,
-                          coordinate_type & c_out )
+                          coordinate_type & c_out ) const
   {
     rc_type x = c_in[0] ; // incoming coordinates (into the target)
     rc_type y = c_in[1] ;
@@ -241,7 +253,7 @@ public:
 
   template < class rc_v >
   void operator() ( const TinyVector < rc_v , 2 > & c_in ,
-                          TinyVector < rc_v , 2 > & c_out )
+                          TinyVector < rc_v , 2 > & c_out ) const
   {
     rc_v x = c_in[0] ;
     rc_v y = c_in[1] ;
@@ -270,9 +282,84 @@ public:
 
 } ;
 
+typedef vigra::TinyVector < float , 2 > coordinate_type ;
+
+typedef
+vspline::unary_functor
+  < coordinate_type , pixel_type , VSIZE >
+  interpolator_base_type ;
+
+/// same for the alpha channel, here we have single float pixels
+
+typedef
+vspline::unary_functor
+  < coordinate_type , float , VSIZE >
+  alpha_interpolator_base_type ;
+
+// struct projector wraps inner_type, applying the projection of type projection_type
+// to incoming coordinates before passing them on to inner_type's evaluation routine.
+// This is a good example for my wrappping scheme: The constructor fixes const
+// references to both the projection and the inner type. the calls to eval and eval
+// simply apply the projection and pass on to inner_type's eval routine.
+// Note how class vspline::interpolator's type system is pulled in via the macro
+// using_interpolator_types(). While the sudden appearance of a bunch of types
+// is mildly surprising, using this macro saves a lot of declarations and focusses
+// the code on it's actual function.
+// So the flow of the data is like this:
+// - we have incoming discrete coordinates fed by index_remap
+// - these are transformed to target (spherical) coordinates by class projection_type
+// - the transformed coordinates are fed to inner_type, which is a b-spline evaluator
+// - the result of the b-spline evaluation is deposited in 'result'
+// - result is a variable in one of the eval variants pulled in from
+//   class vspline::interpolator. It's content is interleaved and written
+//   to the target array by this eval variant.
+
+template < class inner_type , class projection_type , class base_type >
+struct projector
+: public base_type
+{
+  const inner_type & inner ;
+  const projection_type & projection ;
+  
+  using_unary_functor_types(base_type) // pull in vspline::interpolator's types
+  
+  projector ( const inner_type & _inner ,
+              const projection_type & _projection )
+  : inner ( _inner ) ,
+    projection ( _projection )
+  { } ;
+  
+  // unvectorized operation:
+
+  void eval ( const in_type & c ,
+                    out_type & result ) const
+  {
+    in_type cc ;
+    projection ( c , cc ) ;
+    inner.eval ( cc , result ) ;
+  }
+
+#ifdef USE_VC
+
+  // we need to overload this variant of eval, since it's pure virtual
+  // in the base class and defines the actual data processing. This routine
+// is in turn used by vspline::unary_functor's routine depositing the
+  // (interleaved) data in the target array. 
+
+  void eval ( const in_v & c ,    // simdized nD coordinate
+                    out_v & result ) const  // simdized value_type
+  {
+    in_v cc ;
+    projection ( c , cc ) ;
+    inner.eval ( cc , result ) ;
+  }
+
+#endif
+} ;
+
 // To present a program which is useful and 'correct' in photography terms,
 // we have to deal with gamma and colour space. We make the silent assumption
-// that incomning data with unsigned char and unsigned short pixels are in sRGB,
+// that incoming data with unsigned char and unsigned short pixels are in sRGB,
 // while other incoming types are linear RGB already.
 
 #include <vigra/colorconversions.hxx>
@@ -328,7 +415,8 @@ void e_to_l_rgb ( ele_type& component )
 
   component = ( component <= 0.04045 ) 
               ? max * component / 12.92
-              : max * pow ( (component + 0.055) / 1.055 , 2.4 ) ;
+//               : max * pow ( (component + 0.055) / 1.055 , 2.4 ) ;
+              : max * exp (  2.4 * log ( (component + 0.055) / 1.055 ) ) ;
 }
 
 #ifdef USE_VC
@@ -340,14 +428,11 @@ void v_to_l_rgb ( Vc::Vector < entry_type > & v )
 {
   const entry_type max ( _max ) ;
   
-  v *= entry_type ( 1.0 / max ) ;
-
-  auto mask = ( v <= entry_type ( 0.04045 ) ) ;
-  v ( mask ) *= entry_type ( max / 12.92 ) ;
-  v ( ! mask ) = entry_type ( max )
-                 * exp (    entry_type ( 2.4 )
-                          * log ( ( v + entry_type ( 0.055 ) ) * entry_type ( 1.0 / 1.055 ) )
-                       ) ;
+  v /= max ;
+                    
+  auto mask = ( v <= 0.04045f ) ;
+  v ( mask ) = max * v / 12.92f ;
+  v ( ! mask ) = max * exp ( 2.4f * log ( ( v + 0.055f ) / 1.055f ) ) ;
 }
 
 template < typename entry_type , int _max = 255 >
@@ -355,19 +440,38 @@ void r_v_to_l_rgb ( shape_range_type < 2 > range ,
                     vigra::MultiArrayView < 2 , entry_type > * p_view )
 {
   auto view = p_view->subarray ( range[0] , range[1] ) ;
-  auto p_data = view.data() ;
   int nv = view.size() / Vc::Vector < entry_type > :: Size ;
+  auto iter = view.begin() ;
   
   for ( int i = 0 ; i < nv ; i++ )
   {
-    Vc::Vector < entry_type > v ( p_data ) ;
-    v_to_l_rgb ( v ) ;
-    v.store ( p_data ) ;
-    p_data += Vc::Vector < entry_type > :: Size ;
+    Vc::Vector < entry_type > v ;
+    for ( int e = 0 ; e < Vc::Vector < entry_type > :: Size ; e++ )
+    {
+      v[e] = iter[e] ;
+    }
+    v_to_l_rgb < entry_type , _max > ( v ) ;
+    for ( int e = 0 ; e < Vc::Vector < entry_type > :: Size ; e++ )
+    {
+      *iter = v[e] ;
+      iter++ ;
+    }
   }
   for ( int i = nv * Vc::Vector < entry_type > :: Size ; i < view.size() ; i++ )
-    e_to_l_rgb ( view[i] ) ;
+    e_to_l_rgb < entry_type , _max > ( view[i] ) ;
+}
+
+#else
+
+template < typename entry_type , int _max = 255 >
+void r_v_to_l_rgb ( shape_range_type < 2 > range ,
+                    vigra::MultiArrayView < 2 , entry_type > * p_view )
+{
+  auto view = p_view->subarray ( range[0] , range[1] ) ;
+  for ( int i = 0 ; i < view.size() ; i++ )
+    e_to_l_rgb < entry_type , _max > ( view[i] ) ;
 }
+
 // ouch... Vc does not provide pow(), so we use
 // pow(x,y) = exp^(y*lnx)
 
@@ -391,7 +495,7 @@ void r_v_to_l_rgb ( shape_range_type < 2 > range ,
 /// marginal.
 ///
 /// Once the spline is ready, the transformation functor is set up with the parameters
-/// passed into main. Finally, tf_remap is called to do the processing.
+/// passed into main. Finally, index_remap is called to do the processing.
 ///
 /// process_image also handles images with an alpha channel. The alpha channel is also
 /// mapped to the target image (if it's present), but for the alpha channel it only uses
@@ -409,6 +513,8 @@ void process_image ( const char * name ,
                      const char * output
                    )
 {
+  typedef vigra::Shape2 shape_type ;
+  
   // first we have to read and preprocess the source data. The naive way of
   // doing this would be to read the image into some buffer and then erect a
   // b-spline over this buffer. But class b-spline is designed to offer a better
@@ -437,16 +543,10 @@ void process_image ( const char * name ,
     throw not_supported ( "can only process at most one extra channel" ) ;
   }
 
-  typedef vigra::RGBValue<rc_type,0,1,2> pixel_type; 
-  typedef vigra::RGBValue<UInt16,0,1,2> pixel_type_ui16 ; 
-  typedef vigra::MultiArray<2, pixel_type> array_type ;
-  typedef vigra::MultiArrayView<2, pixel_type> view_type ;
-  typedef typename view_type::difference_type shape_type ;
-
   TinyVector < bc_code , 2 > bcv = { PERIODIC , SPHERICAL } ;
 
   // find out the image's shape
-  shape_type core_shape = imageInfo.shape() ;
+  auto core_shape = imageInfo.shape() ;
   // create a suitable bspline object
   bspline < pixel_type , 2 > bspl ( core_shape , spline_degree , bcv , EXPLICIT ) ;
   // get the view to the core coefficient area (to put the image data there)
@@ -470,13 +570,13 @@ void process_image ( const char * name ,
     // which we can use as a container for a degree-1 b-spline. This container array
     // will be a bit larger than the core shape, due to the boundary conditions used.
     // we have a convenience function yielding this shape:
-    shape_type alpha_shape = bspline<float,2>::container_size ( core_shape , 1 , bcv ) ;
+    auto alpha_shape = bspline<float,2>::container_size ( core_shape , 1 , bcv ) ;
     // only if the alpha channel is actually present, we fill the array and the view
     // above with life:
     vigra::MultiArray<2,float> target ( alpha_shape ) ;
     alpha_channel.swap ( target ) ; // swap data into alpha_channel
     // get a view to it's core area
-    shape_type left_corner = bracer<view_type>::left_corner ( bcv , 1 ) ;
+    auto left_corner = bracer<view_type>::left_corner ( bcv , 1 ) ;
     alpha_view = alpha_channel.subarray ( left_corner , left_corner + core_shape ) ;
   }
 
@@ -494,6 +594,8 @@ void process_image ( const char * name ,
   TinyVector < rc_type , 3 > max_value ;
   int component_max ;
   
+#define GAMMA
+
   if ( enum_pixel_type == vigra::ImageImportInfo::UINT8 )
   {
     // for UINT8 pixels, we use can degamma with a lookup table, which is less
@@ -522,12 +624,13 @@ void process_image ( const char * name ,
 
     // vectorization doesn't work for unsigned char, but we can multithread:
 
-//     divide_and_conquer < buffer_view > :: run ( buffer , degamma ) ;
-
     buffer_view bv ( buffer ) ;
     shape_range_type < 2 > range ( shape_type() , bv.shape() ) ;
+    
+#ifdef GAMMA
     multithread ( &rdegamma , ncores , range , &bv ) ;
-
+#endif
+    
     // still we need to work on floating point data
 
     auto it = core.begin() ;
@@ -564,20 +667,15 @@ void process_image ( const char * name ,
     // silently assumes input actually is sRGB...
     // this conversion takes quite a lot of time!
 
-    MultiArrayView < 2 , rc_type > core_flat_view ( vigra::Shape2 ( core.shape(0) * 3 , core.shape(1) ) ,
-                                                    (rc_type*) ( core.data() ) ) ;
+    MultiArrayView < 2 , rc_type > core_flat_view
+     ( vigra::Shape2 ( core.shape(0) * 3 , core.shape(1) ) ,
+       (rc_type*) ( core.data() ) ) ;
                                                     
     shape_range_type < 2 > range2 ( shape_type() , core_flat_view.shape() ) ;
     
-#ifdef USE_VC
+#ifdef GAMMA
     multithread ( &r_v_to_l_rgb<rc_type,65535> , ncores , range2 , &core_flat_view ) ;
-#endif
-    
-// #ifdef USE_VC
-//     divide_and_conquer < decltype(core) > :: run ( core , v_to_l_rgb<rc_type,65535> ) ;
-// #else
-//     divide_and_conquer < decltype(core) > :: run ( core , to_l_rgb<pixel_type,65535> ) ;
-// #endif
+#endif    
     component_max = 65535 ;
   }
   else
@@ -605,6 +703,7 @@ void process_image ( const char * name ,
   // we determine minimum and maximum values for every channel to use saturation
   // arithmetics further down
   // TODO: this also looks at transparent pixels...
+  // TODO this is slow!
 
   for ( int ch = 0 ; ch < 3 ; ch++ )
   {
@@ -655,30 +754,6 @@ void process_image ( const char * name ,
                                             pitch ,            // pitch of same
                                             roll ) ;           // roll of same
 
-#ifdef USE_VC
-
-  // this looks funny - passing in the same transform twice, both for scalar and for vector
-  // operation? but tf_se is a functor with overloaded operator(), and as the accepted types
-  // of std::function are encoded in vspline::transformation's two-argument constructor's
-  // signature, the correct overload is picked for each case. Note that using the single-argument
-  // constructor would also work here, but result in broadcasting the scalar routine to the vector
-  // data, resulting in slower operation. The one-argument form is only a crutch if vectorized
-  // code can't be had.
-
-  vspline::transformation < rc_type , 2 , 2 ,
-                            vspline::vector_traits < float > :: vsize >
-    tf ( tf_se , tf_se ) ;
-
-#else
-
-  // ... or if Vc can't be used. In this case only the single-argument constructor
-  // can be used:
-
-  vspline::transformation < rc_type , 2 , 2 >
-    tf ( tf_se ) ;
-
-#endif
-
   // this usually takes the least amount of time:
 
   cout << "creating target image... " << std::flush ;
@@ -690,11 +765,15 @@ void process_image ( const char * name ,
   typedef evaluator < 2 , pixel_type , rc_type , int > eval_type ;
   eval_type ev ( bspl ) ;
   
-  for ( int times = 0 ; times < 1 ; times++ )
-  {
-    vspline::tf_remap < eval_type , 2 , 2 >
-      ( ev , tf , result ) ;
-  }
+  typedef projector < eval_type ,
+                      tf_spherical_rectilinear<float> ,
+                      interpolator_base_type > prj_type ;
+  
+  prj_type prj ( ev , tf_se ) ;
+
+  // now we can invoke the processing chain in prj for all pixels in 'result':
+
+  vspline::index_remap < prj_type , 2 > ( prj , result ) ;
 
   if ( extra_bands )
   {
@@ -714,22 +793,20 @@ void process_image ( const char * name ,
     // we need a slightly different transformation here, working on single floats
     // instead of pixels of three RGB values
 
-#ifdef USE_VC
-    vspline::transformation < rc_type , 2 , 2 ,
-                              vspline::vector_traits < float > :: vsize >
-      tf_alpha ( tf_se , tf_se ) ;
-#else
-    vspline::transformation < rc_type , 2 , 2 >
-      tf_alpha ( tf_se ) ;
-#endif
-
     // create an evaluator for bspl_alpha
     typedef evaluator < 2 , float , rc_type , int > alpha_ev_type ;
     alpha_ev_type ev_alpha ( bspl_alpha ) ;
 
+    typedef projector < alpha_ev_type ,
+                        tf_spherical_rectilinear<float> ,
+                        alpha_interpolator_base_type >
+      alpha_prj_type ;
+    
+    alpha_prj_type alpha_prj ( ev_alpha , tf_se ) ;
+  
     // now we do a transformation-based remap of the alpha channel
-    vspline::tf_remap < alpha_ev_type , 2 , 2 >
-      ( ev_alpha , tf_alpha , alpha_result ) ;
+    vspline::index_remap < alpha_prj_type , 2 >
+      ( alpha_prj , alpha_result ) ;
   }
 
 #ifdef PRINT_ELAPSED
@@ -760,6 +837,7 @@ void process_image ( const char * name ,
   // next we convert back from linear RGB to sRGB using vigra's functor for convenience.
   // TODO: for now, this is neither multithreaded nor vectorized...
 
+#ifdef GAMMA
   if (    enum_pixel_type == vigra::ImageImportInfo::UINT8
        || enum_pixel_type == vigra::ImageImportInfo::UINT16 )
   {
@@ -767,6 +845,7 @@ void process_image ( const char * name ,
     for ( auto & pix : result )
       pix = to_gamma ( pix ) ;
   }
+#endif
   // and export with a forced range mapping to avoid vigra's automatic mapping of the
   // brightness values to 0...max.
   // TODO .setForcedRangeMapping is not in vigra documentation
diff --git a/example/roundtrip.cc b/example/roundtrip.cc
index f28c52d..a7ca2a7 100644
--- a/example/roundtrip.cc
+++ b/example/roundtrip.cc
@@ -32,13 +32,14 @@
 /// roundtrip.cc
 ///
 /// load an image, create a b-spline from it, and restore the original data,
-/// both by normal evaluation and by convolution with the restoration kernel.
-/// all of this is done 100 times each with different boundary conditions,
-/// spline degrees and in float and double arothmetic, the processing times
+/// both by normal evaluation and by convolution with the reconstruction kernel.
+/// all of this is done 16 times each with different boundary conditions,
+/// spline degrees and in float and double arithmetic, the processing times
 /// and differences between input and restored signal are printed to cout.
 ///
 /// compile with:
-/// g++ -std=c++11 -o roundtrip -O3 -pthread -DUSE_VC=1 roundtrip.cc -lvigraimpex -lVc
+/// clang++ -std=c++11 -march=native -o roundtrip -O3 -pthread -DUSE_VC=1 roundtrip.cc -lvigraimpex -lVc
+/// g++ also works.
 
 #include <vspline/vspline.h>
 
@@ -57,31 +58,9 @@
 
 using namespace std ;
 
-namespace detail
-{
 using namespace vigra ;
 
-/// we use identity transformations on coordinates. Incoming coordinates will be
-/// translated straight to outgoing coordinates. If we use such a transformation
-/// we'd expect to recover the input.
-
-template < class coordinate_type > // float or double for coordinates
-void tf_identity ( const TinyVector < coordinate_type , 2 > & c_in ,
-                         TinyVector < coordinate_type , 2 > & c_out )
-{
-  c_out = c_in ;
-}
-
-// #ifdef USE_VC
-// 
-// template < class rc_v >
-// void vtf_identity ( const TinyVector < rc_v , 2 > & c_in ,
-//                           TinyVector < rc_v , 2 > & c_out )
-// {
-//   c_out = c_in ;
-// }
-// 
-// #endif
+/// check for differences between two arrays
 
 template < class view_type >
 void check_diff ( const view_type& a , const view_type& b )
@@ -90,7 +69,7 @@ void check_diff ( const view_type& a , const view_type& b )
   using namespace vigra::acc;
   
   typedef typename view_type::value_type value_type ;
-  typedef typename ExpandElementResult < value_type > :: type real_type ;
+  typedef typename vigra::ExpandElementResult < value_type > :: type real_type ;
   typedef MultiArray<2,real_type> error_array ;
 
   error_array ea ( vigra::multi_math::squaredNorm ( b - a ) ) ;
@@ -101,7 +80,7 @@ void check_diff ( const view_type& a , const view_type& b )
 }
 
 template < class view_type , typename real_type , typename rc_type >
-void roundtrip ( view_type & data ,
+void run_test ( view_type & data ,
                  vspline::bc_code bc ,
                  int DEGREE ,
                  bool use_vc ,
@@ -118,7 +97,7 @@ void roundtrip ( view_type & data ,
   // considers appropriate for a given real_type, which is the elementary
   // type of the (pixel) data we process:
   
-  const int vsize = vspline::vector_traits < real_type > :: vsize ;
+  const int vsize = vspline::vector_traits < real_type > :: size ;
   
   // for vectorized coordinates, we use simdized coordinates with as many
   // elements as the simdized values hold:
@@ -141,6 +120,8 @@ void roundtrip ( view_type & data ,
   bsp.core = data ;
 //   cout << "created bspline object:" << endl << bsp << endl ;
   
+  // first test: time prefilter
+
 #ifdef PRINT_ELAPSED
   std::chrono::system_clock::time_point start = std::chrono::system_clock::now();
   std::chrono::system_clock::time_point end ;
@@ -163,43 +144,29 @@ void roundtrip ( view_type & data ,
   // get a view to the core coefficients (those which aren't part of the brace)
   view_type cfview = bsp.core ;
 
-  // create an evaluator
+  // get evaluator type
   typedef vspline::evaluator<2,pixel_type,rc_type,int_type> eval_type ;
 
   // create the evaluator
   eval_type ev ( bsp ) ;
 
-  // get the coordinate and split coordinate types from the evaluator
+  // get the coordinate type from the evaluator
   typedef typename eval_type::nd_rc_type coordinate_type ;
-  typedef typename eval_type::split_coordinate_type split_type ;
-  
-//   typedef vspline::nd_mapping < int , rc_type , 2 , vsize > mmap ;
   
-  // obtain the mapping from the evaluator:
-  const auto map = ev.get_mapping() ;
-
+  // get type for coordinate array
   typedef vigra::MultiArray<2, coordinate_type> coordinate_array ;
   
-  typedef vigra::MultiArray<2, split_type> warp_array ;
-  typedef vigra::MultiArrayView<2, split_type> warp_view ;
-
-//   typedef vspline::warper < int , rc_type , 2 , 2 > warper_type ;
-
   int Tx = Nx ;
   int Ty = Ny ;
 
   // now we create a warp array of coordinates at which the spline will be evaluated.
-  // We create two versions of the warp array: one storing ordinary real coordinates
-  // (fwarp) and the other storing pre-split coordinates. Also create a target array
-  // to contain the result.
+  // Also create a target array to contain the result.
 
   coordinate_array fwarp ( Shape ( Tx , Ty ) ) ;
-  warp_array _warp ( Shape(Tx+5,Ty+4) ) ;
-  warp_view warp = _warp.subarray ( Shape(2,1) , Shape(-3,-3) ) ;
   array_type _target ( Shape(Tx,Ty) ) ;
   view_type target ( _target ) ;
   
-  rc_type dfx = 0.0 , dfy = 0.0 ;
+  rc_type dfx = 0.0 , dfy = 0.0 ; // currently evaluating right at knot point locations
   
   for ( int times = 0 ; times < 1 ; times++ )
   {
@@ -209,33 +176,16 @@ void roundtrip ( view_type & data ,
       for ( int x = 0 ; x < Tx ; x++ )
       {
         rc_type fx = (rc_type)(x) + dfx ;
-        // store the 'ordinary' cordinate to fwarp[x,y]
+        // store the coordinate to fwarp[x,y]
         fwarp [ Shape ( x , y ) ] = coordinate_type ( fx , fy ) ;
-        // and apply the mapping to (fx, fy), storing the result to warp[x,y]
-        map ( fx , warp [ Shape ( x , y ) ] , 0 ) ;
-        map ( fy , warp [ Shape ( x , y ) ] , 1 ) ;
       }
     }
   }
  
-// oops... currently we can't remap from presplit coordinates!
-// #ifdef PRINT_ELAPSED
-//   start = std::chrono::system_clock::now();
-// #endif
-//   
-//   for ( int times = 0 ; times < TIMES ; times++ )
-//     vspline::remap < eval_type , 2 > ( ev , warp , target , use_vc ) ;
-// 
-//   
-// #ifdef PRINT_ELAPSED
-//   end = std::chrono::system_clock::now();
-//   cout << "avg " << TIMES << " x remap1 from pre-split coordinates: "
-//        << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
-//        << " ms" << endl ;
-// #endif
-//        
-//   check_diff<view_type> ( target , data ) ;
-  
+  // second test: perform a remap using fwarp as warp array. Since fwarp contains
+  // the discrete coordinates to the knot points, converted to float, the result
+  // should be the same as the input within the given precision
+
 #ifdef PRINT_ELAPSED
   start = std::chrono::system_clock::now();
 #endif
@@ -247,14 +197,16 @@ void roundtrip ( view_type & data ,
   
 #ifdef PRINT_ELAPSED
   end = std::chrono::system_clock::now();
-  cout << "avg " << TIMES << " x remap1 from unsplit coordinates:.. "
+  cout << "avg " << TIMES << " x remap from unsplit coordinates:... "
        << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
        << " ms" << endl ;
 #endif
        
   check_diff<view_type> ( target , data ) ;
-  
-// oops.. currently I have problems with remaps with internal splines!
+
+  // third test: do the same with the remap routine which internally creates
+  // a b-spline ('one-shot remap')
+
 #ifdef PRINT_ELAPSED
   start = std::chrono::system_clock::now();
 #endif
@@ -273,52 +225,22 @@ void roundtrip ( view_type & data ,
 
   check_diff<view_type> ( target , data ) ;
 
-#ifdef USE_VC
-
-  vspline::transformation < rc_type , 2 , 2 , vsize >
-    tf ( tf_identity<rc_type> , tf_identity<rc_v> ) ;
-
-#else
- 
-  // using this call when USE_VC is defined would result in broadcasting
-  // of the single-element coordinate transform. The effect is the same,
-  // but the code is potentially slower.
-
-  vspline::transformation < rc_type , 2 , 2 , 1 >
-    tf ( tf_identity<rc_type> ) ;
-
-#endif
-
-#ifdef PRINT_ELAPSED
-  start = std::chrono::system_clock::now();
-#endif
-  
-  for ( int times = 0 ; times < TIMES ; times++ )
-    vspline::tf_remap < pixel_type , pixel_type , rc_type , 2 , 2 >
-      ( data , tf , target , bcv , DEGREE , use_vc ) ;
- 
-  // note:: with REFLECT this doesn't come out right, because of the .5 difference!
-      
-#ifdef PRINT_ELAPSED
-  end = std::chrono::system_clock::now();
-  cout << "avg " << TIMES << " x remap with functor & internal bspl "
-       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
-       << " ms" << endl ;
-#endif
+  // fourth test: perform an index_remap directly using the b-spline evaluator
+  // as the index_remap's interpolator. This is, yet again, the same, because
+  // it evaluates at all discrete positions, but now without the warp array:
+  // the index_remap feeds the evaluator with the discrete coordinates.
 
 #ifdef PRINT_ELAPSED
   start = std::chrono::system_clock::now();
 #endif
   
   for ( int times = 0 ; times < TIMES ; times++ )
-    vspline::tf_remap < eval_type , 2 >
-      ( ev , tf , target , use_vc ) ;
+    vspline::index_remap < eval_type , 2 >
+      ( ev , target , use_vc ) ;
 
-  // note:: with REFLECT this doesn't come out right, because of the .5 difference!
-      
 #ifdef PRINT_ELAPSED
   end = std::chrono::system_clock::now();
-  cout << "avg " << TIMES << " x remap with functor & external bspl "
+  cout << "avg " << TIMES << " x index_remap with external bspl... "
        << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
        << " ms" << endl ;
 #endif
@@ -354,25 +276,28 @@ void process_image ( char * name )
   typedef vigra::MultiArray<2, pixel_type> array_type ;
   typedef vigra::MultiArrayView<2, pixel_type> view_type ;
 
-//   // to test that strided data are processed correctly, we load the image
-//   // to an inner subarray of containArray
-//   
+  // to test that strided data are processed correctly, we load the image
+  // to an inner subarray of containArray
+  
 //   array_type containArray(imageInfo.shape()+vigra::Shape2(3,5));
 //   view_type imageArray = containArray.subarray(vigra::Shape2(1,2),vigra::Shape2(-2,-3)) ;
   
+  // alternatively, just use the same for both
+  
   array_type containArray ( imageInfo.shape() );
   view_type imageArray ( containArray ) ;
   
   vigra::importImage(imageInfo, imageArray);
   
   // test these bc codes:
+
   vspline::bc_code bcs[] =
   {
     vspline::MIRROR ,
     vspline::REFLECT ,
     vspline::NATURAL ,
     vspline::PERIODIC
- } ;
+  } ;
 
   for ( int b = 0 ; b < 4 ; b++ )
   {
@@ -380,85 +305,33 @@ void process_image ( char * name )
     for ( int spline_degree = 0 ; spline_degree < 8 ; spline_degree++ )
     {
       cout << "testing bc code " << vspline::bc_name[bc]
-          << " spline degree " << spline_degree << endl ;
-      roundtrip < view_type , real_type , rc_type > ( imageArray , bc , spline_degree , false ) ;
+           << " spline degree " << spline_degree << endl ;
+      run_test < view_type , real_type , rc_type > ( imageArray , bc , spline_degree , false ) ;
 #ifdef USE_VC
       cout << "testing bc code " << vspline::bc_name[bc]
-          << " spline degree " << spline_degree << " using Vc" << endl ;
-      roundtrip < view_type , real_type , rc_type > ( imageArray , bc , spline_degree , true ) ;
+           << " spline degree " << spline_degree << " using Vc" << endl ;
+      run_test < view_type , real_type , rc_type > ( imageArray , bc , spline_degree , true ) ;
 #endif
     }
   }
 }
-} ; // namespace detail
 
 int main ( int argc , char * argv[] )
 {
-//   cout << fixed << showpoint ;
-//   
-//   long nops = 2 ; // number of operations in the inner loop
-//   long hz = 1500000000 ;
-//   long vsize = Vc::float_v::Size ;
-//   long flops = hz * vsize * nops ;
-//   
-//   long num1 = 1 , num2 = 2 , z = 2 ;
-//   
-//   for ( ; z < 10000L ; z = num1 + num2 , num1 = num2 , num2 = z )
-//   {
-//     long times = flops / ( z * vsize * nops ) ;
-//     cout << flops << " flops: processing " << z << " vectors " << times << " times: " ;
-//     Vc::Memory < Vc::float_v > vmem ( z * Vc::float_v::Size ) ;
-//     const Vc::Memory < Vc::float_v > cvmem = vmem ;
-//     vmem.setZero() ;
-//     Vc::float_v v7 ( .777f ) ;
-// 
-//     auto start = std::chrono::system_clock::now();
-//     for ( long t = 0 ; t < times ; t++ )
-//     {
-//       auto start = cvmem.begin() ;
-//       auto end = vmem.end() ;
-//       auto next = vmem.begin() + 1 ;
-//       while ( next < end )
-//       {
-// //         *next *= v7 * *start ;
-//         *next = *start ;
-//         ++start ;
-//         ++next ;
-//       }
-//     }
-//     auto end = std::chrono::system_clock::now();
-//     cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()  << " ms" << endl ;
-//   }
-
   cout << "testing float data, float coordinates" << endl ;
-  detail::process_image<float,float> ( argv[1] ) ;
+  process_image<float,float> ( argv[1] ) ;
 
   cout << endl << "testing double data, double coordinates" << endl ;
-  detail::process_image<double,double> ( argv[1] ) ;
+  process_image<double,double> ( argv[1] ) ;
   
   cout << "testing float data, double coordinates" << endl ;
-  detail::process_image<float,double> ( argv[1] ) ;
+  process_image<float,double> ( argv[1] ) ;
   
   cout << endl << "testing double data, float coordinates" << endl ;
-  detail::process_image<double,float> ( argv[1] ) ;
+  process_image<double,float> ( argv[1] ) ;
   
   cout << "reached end" << std::endl ;
   // oops... we hang here, failing to terminate
   
   exit ( 0 ) ;
-  
-//   typedef vigra::RGBValue<float,0,1,2> pixel_type; 
-//   typedef vigra::MultiArray<2, pixel_type> array_type ;
-//   typedef vigra::MultiArrayView<2, pixel_type> view_type ;
-//   
-//   array_type a ( 10 , 20 ) ;
-//   a = 3.0f ;
-//   view_type v ( a ) ;
-//   typedef vspline::client_traits < view_type > ct ;
-//   typename ct::manifest_type array ( ct::create_compatible ( v ) ) ;
-//   cout << array.shape() << " " << array [ vigra::Shape2 ( 3 , 3 ) ] << endl ;
-//   ct::contained_type p ;
-//   cout << p << endl ;
-//   ct::element_type e ;
-//   cout << e << endl ;
 }
diff --git a/filter.h b/filter.h
index 2b64fc6..51a8375 100644
--- a/filter.h
+++ b/filter.h
@@ -1426,7 +1426,7 @@ public:
                     const double * pole ,
                     double tolerance ,
                     bool use_vc = true ,
-                    int nslices = ncores )  ///< number of threads to use
+                    int njobs = default_njobs )  ///< number of jobs to use when multithreading
 {
   typedef typename input_array_type::value_type value_type ;
   typedef typename input_array_type::difference_type shape_type ;
@@ -1437,7 +1437,6 @@ public:
   // multithread() expects a std::function, so here we fix the type of
   // std::function we want to pass to it, create an instance of the std::function
   // and assign the routine appropriate to the task:
-  // TODO: I'd like to pass the MultiArrayViews per reference, just for consistency.
   
   typedef
   std::function < void ( range_t ,
@@ -1452,19 +1451,6 @@ public:
                          
   filter_functor_type filter_functor ;
   
-// #ifdef USE_VC
-// 
-//   if ( use_vc )
-//     filter_functor = aggregating_filter < input_array_type , output_array_type > ;
-//   else
-//     filter_functor = nonaggregating_filter < input_array_type , output_array_type > ;
-// 
-// #else
-//   
-//   filter_functor = nonaggregating_filter < input_array_type , output_array_type > ;
-//   
-// #endif
-
   auto pf = & nonaggregating_filter < input_array_type ,
                                       output_array_type ,
                                       typename input_array_type::value_type > ; // for now...
@@ -1487,7 +1473,7 @@ public:
 
   partition_t partitioning = shape_splitter < dim > :: part
     ( input.shape() ,
-      nslices ,
+      njobs ,
       axis ) ;
   
   multithread ( pf ,
@@ -1519,7 +1505,7 @@ class filter_1d < input_array_type ,
                   1 >                 // specialize for 1D
 {
 public:
-  void operator() ( input_array_type &input ,   ///< source data. can operate in-place
+  void operator() ( input_array_type &input ,    ///< source data. can operate in-place
                     output_array_type &output ,  ///< where input == output.
                     int axis ,
                     double gain ,
@@ -1528,7 +1514,7 @@ public:
                     const double * pole ,
                     double tolerance ,
                     bool use_vc = true ,
-                    int nslices = ncores )  ///< number of threads to use
+                    int njobs = default_njobs )  ///< number of jobs to use
 {
   typedef typename input_array_type::value_type value_type ;
   typedef decltype ( input.begin() ) input_iter_type ;
@@ -1541,7 +1527,7 @@ public:
 
   // if we can multithread, start out with as many lanes as the desired number of threads
 
-  int lanes = nslices ;
+  int lanes = njobs ;
   
 #ifdef USE_VC
  
@@ -1633,6 +1619,8 @@ public:
   
   // the input qualifies for fake 2D processing.
 
+//   std::cout << "fake 2D processing with " << lanes << " lanes" << std::endl ;
+  
   // we want as many chunks as we have lanes. There may be some data left
   // beyond the chunks (tail_size of value_type)
   
@@ -1751,7 +1739,7 @@ public:
       pole ,
       tolerance ,
       use_vc ,
-      nslices ) ;
+      njobs ) ;
 
   // we now have filtered data in target, but the stripes along the magin
   // in x-direction (1 runup wide) are wrong, because we applied IGNORE BC.
@@ -1788,7 +1776,7 @@ public:
 /// - pole: pointer to nbpoles doubles containing the filter poles
 /// - tolerance: acceptable error
 /// - use_vc: flag whether to use vector code or not (if present)
-/// - nslices: number of threads to use (per default as many as physical cores)
+/// - njobs: number of jobs to use when multithreading
 
 template < typename input_array_type ,  // type of array with knot point data
            typename output_array_type , // type of array for coefficients (may be the same)
@@ -1800,7 +1788,7 @@ void filter_nd ( input_array_type & input ,
                  const double * pole ,
                  double tolerance ,
                  bool use_vc = true ,
-                 int nslices = ncores )
+                 int njobs = default_njobs )
 {
   // check if operation is in-place. I assume that the test performed here
   // is sufficient to determine if the operation is in-place.
@@ -1861,7 +1849,7 @@ void filter_nd ( input_array_type & input ,
       pole ,
       tolerance ,
       use_vc ,
-      nslices ) ;
+      njobs ) ;
 
   // but if degree <= 1 we're done already, since copying the data again
   // in dimensions 1... is futile
@@ -1880,7 +1868,7 @@ void filter_nd ( input_array_type & input ,
           pole ,
           tolerance ,
           use_vc ,
-          nslices ) ;
+          njobs ) ;
   }
 }
 
diff --git a/interpolator.h b/interpolator.h
index b819a3e..e95b584 100644
--- a/interpolator.h
+++ b/interpolator.h
@@ -34,16 +34,78 @@
     \brief interface definition for interpolators
 
     The code in this file formalizes the services of an 'interpolator'.
-    In the context of vspline, this is used to present b-spline evaluators
-    to the remap routines, but the interface can accomodate other
-    interpolators as well.
-    This is meant to open up some of the remap routines to use alternative
-    interpolators, if their code isn't specific to b-splines. It might also
-    be possible to gain minimal speed improvements by handcoding stuff - I
-    handcoded nearest neighbour interpolation and the handcoded version seems
-    to run minimally faster (probably omits a futile multiplication with 1.0)
-    but it's not really a great deal.
-    Nevertheless, nice to have, no loss.
+    In vspline itself, the only use of this class is as a base class of
+    class evaluator. Class evaluator is a typical example for a
+    vspline::interpolator: it receives coordinates and yields values. But
+    of course this sort of interface can be used in many other places -
+    in fact naming it 'interpolator' isn't really apt, because it's more
+    of a 'functor yielding values at real coordinates'.
+    
+    The code for class interpolator actually originated as part of
+    class evaluator, but then I realized that what is now in class
+    interpolator is a broader concept which is useful in itself, and
+    best factored out.
+    
+    The remap routines in remap.h make use of class interpolator rather
+    than class evaluator, making them much more versatile. I found that with
+    class interpolator at hand I can implement processing pipelines which
+    go all the way from a starting point like a bunch of coordinates or
+    even only discrete coordinates into a target - to packed 8bit RGBA
+    ready to feed to the GPU. This is easily achieved by following a
+    functional programming scheme with different classes derived from
+    class interpolator. Consider a scenario where you have two processing
+    stages, where each can be expressed in terms of a class derived from
+    class interpolator.
+    
+    The first, 'innermost' class provides eval().
+    
+~~~~~~~~~~~~~~~~~~~~~
+    class A
+    : public vspline::interpolator < 2 , float , float , int >
+    {
+      typedef vspline::interpolator < 2 , float , float , int > base_type ;
+      using_interpolator_types(base_type)
+      
+      ...
+
+      void eval ( const nd_rc_type & c ,
+                  value_type & result ) const
+      {
+        // code to yield a value_type for a coordinate
+      }
+    } ;
+~~~~~~~~~~~~~~~~~~~~~
+    
+    A second, 'outer' class could look like this:
+        
+~~~~~~~~~~~~~~~~~~~~~
+    class B
+    : public vspline::interpolator < 2 , int , float , int >
+    {
+      typedef vspline::interpolator < 2 , int , float , int > base_type ;
+      using_interpolator_types(base_type)
+      
+      const A & inner ;
+      
+      B ( const class A & _inner )
+      : inner ( _inner ) {} ;
+      
+      void eval ( const nd_rc_type & c ,
+                  value_type & result ) const
+      {
+        inner::value_type x ;  // have a place for inner to deposit a value
+        inner.eval ( c , x ) ; // let inner run
+        result = some_function ( x ) ; // postprocess inner's result
+      }
+    } ;
+~~~~~~~~~~~~~~~~~~~~~
+    
+    This scheme also works for preprocessing incoming coordinates, like with
+    a coordinate transformation. In fact the whole distinction between 'coordinates'
+    and 'values' is arbitrary, the code simply produces n-dimensional output from
+    m-dimensional input, where the only limitation is that the values which
+    are handled are TinyVectors of some elementary type, and calling the dimension
+    of the 'values' 'channels' is just as arbitrary.
 */
 
 #ifndef VSPLINE_INTERPOLATOR_H
@@ -54,37 +116,78 @@
 
 namespace vspline {
 
+#ifdef USE_VC
+  
+// properly defining the simdized type of a given value_type is quite a mouthful,
+// so we use aliases here. first we define the simdized elementary type of value_type.
+// Inside a struct interpolator we have a type name handy for this type: ele_v
+// simdized_ele_type<T> is used as a template argument.
+  
+template < class value_type >
+using simdized_ele_type = typename vspline::vector_traits<value_type>::ele_v ;
+
+// next we define an aggregate of simdized elementary types constituting
+// a simdized value_type. Inside a struct interpolator, we also have a type name
+// for this type: mc_ele_v
+                          
+template < class value_type >
+using simdized_type = typename vspline::vector_traits<value_type>::type ;
+
+#endif
+
 template < int _dimension ,    // dimension of the spline
            class _value_type , // type of interpolation result
            class _rc_type ,    // singular real coordinate, like float or double
            class _ic_type ,    // singular integral coordinate, like int
-           int _vsize = vspline::vector_traits
-                       < typename vigra::ExpandElementResult < _value_type > :: type >
-                       :: vsize>       
+           int _vsize = 1 >    // number of elements in a vector/SimdArray
 struct interpolator
 {
   // fix the types we're dealing with
+  
   typedef _ic_type ic_type ;
   typedef _rc_type rc_type ;
 
+  // number of dimensions of coordinates
+
   enum { dimension = _dimension } ;
+  
+  // number of elements in simdized data
+
   enum { vsize = _vsize } ;
 
+  // unvectorized nD coordinates
+
   typedef vigra::TinyVector < rc_type , dimension > nd_rc_type ;
   typedef vigra::TinyVector < ic_type , dimension > nd_ic_type ;
   
-  typedef _value_type value_type ;  
+  // data type of interpolator's result
+
+  typedef _value_type value_type ; 
+
+  // elementary type of same
+  
   typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
 
+  // number of channels of same
+
   enum { channels = vigra::ExpandElementResult < value_type > :: size } ;
 
-  // for single coordinates we require an operator() taking a const& to a
-  // single coordinate and a reference to the place where the result should
-  // be deposited.
+  /// for single coordinates we require an eval() taking a const& to a
+  /// single coordinate and a reference to the place where the result should
+  /// be deposited.
 
   virtual void eval ( const nd_rc_type & c ,
                       value_type & result ) const = 0 ;
 
+  /// for convenience: operator() using eval()
+
+  value_type operator() ( const nd_rc_type & c ) const
+  {
+    value_type v ;
+    eval ( c , v ) ;
+    return v ;
+  }
+  
 #ifdef USE_VC
   
   // for vectorized operation, we need a few extra typedefs
@@ -93,21 +196,20 @@ struct interpolator
 
   /// a simdized type of the elementary type of value_type,
   /// which is used for coefficients and results. this is fixed via
-  /// the traits class vector_traits (in common.h)
+  /// the traits class vector_traits (in common.h). Note how we derive
+  /// this type using vsize from the template argument, not what
+  /// vspline::vector_traits deems appropriate for ele_type - though
+  /// both numbers will be the same in most cases.
   
-  typedef typename vector_traits < ele_type , vsize > :: type ele_v ;
+  typedef typename vector_traits < ele_type , vsize > :: ele_v ele_v ;
   
-  /// element count of Simd data types.
-  
-//   enum { vsize = ele_v::Size } ;
-
   /// compatible-sized simdized type of vsize ic_type
 
-  typedef typename vector_traits < ic_type , vsize > :: type ic_v ;
+  typedef typename vector_traits < ic_type , vsize > :: ele_v ic_v ;
 
   /// compatible-sized simdized type of vsize rc_type
 
-  typedef typename vector_traits < rc_type , vsize > :: type rc_v ;
+  typedef typename vector_traits < rc_type , vsize > :: ele_v rc_v ;
 
   /// multichannel value as SoA, for pixels etc.
 
@@ -121,23 +223,81 @@ struct interpolator
 
   typedef vigra::TinyVector < ic_v , dimension > nd_ic_v ;
 
-  /// require overload of operator() for simdized coordinates and values
-  /// this operator() is used in cases where the calling code already has the
-  /// simdized types handy:
+  /// SoA for multichannel indices (used for gather/scatter operations)
+  
+  typedef vigra::TinyVector < ic_v , channels > mc_ic_v ;
+  
+  /// chunk of vsize real coordinates
+  
+  typedef vigra::TinyVector < rc_type , vsize > rc_chunk_type ;
+  
+  /// chunk of vsize integral coordinates
+  
+  typedef vigra::TinyVector < ic_type , vsize > ic_chunk_type ;
+  
+  /// chunk of vsize value_type
+  
+  typedef vigra::TinyVector < value_type , vsize > value_chunk_type ;
+  
+  /// simdized evaluation routine. This routine takes the simdized coordinate
+  /// per const reference and deposits the result via a reference. This
+  /// method is pure virtual and must be implemented by derived classes.
+  /// This is the only vectorized evaluation routine which a class derived
+  /// from vspline::interpolator has to provide: the other eval variants
+  /// (below) are taking either or both of their arguments as pointers to
+  /// (interleaved) memory and (de)interleave to/from simdized types defined
+  /// inside the method. This is provided as utility code by class
+  /// vspline::interpolator and is pulled in with a using declaration
+  /// in the macro using_interpolator_types, see below.
+
+   virtual void eval ( const nd_rc_v & coordinate ,    // simdized nD coordinate
+                       mc_ele_v & result ) const = 0 ; // simdized value_type
+
+  /// here I provide an (inefficient) method to evaluate vectorized data by
+  /// delegating to the unvectorized evaluation routine. This will work, but
+  /// it's slow compared to proper vector code.
+  /// But for 'quick shots' one can use this idiom in a derived class:
+  ///
+  ///    virtual void eval ( const nd_rc_v & coordinate ,
+  ///                        mc_ele_v & result ) const
+  ///    {
+  ///      broadcast_eval ( coordinate , result ) ;
+  ///    }
+  
+  void broadcast_eval ( const nd_rc_v & coordinate ,    // simdized nD coordinate
+                        mc_ele_v & result ) const       // simdized value_type
+  {
+    nd_rc_type c ;
+    value_type v ;
+    rc_type * pc = (rc_type*)(&c) ;
+    ele_type * pv = (ele_type*)(&v) ;
 
-  virtual void v_eval ( const nd_rc_v & input ,         // simdized nD coordinate
-                        mc_ele_v & result ) const = 0 ; // simdized value_type
+    for ( int e = 0 ; e < vsize ; e++ )
+    {
+      for ( int d = 0 ; d < dimension ; d++ )
+        pc[d] = coordinate[d][e] ;
+      eval ( c , v ) ;
+      for ( int ch = 0 ; ch < channels ; ch++ )
+        result[ch][e] = pv[ch] ;
+    }
+  }
 
-  /// v_eval taking pointers to vsize coordinates and vsize values,
-  /// expecting these data to be contiguous in memory. This variant
-  /// provides (de)interleaving of the the data.
+  /// again, for convenience, operator() using eval()
 
-  void v_eval ( const nd_rc_type * const pmc , // -> vsize coordinates
-                value_type * result ) const    // -> vsize result values
+  mc_ele_v operator() ( const nd_rc_v & c ) const
   {
-    nd_rc_v cv ;
-    mc_ele_v ev ;
+    mc_ele_v v ;
+    eval ( c , v ) ;
+    return v ;
+  }
+  
+  // next we have a few methods to interface with (interleaved) memory.
 
+  /// method to load/gather interleaved memory into a simdized coordinate
+  
+  void load ( const nd_rc_type * const pmc , // -> vsize coordinates
+              nd_rc_v & cv ) const           // simdized coordinates
+  {
     if ( dimension == 1 )
     {
       cv[0].load ( (rc_type*) pmc ) ;
@@ -147,10 +307,16 @@ struct interpolator
       for ( int d = 0 ; d < dimension ; d++ )
       {
         cv[d].gather ( (rc_type*) pmc ,
-                      ic_v::IndexesFromZero() * dimension + d ) ;
+                       ic_v::IndexesFromZero() * dimension + d ) ;
       }
     }
-    v_eval ( cv , ev ) ;
+  }
+  
+  /// method to store/scatter a simdized value to interleaved memory
+
+  void store ( const mc_ele_v & ev ,       // simdized value
+               value_type * result ) const // -> vsize result values
+  {
     if ( channels == 1 )
     {
       ev[0].store ( (ele_type*) result ) ;
@@ -159,274 +325,181 @@ struct interpolator
     {
       for ( int ch = 0 ; ch < channels ; ch++ )
       {
-        ev[ch].scatter ( (ele_type*) result , ic_v::IndexesFromZero() * channels + ch ) ;
+        ev[ch].scatter ( (ele_type*) result ,
+                         ic_v::IndexesFromZero() * channels + ch ) ;
       }
     }
   } ;
 
-  /// v_eval taking a reference to a simdized coordinate and a pointer
+  /// eval taking pointers to vsize coordinates and vsize values,
+  /// expecting these data to be contiguous in memory. This variant
+  /// provides (de)interleaving of the data. This method is a helper
+  /// routine and deliberately not a pure virtual method. If a derived
+  /// class needs it, it can just use the base class' method, which in
+  /// turn will delegate to the fully simdized eval above. Having the
+  /// variants using pointers here has an advantage: derived classes
+  /// do not need to concern themselves with (de)interleaving and the
+  /// fastest method of loading/storing the data is picked automatically.
+  /// Note how the conditionals on 'dimension' and 'channels' have no
+  /// run-time cost, since both values are known at compile-time and
+  /// the conditional plus the 'dead' branch are optimized away. So we
+  /// get the fastest way of (de)interleaving automatically, as if we'd
+  /// handcoded for the specific case.
+
+  void eval ( const nd_rc_type * const pmc , // -> vsize coordinates
+              value_type * result ) const    // -> vsize result values
+  {
+    nd_rc_v cv ;
+    mc_ele_v ev ;
+
+    load ( pmc , cv ) ;
+    eval ( cv , ev ) ;
+    store ( ev , result ) ;
+  } ;
+
+  /// eval taking a reference to a simdized coordinate and a pointer
   /// to memory accommodating the result, expecting this memory to be
   /// contiguous.
 
-  void v_eval ( const nd_rc_v & cv ,         // simdized coordinate
-                value_type * result ) const  // -> vsize result values
+  void eval ( const nd_rc_v & cv ,         // simdized coordinate
+              value_type * result ) const  // -> vsize result values
   {
     mc_ele_v ev ;
-    v_eval ( cv , ev ) ;
-    if ( channels == 1 )
-    {
-      ev[0].store ( (ele_type*) result ) ;
-    }
-    else
-    {
-      for ( int ch = 0 ; ch < channels ; ch++ )
-      {
-        ev[ch].scatter ( (ele_type*) result , ic_v::IndexesFromZero() * channels + ch ) ;
-      }
-    }
+
+    eval ( cv , ev ) ;
+    store ( ev , result ) ;
   } ;
 
-// #else
-// 
-//   enum { vsize = 1 } ;
-  
-#endif
-} ;
+  /// variant taking the coordinates via a pointer and
+  /// evaluating into a simdized result passed in by reference
 
-// /// struct diversify takes a few simple functors as template arguments
-// /// and provides a wider range of functors with different, but compatible
-// /// signatures (like, pointers instead of references)
-// 
-// template < int _dimension ,    ///< dimension of coordinates
-//            class _value_type , ///< type of value which the functor produces
-//            class _rc_type ,    ///< singular real coordinate, like float or double
-//            class _ic_type ,    ///< singular integral coordinate, like int
-//            class multi_functor_type >
-// struct diversify
-// {  
-//   // fix the types we're dealing with
-//   typedef _ic_type ic_type ;
-//   typedef _rc_type rc_type ;
-// 
-//   enum { dimension = _dimension } ;
-//   typedef vigra::TinyVector < rc_type , dimension > nd_rc_type ;
-//   typedef vigra::TinyVector < ic_type , dimension > nd_ic_type ;
-//   
-//   typedef _value_type value_type ;  
-//   typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
-// 
-//   enum { channels = vigra::ExpandElementResult < value_type > :: size } ;
-// 
-//   // constructor taking a suitable object which can produce the desired behaviour
+  void eval ( const nd_rc_type * const pmc , // -> vsize coordinates
+              mc_ele_v & result ) const      // simdized result
+  {
+    nd_rc_v cv ;
+
+    load ( pmc , cv ) ;
+    eval ( cv , result ) ;
+  } ;
+
+//   /// variants loading from and/or storing to iterators. While it would be nice
+//   /// to call these variants 'eval' as well, we can't do so unless we are more
+//   /// specific; passing in the iterator types as template arguments conflicts
+//   /// with other signatures which can fit the template.
 // 
-//   multi_functor_type multi_functor ;
-//   
-//   diversify ( multi_functor_type _mf )
-//   : multi_functor ( _mf )
-//   { } ;
-//   
-//   // from the multi_functor passed to  the constructor, we can derive a host of
-//   // different functors which all ultimately do the same thing, but access the
-//   // arguments in different ways. We start out with signatures for eval which
-//   // precisely match the std::functions passed in:
-//   
-//   void eval ( const nd_rc_type & c , value_type & result ) const
+//   template < class iter_type >
+//   void load ( iter_type & source_iter , // iterator yielding coordinates
+//               nd_rc_v & cv ) const      // simdized coordinates
 //   {
-//     multi_functor.eval ( c , result ) ;
+//     rc_chunk_type rc_chunk ;
+//     
+//     for ( int i = 0 ; i < vsize ; i++ )
+//     {
+//       rc_chunk[i] = *source_iter ;
+//       source_iter++ ;
+//     }
+//    
+//     load ( (nd_rc_type*) (&rc_chunk) , cv ) ;
 //   }
 //   
-//   void eval ( const nd_ic_type & ic , const nd_rc_type & rc , value_type & result ) const
+//   template < class iter_type >
+//   void store ( const mc_ele_v & ev ,           // simdized value
+//                iter_type & target_iter ) const // iterator to store values
 //   {
-//     multi_functor.eval ( ic , rc , result ) ;
-//   }
-//   
-//   // operation on split coordinates (nD int coordinate+fractional part)
-//   // we can define this using scref2rref:
-//   
-//   typedef split_type < dimension , ic_type , rc_type > split_t ;
-// 
-//   void eval ( const split_t & c , value_type & result ) const
-//   {
-//     multi_functor.eval ( c.select , c.tune , result ) ;
-//   }
-//   
-//   // operation on compact split coordinates (offset+fractional part)
-//   // to define this we need cscref2rref.
-//   
-//   typedef compact_split_type < dimension , ic_type , rc_type > csplit_t ;
-// 
-//   void eval ( const csplit_t & c , value_type & result ) const
+//     value_chunk_type value_chunk ;
+//     
+//     store ( ev , (value_type*) (&value_chunk) ) ;
+//     
+//     for ( int i = 0 ; i < vsize ; i++ )
+//     {
+//       *target_iter = value_chunk[i] ;
+//       target_iter++ ;
+//     }
+//   } ;
+//
+//   template < class src_iter_type , class trg_iter_type >
+//   void iter_eval ( src_iter_type & src_iter , 
+//                    trg_iter_type & trg_iter ) const
 //   {
-//     multi_functor.eval ( c.select , c.tune , result ) ;
-//   }
-//   
-//   // in diversify we refrain from any notion of progression from unsplit
-//   // to split coordinates (via a mapping, as in mapping.h) and further
-//   // to the more compact offset/fractionals representation (which requires
-//   // the strides of the coefficient array, a metric outside the scope
-//   // of this object) - the sole purpose of this class is providing
-//   // additional signature types.
-//   
-//   // now we define the operation with other parameter signatures by
-//   // resorting to the stored functions we have. First using pointers:
+//     nd_rc_v cv ;
+//     mc_ele_v ev ;
 // 
-//   void eval ( const nd_rc_type * const p_c , value_type * p_result ) const
-//   {
-//     eval ( *p_c , *p_result ) ;
-//   }
-//   
-//   void eval ( const nd_ic_type * const p_ic ,
-//               const nd_rc_type * const p_rc ,
-//               value_type * p_result ) const
-//   {
-//     eval ( *p_ic , *p_rc , *p_result ) ;
-//   }
-//   
-//   void eval ( const split_t * const p_sc ,
-//               value_type * p_result ) const
-//   {
-//     eval ( p_sc->select , p_sc->tune , *p_result ) ;
-//   }
-//   
-//   void eval ( const csplit_t * const p_sc ,
-//               value_type * p_result ) const
-//   {
-//     eval ( p_sc->select , p_sc->tune , *p_result ) ;
-//   }
-//   
-//   // now variants taking only the coordinate and returning value_type&&
-//   
-//   value_type && eval ( const nd_rc_type & c ) const
-//   {
-//     value_type v ;
-//     eval ( c , v ) ;
-//     return v ;
-//   }
-//   
-//   value_type && eval ( const nd_ic_type & ic ,
-//                        const nd_rc_type & rc ) const
-//   {
-//     value_type v ;
-//     eval ( ic , rc , v ) ;
-//     return v ;
-//   }
-//   
-//   value_type && eval ( const split_t & sc ,
-//                        const nd_rc_type & rc ) const
-//   {
-//     value_type v ;
-//     eval ( sc.select , sc.tune , v ) ;
-//     return v ;
-//   }
-//   
-//   value_type && eval ( const csplit_t & sc ,
-//                        const nd_rc_type & rc ) const
-//   {
-//     value_type v ;
-//     eval ( sc.select , sc.tune , v ) ;
-//     return v ;
-//   }
-//   
-//   // just being playful :)
+//     load ( src_iter , cv ) ;
+//     eval ( cv , ev ) ;
+//     store ( ev , trg_iter ) ;
+//   } ;
 // 
-//   void eval ( const nd_rc_type * const p_c ,
-//               int i_c ,
-//               value_type * p_result ,
-//               int i_result
-//             ) const
+//   template < class trg_iter_type >
+//   void iter_eval ( const nd_rc_v & cv ,         // simdized coordinate
+//                    trg_iter_type & trg_iter ) const
 //   {
-//     eval ( p_c[i_c] , p_result[i_result] ) ;
-//   }
-//   
-// #ifdef USE_VC
-// 
-//   // for vectorized operation, we need a few extra typedefs
-//   // I use a _v suffix to indicate vectorized types and the prefixes
-//   // mc_ for multichannel and nd_ for multidimensional
-// 
-//   /// a simdized type of the elementary type of value_type,
-//   /// which is used for coefficients and results. this is fixed via
-//   /// the traits class vector_traits (in common.h)
-//   
-//   typedef typename vector_traits < ele_type > :: type ele_v ;
-//   
-//   /// element count of Simd data types.
-//   
-//   enum { vsize = ele_v::Size } ;
-// 
-//   /// compatible-sized simdized type of vsize ic_type
-// 
-//   typedef typename vector_traits < ic_type , vsize > :: type ic_v ;
-// 
-//   /// compatible-sized simdized type of vsize rc_type
-// 
-//   typedef typename vector_traits < rc_type , vsize > :: type rc_v ;
-// 
-//   /// multichannel value as SoA, for pixels etc.
-// 
-//   typedef vigra::TinyVector < ele_v , channels > mc_ele_v ;
-// 
-//   /// SoA for nD coordinates/components
-// 
-//   typedef vigra::TinyVector < rc_v , dimension > nd_rc_v ;
-// 
-//   /// SoA for nD shapes (or multidimensional indices)
-// 
-//   typedef vigra::TinyVector < ic_v , dimension > nd_ic_v ;
+//     mc_ele_v ev ;
 // 
-//   /// SoA for multichannel indices (used for gather/scatter operations)
-// 
-//   typedef vigra::TinyVector < ic_v , channels >  mc_ic_v ;
-// 
-//   // we start out with vectorized routines directly matching v_eval
-//   // methods of the multi_functor, first the variant taking references:
-// 
-//   void v_eval ( const nd_rc_v & c ,       // simdized nD coordinate
-//                 mc_ele_v & result ) const // simdized value_type
-//   {
-//     multi_functor.v_eval ( c , result ) ;
+//     eval ( cv , ev ) ;
+//     store ( ev , trg_iter ) ;
 //   } ;
 // 
-//   /// overload of v_eval taking pointers to vsize coordinates and
-//   /// vsize values, expecting these data to be contiguous in memory
-// 
-//   void v_eval ( const nd_rc_type * const pmc ,  // pointer to vsize muli_coordinates
-//                 value_type * p_result ) const     // pointer to vsize result values
+//   template < class src_iter_type >
+//   void iter_eval ( src_iter_type & src_iter ,
+//                    mc_ele_v & result ) const      // simdized result
 //   {
-//     multi_functor.v_eval ( pmc , p_result ) ;
-//   } ;
-// 
-//   // TODO: now here we can go on defining signatures:
-//   
-// #endif
-// } ;
-
-// // now we can define nearest_neighbour as a class derived from diversify,
-// // which gives a host of different eval() and v_eval() methods which are
-// // all implemented in terms of a few basic eval() and v_eval() methods
-// // provided by the 'core' type.
-// // The idea is to do the same with the b-spline evaluator, concentrating in
-// // class evaluator on the 'core' methods and leaving the diversification to
-// // the diversify inheritance.
+//     nd_rc_v cv ;
 // 
-// template < int _dimension ,
-//            class _value_type ,
-//            typename _rc_type = float ,
-//            typename _ic_type = int >
-// struct nearest_neighbour_d
-// : public diversify < _dimension , _value_type , _rc_type , _ic_type ,
-//                      nearest_neighbour < _dimension , _value_type , _rc_type , _ic_type >
-//                    >
-// {
-//   typedef nearest_neighbour < _dimension , _value_type , _rc_type , _ic_type > core_t ;
-//   typedef diversify < _dimension , _value_type , _rc_type , _ic_type , core_t > div_t ;
-//   typedef vigra::MultiArrayView < _dimension , _value_type > substrate_type ;
-// 
-//   nearest_neighbour_d ( const substrate_type & substrate )
-//   : div_t ( core_t ( substrate ) )
-//   { } ;
-//   
-// } ;
+//     load ( src_iter , cv ) ;
+//     eval ( cv , result ) ;
+//   } ;
+
+#endif
+
+} ;
+
+/// since we use the types in class interpolator frequently, there are
+/// macros to pull all the types of class interpolator into a class derived
+/// from it. classes which want to use these macros need to invoke them
+/// with the base class as their argument.
+/// for unvectorized operation we have:
+
+#define using_singular_interpolator_types(base_type) \
+  using typename base_type::ic_type ;         \
+  using typename base_type::rc_type ;         \
+  using typename base_type::nd_ic_type ;      \
+  using typename base_type::nd_rc_type ;      \
+  using typename base_type::value_type ;      \
+  using typename base_type::ele_type ;        \
+  enum { dimension = base_type::dimension } ; \
+  enum { level = dimension - 1 } ;            \
+  enum { channels = base_type::channels } ;   \
+  using base_type::operator() ;
+
+/// for vectorized operation there are a few more types to pull in, and we also
+/// pull in those eval variants which aren't pure virtual in the base class
+/// with a using declaration.
+
+#define using_simdized_interpolator_types(base_type) \
+  using typename base_type::ele_v ;        \
+  using typename base_type::ic_v ;         \
+  using typename base_type::rc_v ;         \
+  using typename base_type::mc_ic_v ;      \
+  using typename base_type::nd_ic_v ;      \
+  using typename base_type::nd_rc_v ;      \
+  using typename base_type::mc_ele_v ;     \
+  using base_type::broadcast_eval ;        \
+  using base_type::eval ;
+  
+/// finally a macro automatically pulling in the proper set of type names
+/// depending on USE_VC. This is the macro used throughout. Here, vsize is also
+/// fixed to the base class's value - or to 1 if USE_VC isn't defined.
+
+#ifdef USE_VC
+#define using_interpolator_types(base_type)    \
+  enum { vsize = base_type::vsize } ;          \
+  using_singular_interpolator_types(base_type) \
+  using_simdized_interpolator_types(base_type) 
+#else  
+#define using_interpolator_types(base_type)    \
+  enum { vsize = 1 } ;                         \
+  using_singular_interpolator_types(base_type) 
+#endif
 
 } ; // end of namespace vspline
 
diff --git a/mapping.h b/mapping.h
index e77509f..7bcc05b 100644
--- a/mapping.h
+++ b/mapping.h
@@ -152,7 +152,7 @@ template <typename rc_v>
 rc_v v_fmod ( const rc_v& lhs ,
               const typename rc_v::EntryType rhs )
 {
-  typedef typename vector_traits < int , rc_v::Size > :: type ic_v ;
+  typedef typename vector_traits < int , rc_v::Size > :: ele_v ic_v ;
 
   ic_v iv ( lhs / rhs ) ;
   return lhs - rhs * rc_v ( iv ) ;
@@ -208,8 +208,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -242,8 +242,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -297,8 +297,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -344,8 +344,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -388,8 +388,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -439,8 +439,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -506,8 +506,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -578,8 +578,8 @@ public:
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   void operator() ( rc_v v , ic_v & iv , rc_v & fv ) const
   {
@@ -631,8 +631,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   void operator() ( rc_v v , ic_v & iv , rc_v & fv ) const
   {
@@ -709,8 +709,8 @@ public:
 
   // vectorized version of operator()
   
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -774,8 +774,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -807,7 +807,7 @@ public:
 /// Which fits nicely with an even spline, which has a wider defined range due to
 /// the smaller support. In that case we could possibly even save ourselves the last coefficient.
 /// The first period finishes one unit spacing beyond the location of the last knot
-/// point, so if the spline is conclassed over N values, the mapping has to be conclassed
+/// point, so if the spline is constructed over N values, the mapping has to be constructed
 /// with parameter M = N + 1. The bracing applied with the periodic bracer makes sure that
 /// coordinates up to M can be processed.
 
@@ -837,8 +837,8 @@ public:
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -883,8 +883,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -938,8 +938,8 @@ public:
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -971,8 +971,8 @@ public:
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -1016,8 +1016,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -1140,14 +1140,6 @@ map_func pick_mapping ( bc_code bc ,        ///< mapping mode
         return odd_mapping_reflect < ic_t , rc_t , vsize > ( M ) ;
         break ;
       }
-      case RRAW :
-      {
-        // like REFLECT, but doesn't apply the mirroring. the only difference
-        // to odd_mapping_raw is the shift of the input by +1, to account for the
-        // widened brace used with REFLECT boundary conditions
-        return odd_mapping_reflect_raw < ic_t , rc_t , vsize > () ;
-        break ;
-      }
       case PERIODIC :
       {
         return odd_mapping_periodic < ic_t , rc_t , vsize > ( M + 1 ) ;
@@ -1222,13 +1214,6 @@ map_func pick_mapping ( bc_code bc ,        ///< mapping mode
         return even_mapping_reflect < ic_t , rc_t , vsize > ( M ) ;
         break ;
       }
-      case RRAW :
-      {
-        // like REFLECT, but doesn't apply the mirroring. for even splines,
-        // this is the same as even_mapping_raw.
-        return even_mapping_raw < ic_t , rc_t , vsize > () ;
-        break ;
-      }
       case PERIODIC :
       {
         return even_mapping_periodic < ic_t , rc_t , vsize > ( M + 1 ) ;
@@ -1255,8 +1240,8 @@ struct mapping
   map_func map ;
 
 #ifdef USE_VC
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
   typedef std::function < void ( rc_v , ic_v & , rc_v & ) > map_func_v ;
   map_func_v map_v ;
 #endif
@@ -1293,12 +1278,12 @@ struct nd_mapping
 
   typedef vigra::TinyVector < rc_t , dimension > nd_rc_t ;
   typedef vigra::TinyVector < ic_t , dimension > nd_ic_t ;
-  typedef split_type < dimension , ic_t , rc_t > split_t ;
+//   typedef split_type < dimension , ic_t , rc_t > split_t ;
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: ele_v ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: ele_v rc_v ;
   typedef vigra::TinyVector < ic_v , dimension > nd_ic_v ;
   typedef vigra::TinyVector < rc_v , dimension > nd_rc_v ;
 
@@ -1326,18 +1311,6 @@ struct nd_mapping
       map [ d ] = mapping_type ( bcv[d] , spline_degree , ic_t ( shape[d] ) ) ;
   }
   
-//   nd_mapping()
-//   { } ;
-  
-//   /// convenience variant taking a single boundary condition code, which is used for all axes
-//   
-//   nd_mapping ( const bc_code&    bc ,
-//                const int&        _spline_degree ,
-//                const nd_ic_t&    _shape  )
-//   : nd_mapping ( bcv_type ( bc ) , spline_degree , _shape )
-//   {
-//   } ;
-  
   /// apply the mapping along axis 'axis' to coordinate v, resulting in the setting
   /// of the integral part iv and the fractional part fv
   
@@ -1355,21 +1328,6 @@ struct nd_mapping
       map [ axis ] . map ( v[axis] , iv[axis] , fv[axis] ) ;
   }
 
-  /// different signature, now we handle a split_type object as the operation's target
-
-  void operator() ( rc_t v , split_t& s , const int & axis ) const
-  {
-    map [ axis ] . map ( v , s.select[axis] , s.tune[axis] ) ;
-  }
-  
-  /// the same in nD
-  
-  void operator() ( nd_rc_t v , split_t& s ) const
-  {
-    for ( int axis = 0 ; axis < dimension ; axis++ )
-      map [ axis ] . map ( v[axis] , s.select[axis] , s.tune[axis] ) ;
-  }
-  
 #ifdef USE_VC
 
   /// finally, operation on Vc vectors of incoming coordinates. Again, first the
@@ -1412,8 +1370,8 @@ struct nd_mapping
 //       m.map ( v , i , f ) ;
 //       std::cout << "in: " << v << " -> i " << i << " f " << f << std::endl ;
 // #ifdef USE_VC
-//       typedef typename vector_traits < int , vsize > :: type ic_v ;
-//       typedef typename vector_traits < float , vsize > :: type rc_v ;
+//       typedef typename vector_traits < int , vsize > :: ele_v ic_v ;
+//       typedef typename vector_traits < float , vsize > :: ele_v rc_v ;
 //       rc_v vv ( v ) ;
 //       ic_v iv ;
 //       rc_v fv ;
diff --git a/multithread.h b/multithread.h
index 4e213d0..d32d151 100644
--- a/multithread.h
+++ b/multithread.h
@@ -111,6 +111,13 @@
 
 namespace vspline
 {
+/// when multithreading, use this number of jobs per default. This is
+/// an attempt at a compromise: too many jobs will produce too much overhead,
+/// too few will not distribute the load well and make the system vulnerable
+/// to 'straggling' threads
+
+const int default_njobs = 8 * ncores ;
+
 /// given limit_type, we define range_type as a TinyVector of two limit_types,
 /// the first denoting the beginning of the range and the second it's end, with
 /// end being outside of the range.
@@ -198,7 +205,7 @@ struct shape_splitter
   typedef partition_type < range_t > partition_t ;
   
   static partition_t part ( const shape_t & shape , ///< shape to be split n-ways
-                            int n ,                 ///< intended number of chunks
+                            int n = default_njobs , ///< intended number of chunks
                             int forbid = -1 )       ///< axis which shouldn't be split
   {
     partition_t res ;
@@ -305,7 +312,8 @@ struct shape_splitter
 
 template < int d >
 partition_type < shape_range_type<d> >
-partition ( shape_range_type<d> range , int nparts )
+partition ( shape_range_type<d> range ,
+            int nparts = default_njobs )
 {
   auto shape = range[1] - range[0] ;
   int nelements = prod ( shape ) ;
diff --git a/prefilter.h b/prefilter.h
index b92d653..1bce683 100644
--- a/prefilter.h
+++ b/prefilter.h
@@ -178,7 +178,7 @@ void solve ( input_array_type & input ,
              double tolerance ,
              double smoothing = 0.0 ,
              bool use_vc = true ,
-             int nslices = ncores * 8 )
+             int njobs = default_njobs )
 {
   if ( smoothing != 0.0 )
   {
@@ -196,7 +196,7 @@ void solve ( input_array_type & input ,
                 pole ,
                 tolerance ,
                 use_vc ,
-                nslices ) ;
+                njobs ) ;
   }
   else
     filter_nd < input_array_type , output_array_type , math_type >
@@ -207,7 +207,7 @@ void solve ( input_array_type & input ,
                 vspline_constants::precomputed_poles [ degree ] ,
                 tolerance ,
                 use_vc ,
-                nslices ) ;
+                njobs ) ;
 }
 
 /// An interlude: restoration of the original knot point data from the spline coefficients.
diff --git a/remap.h b/remap.h
index f72761b..d31e1b6 100644
--- a/remap.h
+++ b/remap.h
@@ -54,7 +54,7 @@
 ///
 /// all remap routines take two template arguments:
 ///
-/// - interpolator_type: functor object yielding values for coordinates
+/// - unary_functor_type: functor object yielding values for coordinates
 /// - dim_target:        number of dimensions of output array
 ///
 /// remaps to other-dimensional objects are supported. This makes it possible to,
@@ -74,7 +74,7 @@
 ///
 /// There is also a second set of remap functions in this file, which don't take a
 /// 'warp' array. Instead, for every target location, the location's discrete coordinates
-/// are passed to the interpolator_type object. This way, transformation-based remaps
+/// are passed to the unary_functor_type object. This way, transformation-based remaps
 /// can be implemented easily: the user code just has to provide a suitable 'interpolator'
 /// to yield values for coordinates. This interpolator will internally take the discrete
 /// incoming coordinates (into the target array) and transform them as required, internally
@@ -92,6 +92,10 @@
 /// to produce the result values in scan order of the target array. While the two remap routines
 /// and grid_eval should cover most use cases, it's quite possible to use the routine fill()
 /// itself, passing in a suitable functor.
+///
+/// While the code presented here is quite involved and there are several types and routines
+/// the use(fulness) of which isn't immediately apparent, most use cases will be able to get
+/// by using only remap() or index_remap().
 
 #ifndef VSPLINE_REMAP_H
 #define VSPLINE_REMAP_H
@@ -136,6 +140,8 @@ using bcv_type = vigra::TinyVector < bc_code , dimension > ;
 // with successive single indices generated by vigra, and using vigra's facilities
 // also simplifies matters, see the use in filter.h, ele_aggregating_filter.
 
+#ifdef USE_VC
+
 template < class rc_type , class ic_type , int _dimension , int vsize >
 struct coordinate_iterator_v
 {
@@ -211,6 +217,8 @@ public:
   } ;
 } ;
 
+#endif
+
 // currently unused variant, using vigra's MultiCoordinateIterator. seems to perform the same.
 
 // template < class _rc_v , int _dimension >
@@ -561,23 +569,23 @@ void fill ( generator_type & gen ,
 /// values. It can be fed to fill() to produce a remap function, see below.
 
 template < int dimension ,
-           typename interpolator_type ,
+           typename unary_functor_type ,
            bool strided_warp >
 struct warp_generator
 {
-  typedef typename interpolator_type::nd_rc_type nd_rc_type ;
-  typedef typename interpolator_type::value_type value_type ;
+  typedef typename unary_functor_type::in_type nd_rc_type ;
+  typedef typename unary_functor_type::out_type value_type ;
   
   typedef MultiArrayView < dimension , nd_rc_type > warp_array_type ;
   
   const warp_array_type warp ; // must not use reference here!
   typename warp_array_type::const_iterator witer ;
   
-  const interpolator_type & itp ;
+  const unary_functor_type & itp ;
 
   warp_generator
     ( const warp_array_type & _warp ,
-      const interpolator_type & _itp )
+      const unary_functor_type & _itp )
   : warp ( _warp ) ,
     itp ( _itp ) ,
     witer ( _warp.begin() )
@@ -591,8 +599,7 @@ struct warp_generator
 
 #ifdef USE_VC
 
-  typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
-  enum { vsize = vector_traits < ele_type > :: vsize } ;
+  enum { vsize = unary_functor_type :: vsize } ;
 
   // operator() incorporates two variants, which depend on a template argument,
   // so the conditional has no run-time effect. We use witer to stay consistent
@@ -605,28 +612,28 @@ struct warp_generator
       nd_rc_type buffer [ vsize ] ;
       for ( int e = 0 ; e < vsize ; e++ , witer++ )
         buffer[e] = *witer ;
-      itp.v_eval ( buffer , target ) ;
+      itp.eval ( buffer , target ) ;
     }
     else
     {
-      itp.v_eval ( &(*witer) , target ) ;
+      itp.eval ( &(*witer) , target ) ;
       witer += vsize ;
     }
   }
 
 #endif
 
-  warp_generator < dimension , interpolator_type , strided_warp >
+  warp_generator < dimension , unary_functor_type , strided_warp >
     subrange ( const shape_range_type < dimension > & range ) const
   {
-    return warp_generator < dimension , interpolator_type , strided_warp >
+    return warp_generator < dimension , unary_functor_type , strided_warp >
              ( warp.subarray ( range[0] , range[1] ) , itp ) ;
   }
   
-  warp_generator < dimension - 1 , interpolator_type , strided_warp >
+  warp_generator < dimension - 1 , unary_functor_type , strided_warp >
     bindOuter ( const int & c ) const
   {
-    return warp_generator < dimension - 1 , interpolator_type , strided_warp >
+    return warp_generator < dimension - 1 , unary_functor_type , strided_warp >
              ( warp.bindOuter ( c ) , itp ) ;
   }  
 } ;
@@ -636,17 +643,51 @@ struct warp_generator
 /// has a fast variant which requires the whole warp array to be unstrided, a requirement
 /// which can usually be fulfilled, since warp arrays tend to be made just for the purpose
 /// of a remap. Picking of the correct variant is done here.
+///
+/// remap takes two template arguments:
+///
+/// - 'unary_functor_type', which is a class satisfying the interface laid down in
+///   unary_functor.h. This is an object which can provide values given coordinates,
+///   like class evaluator, but generalized to allow for arbitrary ways of achieving
+///   it's goal
+///
+/// - 'dim_target' - the number of dimensions of the target array. While the number of
+///   dimensions of the source data is apparent from the unary_functor_type object, the
+///   target array's dimensionality may be different, like when picking 2D slices from
+///   a volume.
+///
+/// remap takes four parameters:
+///
+/// - a reference to a const unary_functor_type object providing the functionality needed
+///   to generate values from coordinates
+///
+/// - a reference to a const MultiArrayView holding coordinates to feed to the unary_functor_type
+///   object. I use the term 'warp array' for this array. It has to have the same shape as
+///   the target array.
+///
+/// - a reference to a MultiArrayView to use as a target. This is where the resulting
+///   data are put.
+///
+/// - a boolean flag 'use_vc' which can be set to false to switch off the use of vector
+///   code in compiles which have it activated
 
-template < typename interpolator_type  , // functor object yielding values for coordinates
-           int dim_target >                 // number of dimensions of output array
-void remap ( interpolator_type & ev ,
+template < typename unary_functor_type  , // functor object yielding values for coordinates
+           int dim_target >              // number of dimensions of output array
+void remap ( const unary_functor_type & ev ,
              const MultiArrayView < dim_target ,
-                                    typename interpolator_type::nd_rc_type > & warp ,
+                                    typename unary_functor_type::in_type > & warp ,
              MultiArrayView < dim_target ,
-                              typename interpolator_type::value_type > & output ,
+                              typename unary_functor_type::out_type > & output ,
              bool use_vc = true
            )
 {
+  // check shape compatibility
+  
+  if ( output.shape() != warp.shape() )
+  {
+    throw shape_mismatch ( "remap: the shapes of the warp array and the output array do not match" ) ;
+  }
+
   // we test if the warp array is unstrided in dimension 0. If that is so, even
   // if it is strided in higher dimensions, via the hierarchical access we will
   // eventually arrive in dimension 0 and iterate over an unstrided array.
@@ -656,14 +697,14 @@ void remap ( interpolator_type & ev ,
   if ( warp.isUnstrided ( 0 ) )
   {
     //                                set strided_warp to false vvv
-    typedef warp_generator < dim_target , interpolator_type , false > gen_t ;  
+    typedef warp_generator < dim_target , unary_functor_type , false > gen_t ;  
     gen_t gen ( warp , ev ) ;  
     fill < gen_t , dim_target > ( gen , output , use_vc ) ;
   }
   else
   {
     // warp array is strided even in dimension 0
-    typedef warp_generator < dim_target , interpolator_type , true > gen_t ;  
+    typedef warp_generator < dim_target , unary_functor_type , true > gen_t ;  
     gen_t gen ( warp , ev ) ;  
     fill < gen_t , dim_target > ( gen , output , use_vc ) ;
   }
@@ -677,16 +718,16 @@ template < typename coordinate_type , // type of coordinates in the warp array
            typename value_type ,      // type of values to produce
            int dim_out >              // number of dimensions of output array
 int remap ( const MultiArrayView < coordinate_traits < coordinate_type > :: dimension ,
-                             value_type > input ,
-            const MultiArrayView < dim_out , coordinate_type > & warp ,
-            MultiArrayView < dim_out , value_type > & output ,
+                                   value_type > & input ,
+            const MultiArrayView < dim_out ,
+                                   coordinate_type > & warp ,
+            MultiArrayView < dim_out ,
+                             value_type > & output ,
             bcv_type < coordinate_traits < coordinate_type > :: dimension > bcv
               = bcv_type < coordinate_traits < coordinate_type > :: dimension >
                 ( MIRROR ) ,
             int degree = 3 ,
-            bool use_vc = true ,
-            int nthreads = ncores
-          )
+            bool use_vc = true )
 {
   const int dim_in = coordinate_traits < coordinate_type > :: dimension ;
   typedef typename coordinate_traits < coordinate_type > :: rc_type rc_type ;
@@ -706,7 +747,7 @@ int remap ( const MultiArrayView < coordinate_traits < coordinate_type > :: dime
   
   // prefilter, taking data in 'input' as knot point data
   
-  bsp.prefilter ( use_vc , nthreads * 8 , input ) ;
+  bsp.prefilter ( use_vc , ncores * 8 , input ) ;
 
   // create an evaluator over the bspline
 
@@ -716,32 +757,28 @@ int remap ( const MultiArrayView < coordinate_traits < coordinate_type > :: dime
   // and call the other remap variant,
   // passing in the evaluator, the coordinate array and the target array
   
-  remap < evaluator_type , dim_out > ( ev , warp , output , use_vc , nthreads ) ;
+  remap < evaluator_type , dim_out > ( ev , warp , output , use_vc ) ;
     
   return 0 ;
 }
 
 /// index_generator is like warp_generator, but instead of picking coordinates from
 /// an array, the corresponding discrete coordinates are passed to the interpolator.
-/// Instead the interpolator is directly fed discrete target coordinates, and it's up
 /// This is a convenience
 /// routine, because it delivers the logic of both coordinate generation and value
 /// storage, in several threads (due to the use of fill) - and leaves the interesting
-/// work to be done in the 'interpolator'.
-/// In a way calling it 'interpolator' is a bit off since it does much more than merely
-/// interpolating at a given position: it performs the entire processing chain from
-/// discrete target coordinate to final result.
+/// work to be done in the unary_functor.
 
-template < int dim_target , typename interpolator_type >
+template < int dim_target , typename unary_functor_type >
 struct index_generator
 {
-  typedef typename interpolator_type::value_type value_type ;
+  typedef typename unary_functor_type::out_type value_type ;
 
-  enum { dim_source = interpolator_type::dimension } ;
+  enum { dim_source = unary_functor_type::dim_in } ;
   
 #ifdef USE_VC
 
-  enum { vsize = interpolator_type :: vsize } ;
+  enum { vsize = unary_functor_type :: vsize } ;
 
 #else
   
@@ -749,17 +786,12 @@ struct index_generator
 
 #endif
   
-  typedef typename interpolator_type::ic_type ic_type ;
-  typedef typename interpolator_type::rc_type rc_type ;
-  typedef typename interpolator_type::nd_ic_type nd_ic_type ;
-  typedef typename interpolator_type::nd_rc_type nd_rc_type ;
-
-  const interpolator_type & itp ;
+  const unary_functor_type & itp ;
   const shape_range_type < dim_target > range ;
   vigra::MultiCoordinateIterator < dim_source > mci ;
   
   index_generator
-    ( const interpolator_type & _itp ,
+    ( const unary_functor_type & _itp ,
       const shape_range_type < dim_target > _range
     )
   : itp ( _itp ) ,
@@ -775,9 +807,12 @@ struct index_generator
 
 #ifdef USE_VC
   
+  typedef typename vigra::TinyVector < typename unary_functor_type::ic_v ,
+                                       dim_source > nd_ic_v ;
+
   void operator() ( value_type * target )
   {
-    typename interpolator_type::nd_ic_v index ;
+    nd_ic_v index ;
     for ( int e = 0 ; e < vsize ; e++ )
     {
       for ( int d = 0 ; d < dim_source ; d++ )
@@ -786,41 +821,41 @@ struct index_generator
       }
       ++mci ;
     }
-    itp.v_eval ( index , target ) ;
+    itp.eval ( index , target ) ;
   }
 
 #endif
 
-  index_generator < dim_target , interpolator_type >
+  index_generator < dim_target , unary_functor_type >
     subrange ( const shape_range_type < dim_target > range ) const
   {
-    return index_generator < dim_target , interpolator_type >
+    return index_generator < dim_target , unary_functor_type >
              ( itp , range ) ;
   }
   
-  index_generator < dim_target , interpolator_type >
+  index_generator < dim_target , unary_functor_type >
     bindOuter ( const int & c ) const
   {
-    nd_ic_type slice_start = range[0] , slice_end = range[1] ;
+    auto slice_start = range[0] , slice_end = range[1] ;
 
     slice_start [ dim_target - 1 ] += c ;
     slice_end [ dim_target - 1 ] = slice_start [ dim_target - 1 ] + 1 ;
     
-    return index_generator < dim_target , interpolator_type >
+    return index_generator < dim_target , unary_functor_type >
              ( itp , shape_range_type < dim_target > ( slice_start , slice_end ) ) ;
   }  
 } ;
 
-// template < int dim_target , typename interpolator_type >
+// template < int dim_target , typename unary_functor_type >
 // struct index_generator
 // {
-//   typedef typename interpolator_type::value_type value_type ;
+//   typedef typename unary_functor_type::value_type value_type ;
 // 
-//   enum { dim_source = interpolator_type::dimension } ;
+//   enum { dim_source = unary_functor_type::dimension } ;
 //   
 // #ifdef USE_VC
 // 
-//   enum { vsize = interpolator_type :: vsize } ;
+//   enum { vsize = unary_functor_type :: vsize } ;
 // 
 // #else
 //   
@@ -828,12 +863,12 @@ struct index_generator
 // 
 // #endif
 //   
-//   typedef typename interpolator_type::ic_type ic_type ;
-//   typedef typename interpolator_type::rc_type rc_type ;
-//   typedef typename interpolator_type::nd_ic_type nd_ic_type ;
-//   typedef typename interpolator_type::nd_rc_type nd_rc_type ;
+//   typedef typename unary_functor_type::ic_type ic_type ;
+//   typedef typename unary_functor_type::rc_type rc_type ;
+//   typedef typename unary_functor_type::nd_ic_type nd_ic_type ;
+//   typedef typename unary_functor_type::nd_rc_type nd_rc_type ;
 // 
-//   const interpolator_type & itp ;
+//   const unary_functor_type & itp ;
 //   const shape_range_type < dim_target > range ;
 //   
 //   vigra::MultiCoordinateIterator < dim_source > mci ;
@@ -845,7 +880,7 @@ struct index_generator
 // #endif
 //   
 //   index_generator
-//     ( const interpolator_type & _itp ,
+//     ( const unary_functor_type & _itp ,
 //       const shape_range_type < dim_target > _range
 //     )
 //   : itp ( _itp ) ,
@@ -867,21 +902,21 @@ struct index_generator
 //   
 //   void operator() ( value_type * target )
 //   {
-//     itp.v_eval ( *civ , target ) ;
+//     itp.eval ( *civ , target ) ;
 //     ++civ ;
 //     mci += vsize ; // this sucks...
 //   }
 // 
 // #endif
 // 
-//   index_generator < dim_target , interpolator_type >
+//   index_generator < dim_target , unary_functor_type >
 //     subrange ( const shape_range_type < dim_target > range ) const
 //   {
-//     return index_generator < dim_target , interpolator_type >
+//     return index_generator < dim_target , unary_functor_type >
 //              ( itp , range ) ;
 //   }
 //   
-//   index_generator < dim_target , interpolator_type >
+//   index_generator < dim_target , unary_functor_type >
 //     bindOuter ( const int & c ) const
 //   {
 //     nd_ic_type slice_start = range[0] , slice_end = range[1] ;
@@ -889,22 +924,61 @@ struct index_generator
 //     slice_start [ dim_target - 1 ] += c ;
 //     slice_end [ dim_target - 1 ] = slice_start [ dim_target - 1 ] + 1 ;
 //     
-//     return index_generator < dim_target , interpolator_type >
+//     return index_generator < dim_target , unary_functor_type >
 //              ( itp , shape_range_type < dim_target > ( slice_start , slice_end ) ) ;
 //   }  
 // } ;
 
-template < class interpolator_type , // type satisfying the interface in class interpolator
+/// index_remap() is very similar to remap(), but while remap() picks coordinates from
+/// an array (the 'warp' array), index_remap feeds the discrete coordinates of the
+/// successive places data should be rendered to into the unary_functor_type object.
+/// So here we need a different type of unary_functor_type object: in remap(), we would use
+/// one which takes nD real coordinates, here we use one which takes nD discrete
+/// coordinates. Typically the unary_functor_type object we use here will therefore have
+/// some stage of converting the incoming discrete coordinates to 'interesting' real
+/// coordinates. If, for example, incoming coordinates are like
+/// (0,0), (1,0), (2,0) ...
+/// the unary_functor_type object might, in a first step, use a real factor on them to
+/// produce real coordinates like
+/// (0.0,0.0), (0.1,0.0), (0.2,0.0)...
+/// which in turn are used to perform some evaluation, yielding target data
+/// (R,G,B), (R,G,B),...
+/// which might be taken up again by index_remap to be stored in the target.
+///
+/// index_remap takes two template arguments:
+///
+/// - 'unary_functor_type', which is a class satisfying the interface laid down in
+///   unary_functor.h. This is an object which can provide values given coordinates,
+///   like class evaluator, but generalized to allow for arbitrary ways of achieving
+///   its goal
+///
+/// - 'dim_target' - the number of dimensions of the target array. While the number of
+///   dimensions of the source data is apparent from the unary_functor_type object, the
+///   target array's dimensionality may be different, like when picking 2D slices from
+///   a volume.
+///
+/// index_remap takes three parameters:
+///
+/// - a reference to a const unary_functor_type object providing the functionality needed
+///   to generate values from coordinates
+///
+/// - a reference to a MultiArrayView to use as a target. This is where the resulting
+///   data are put.
+///
+/// - a boolean flag 'use_vc' which can be set to false to switch off the use of vector
+///   code in compiles which have it activated
+
+template < class unary_functor_type , // type satisfying the interface in class unary_functor
            int dim_target >          // dimension of target array
-void index_remap ( const interpolator_type & ev ,
+void index_remap ( const unary_functor_type & ev ,
                    MultiArrayView < dim_target ,
-                                    typename interpolator_type::value_type > & output ,
+                                    typename unary_functor_type::out_type > & output ,
                    bool use_vc = true           
                 )
 {
-  typedef typename interpolator_type::value_type value_type ;
+  typedef typename unary_functor_type::out_type value_type ;
   typedef TinyVector < int , dim_target > nd_ic_type ;
-  typedef index_generator < dim_target , interpolator_type > gen_t ;
+  typedef index_generator < dim_target , unary_functor_type > gen_t ;
 
   shape_range_type < dim_target > range ( nd_ic_type() , output.shape() ) ;  
   gen_t gen ( ev , range ) ;  
@@ -921,7 +995,7 @@ namespace detail // workhorse code for grid_eval
 // offsets for all dimensions yields the offset into the coefficient array
 // to the window of coefficients where the weights are to be applied.
 
-template < typename interpolator_type , // type offering eval()
+template < typename unary_functor_type , // type offering eval()
            typename target_type ,       // iterates over target array
            typename weight_type ,       // singular weight data type
            int level >                  // current axis
@@ -932,7 +1006,7 @@ struct _grid_eval
                     weight_type** const & grid_weight ,
                     const int & ORDER ,
                     int ** const & grid_ofs ,
-                    const interpolator_type & itp ,
+                    const unary_functor_type & itp ,
                     target_type & result )
   {
     for ( int ofs = 0 ; ofs < result.shape ( level ) ; ofs++ )
@@ -941,16 +1015,16 @@ struct _grid_eval
         weight [ vigra::Shape2 ( e , level ) ] = grid_weight [ level ] [ ORDER * ofs + e ] ;
       int cum_ofs = initial_ofs + grid_ofs [ level ] [ ofs ] ;
       auto region = result.bindAt ( level , ofs ) ;
-      _grid_eval < interpolator_type , decltype ( region ) , weight_type , level-1 >()
+      _grid_eval < unary_functor_type , decltype ( region ) , weight_type , level-1 >()
         ( cum_ofs , weight , grid_weight , ORDER , grid_ofs , itp , region ) ;
     }
   }
 } ;
 
-template < typename interpolator_type ,
+template < typename unary_functor_type ,
            typename target_type ,
            typename weight_type >
-struct _grid_eval < interpolator_type ,
+struct _grid_eval < unary_functor_type ,
                     target_type ,
                     weight_type ,
                     0 >
@@ -960,7 +1034,7 @@ struct _grid_eval < interpolator_type ,
                     weight_type** const & grid_weight ,
                     const int & ORDER ,
                     int ** const & grid_ofs ,
-                    const interpolator_type & itp ,
+                    const unary_functor_type & itp ,
                     target_type & region )
   {
     auto iter = region.begin() ;    
@@ -972,13 +1046,13 @@ struct _grid_eval < interpolator_type ,
     // than the unvectorized code. With g++, the vectorized code is faster
     // than either clang version, but the unvectorized code is much slower.
 
-    const int vsize = interpolator_type::vsize ;
-    const int channels = interpolator_type::channels ;
-    typedef typename interpolator_type::value_type value_type ;
-    typedef typename interpolator_type::ele_type ele_type ;
-    typedef typename interpolator_type::ic_v ic_v ;
-    typedef typename interpolator_type::ele_v ele_v ;
-    typedef typename interpolator_type::mc_ele_v mc_ele_v ;
+    const int vsize = unary_functor_type::vsize ;
+    const int channels = unary_functor_type::channels ;
+    typedef typename unary_functor_type::value_type value_type ;
+    typedef typename unary_functor_type::ele_type ele_type ;
+    typedef typename unary_functor_type::ic_v ic_v ;
+    typedef typename unary_functor_type::ele_v ele_v ;
+    typedef typename unary_functor_type::mc_ele_v mc_ele_v ;
 
     // number of vectorized results
     int aggregates = region.size() / vsize ;
@@ -1016,7 +1090,7 @@ struct _grid_eval < interpolator_type ,
       select *= channels ;    // offsets are in terms of value_type, expand
       
       // now we can call the vectorized eval routine
-      itp.v_eval ( select , vweight , vtarget ) ;
+      itp.eval ( select , vweight , vtarget ) ;
       
       // finally we scatter the vectorized result to target memory
       for ( int ch = 0 ; ch < channels ; ch++ )
@@ -1042,7 +1116,7 @@ struct _grid_eval < interpolator_type ,
         weight [ vigra::Shape2 ( e , 0 )  ] = grid_weight [ 0 ] [ ORDER * ofs + e ] ;
       int cum_ofs = initial_ofs + grid_ofs [ 0 ] [ ofs ] ;
       // TODO: here's the place to vectorize: gather weights into ORDER vectors
-      // of weights, have a target buffer, call v_eval
+      // of weights, have a target buffer, call eval
       itp.eval ( cum_ofs , weight , *iter ) ;
       ++iter ;
     }
@@ -1058,13 +1132,13 @@ struct _grid_eval < interpolator_type ,
 // via 'multithread()' which sets up the partitioning and distribution
 // to threads from a thread pool.
 
-template < typename interpolator_type , // type offering eval()
+template < typename evaluator_type , // type offering eval()
            typename target_type ,       // type of target MultiArrayView
            typename weight_type ,       // singular weight data type
            typename rc_type >           // singular real coordinate
 void st_grid_eval ( shape_range_type < target_type::actual_dimension > range ,
                     rc_type ** const _grid_coordinate ,
-                    const interpolator_type * itp ,
+                    const evaluator_type * itp ,
                     target_type * p_result )
 {
   const int ORDER = itp->get_order() ;
@@ -1100,7 +1174,7 @@ void st_grid_eval ( shape_range_type < target_type::actual_dimension > range ,
   // obtain the evaluator's mapping
   auto mmap = itp->get_mapping() ;
   
-  // fill in the weights and offsets, using the interplator's mapping to split
+  // fill in the weights and offsets, using the interpolator's mapping to split
   // the coordinates received in grid_coordinate, the interpolator's obtain_weights
   // method to produce the weight components, and the strides of the coefficient array
   // to convert the integral parts of the coordinates into offsets.
@@ -1118,7 +1192,7 @@ void st_grid_eval ( shape_range_type < target_type::actual_dimension > range ,
   MultiArray < 2 , weight_type > weight ( vigra::Shape2 ( ORDER , dim_target ) ) ;
   
   // now call the recursive workhorse routine
-  detail::_grid_eval < interpolator_type ,
+  detail::_grid_eval < evaluator_type ,
                        target_type ,
                        weight_type ,
                        dim_target - 1 >()
@@ -1166,19 +1240,24 @@ void st_grid_eval ( shape_range_type < target_type::actual_dimension > range ,
 ///
 /// grid_eval is useful for generating scaled representation of the original
 /// data, but when scaling down, aliasing will occur and the data should be
-/// low-pass-filtered adequately before processing.
-
-template < typename interpolator_type , // type offering eval()
+/// low-pass-filtered adequately before processing. Let me hint here that
+/// low-pass filtering can be achieved by using b-spline reconstruction on
+/// raw data (a 'smoothing spline') - or by prefiltering with exponential
+/// smoothing, which can be activated by passing the 'smoothing' parameter
+/// to the prefiltering routine. Of course any other way of smoothing can
+/// be used just the same, like a Burt filter or Gaussian smoothing.
+
+template < typename evaluator_type , // type offering eval()
            typename target_type ,
            typename weight_type ,       // singular weight data type
            typename rc_type >           // singular real coordinate
 void grid_eval ( rc_type ** const grid_coordinate ,
-                 const interpolator_type & itp ,
+                 const evaluator_type & itp ,
                  target_type & result )
 {
   const int dim_target = target_type::actual_dimension ;
   shape_range_type < dim_target > range ( shape_type < dim_target > () , result.shape() ) ;
-  multithread ( st_grid_eval < interpolator_type , target_type , weight_type , rc_type > ,
+  multithread ( st_grid_eval < evaluator_type , target_type , weight_type , rc_type > ,
                 ncores * 8 , range , grid_coordinate , &itp , &result ) ;
 }
 
diff --git a/unary_functor.h b/unary_functor.h
new file mode 100644
index 0000000..f858567
--- /dev/null
+++ b/unary_functor.h
@@ -0,0 +1,309 @@
+/************************************************************************/
+/*                                                                      */
+/*    vspline - a set of generic tools for creation and evaluation      */
+/*              of uniform b-splines                                    */
+/*                                                                      */
+/*            Copyright 2015, 2016 by Kay F. Jahnke                     */
+/*                                                                      */
+/*    Permission is hereby granted, free of charge, to any person       */
+/*    obtaining a copy of this software and associated documentation    */
+/*    files (the "Software"), to deal in the Software without           */
+/*    restriction, including without limitation the rights to use,      */
+/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
+/*    sell copies of the Software, and to permit persons to whom the    */
+/*    Software is furnished to do so, subject to the following          */
+/*    conditions:                                                       */
+/*                                                                      */
+/*    The above copyright notice and this permission notice shall be    */
+/*    included in all copies or substantial portions of the             */
+/*    Software.                                                         */
+/*                                                                      */
+/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
+/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
+/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
+/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
+/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
+/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
+/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
+/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
+/*                                                                      */
+/************************************************************************/
+
+/*! \file unary_functor.h
+
+    \brief interface definition for unary functors
+
+*/
+
+#ifndef VSPLINE_UNARY_FUNCTOR_H
+#define VSPLINE_UNARY_FUNCTOR_H
+
+#include <vspline/common.h>
+#include <vspline/coordinate.h>
+
+namespace vspline {
+
+#ifdef USE_VC
+  
+// properly defining the simdized type of a given value_type is quite a mouthful,
+// so we use aliases here. first we define the simdized elementary type of T
+  
+template < class T >
+using simdized_ele_type = typename vspline::vector_traits<T>::ele_v ;
+
+// next we define an aggregate of simdized elementary types constituting
+// a simdized value_type.
+                          
+template < class T >
+using simdized_type = typename vspline::vector_traits<T>::type ;
+
+#endif
+
+template < typename IN ,
+           typename OUT ,
+           int _vsize = 1 >
+struct unary_functor
+{  
+  // number of dimensions
+
+  enum { dim_in = vigra::ExpandElementResult < IN > :: size } ;
+  enum { dim_out = vigra::ExpandElementResult < OUT > :: size } ;
+
+  // number of elements in simdized data
+
+  enum { vsize = _vsize } ;
+
+  // typedefs for incoming (argument)  and outgoing (result) type
+  
+  typedef IN in_type ;
+  typedef OUT out_type ;
+  
+  // elementary types of same
+  
+  typedef typename vigra::ExpandElementResult < IN > :: type in_ele_type ;
+  typedef typename vigra::ExpandElementResult < OUT > :: type out_ele_type ;
+  
+  /// for single coordinates we require an eval() taking a const& to a
+  /// single coordinate and a reference to the place where the result should
+  /// be deposited.
+
+  virtual void eval ( const IN & c ,
+                      OUT & result ) const = 0 ;
+
+  /// for convenience: operator() using eval()
+
+  OUT operator() ( const IN & c ) const
+  {
+    OUT v ;
+    eval ( c , v ) ;
+    return v ;
+  }
+  
+#ifdef USE_VC
+  
+  // for vectorized operation, we need a few extra typedefs
+  // I use a _v suffix to indicate vectorized types and the prefixes
+  // mc_ for multichannel and nd_ for multidimensional
+
+  /// a simdized type of the elementary type of value_type,
+  /// which is used for coefficients and results. this is fixed via
+  /// the traits class vector_traits (in common.h). Note how we derive
+  /// this type using vsize from the template argument, not what
+  /// vspline::vector_traits deems appropriate for ele_type - though
+  /// both numbers will be the same in most cases.
+  
+  typedef typename vector_traits < IN , vsize > :: ele_v in_ele_v ;
+  typedef typename vector_traits < OUT , vsize > :: ele_v out_ele_v ;
+  
+  typedef typename vector_traits < IN , vsize > :: type in_v ;
+  typedef typename vector_traits < OUT , vsize > :: type out_v ;
+  
+  typedef typename vector_traits < int , vsize > :: ele_v ic_v ;
+  
+  /// simdized evaluation routine. This routine takes the simdized coordinate
+  /// per const reference and deposits the result via a reference. This
+  /// method is pure virtual and must be implemented by derived classes.
+  /// This is the only vectorized evaluation routine which a class derived
+  /// from vspline::unary_functor has to provide: the other eval variants
+  /// (below) are taking either or both of their arguments as pointers to
+  /// (interleaved) memory and (de)interleave to/from simdized types defined
+  /// inside the method. This is provided as utility code by class
+  /// vspline::unary_functor and is pulled in with a using declaration
+  /// in the macro using_unary_functor_types, see below.
+
+   virtual void eval ( const in_v & c ,             // simdized nD input
+                       out_v & result ) const = 0 ; // simdized mD output
+
+  /// here I provide an (inefficient) method to evaluate vectorized data by
+  /// delegating to the unvectorized evaluation routine. This will work, but
+  /// it's slow compared to proper vector code.
+  /// But for 'quick shots' one can use this idiom in a derived class:
+  ///
+  ///    virtual void eval ( const in_v & c ,
+  ///                        out_v & result ) const
+  ///    {
+  ///      broadcast_eval ( c , result ) ;
+  ///    }
+  
+  void broadcast_eval ( const in_v & c ,       // simdized nD input
+                        out_v & result ) const // simdized mD output
+  {
+    in_type cc ;
+    out_type v ;
+    in_ele_type * pc = (in_ele_type*)(&cc) ;
+    out_ele_type * pv = (out_ele_type*)(&v) ;
+
+    for ( int e = 0 ; e < vsize ; e++ )
+    {
+      for ( int d = 0 ; d < dim_in ; d++ )
+        pc[d] = c[d][e] ;
+      eval ( cc , v ) ;
+      for ( int d = 0 ; d < dim_out ; d++ )
+        result[d][e] = pv[d] ;
+    }
+  }
+
+  /// again, for convenience, operator() using eval()
+
+  out_v operator() ( const in_v & c ) const
+  {
+    out_v v ;
+    eval ( c , v ) ;
+    return v ;
+  }
+  
+  // next we have a few methods to interface with (interleaved) memory.
+
+  /// method to load/gather interleaved memory, T being IN or OUT
+  
+  template < typename T >
+  void load ( const T * const pc ,
+              typename vector_traits < T , vsize > :: type & cv ) const
+  {
+    typedef typename vigra::ExpandElementResult < T > :: type ele_type ;
+    enum { dimension = vigra::ExpandElementResult < T > :: size } ;
+
+    if ( dimension == 1 )
+    {
+      cv[0].load ( (ele_type*) pc ) ;
+    }
+    else
+    {
+      for ( int d = 0 ; d < dimension ; d++ )
+      {
+        cv[d].gather ( (ele_type*) pc ,
+                       ic_v::IndexesFromZero() * dimension + d ) ;
+      }
+    }
+  }
+  
+  /// method to store/scatter a simdized value to interleaved memory
+
+  template < typename T >
+  void store ( const typename vector_traits < T , vsize > :: type & ev ,
+               T * result ) const
+  {
+    typedef typename vigra::ExpandElementResult < T > :: type ele_type ;
+    enum { dimension = vigra::ExpandElementResult < T > :: size } ;
+
+    if ( dimension == 1 )
+    {
+      ev[0].store ( (ele_type*) result ) ;
+    }
+    else
+    {
+      for ( int d = 0 ; d < dimension ; d++ )
+      {
+        ev[d].scatter ( (ele_type*) result ,
+                        ic_v::IndexesFromZero() * dimension + d ) ;
+      }
+    }
+  } ;
+
+  void eval ( const in_type * const pmc ,
+              out_type * result ) const
+  {
+    in_v cv ;
+    out_v ev ;
+
+    load ( pmc , cv ) ;
+    eval ( cv , ev ) ;
+    store ( ev , result ) ;
+  } ;
+
+  /// eval taking a reference to a simdized coordinate and a pointer
+  /// to memory accommodating the result, expecting this memory to be
+  /// contiguous.
+
+  void eval ( const in_v & cv ,
+              out_type * result ) const
+  {
+    out_v ev ;
+
+    eval ( cv , ev ) ;
+    store ( ev , result ) ;
+  } ;
+
+  /// variant taking the coordinates via a pointer and
+  /// evaluating into a simdized result passed in by reference
+
+  void eval ( const in_type * const pmc ,
+              out_v & result ) const
+  {
+    in_v cv ;
+
+    load ( pmc , cv ) ;
+    eval ( cv , result ) ;
+  } ;
+
+#endif
+
+} ;
+
+/// since we use the types in class unary_functor frequently, there are
+/// macros to pull all the types of class unary_functor into a class derived
+/// from it. classes which want to use these macros need to invoke them
+/// with the base class as their argument.
+/// for unvectorized operation we have:
+
+#define using_singular_unary_functor_types(base_type) \
+  using typename base_type::in_ele_type ;     \
+  using typename base_type::out_ele_type ;    \
+  using typename base_type::in_type ;         \
+  using typename base_type::out_type ;        \
+  enum { dim_in = base_type::dim_in } ;       \
+  enum { dim_out = base_type::dim_out } ;     \
+  using base_type::operator() ;
+
+/// for vectorized operation there are a few more types to pull in, and we also
+/// pull in those eval variants which aren't pure virtual in the base class
+/// with a using declaration.
+
+#define using_simdized_unary_functor_types(base_type) \
+  using typename base_type::in_ele_v ;     \
+  using typename base_type::out_ele_v ;    \
+  using typename base_type::in_v ;         \
+  using typename base_type::out_v ;        \
+  using typename base_type::ic_v ;         \
+  using base_type::broadcast_eval ;        \
+  using base_type::eval ;
+  
+/// finally a macro automatically pulling in the proper set of type names
+/// depending on USE_VC. This is the macro used throughout. Here, vsize is also
+/// fixed to the base class' value - or to 1 if USE_VC isn't defined.
+
+#ifdef USE_VC
+#define using_unary_functor_types(base_type)    \
+  enum { vsize = base_type::vsize } ;          \
+  using_singular_unary_functor_types(base_type) \
+  using_simdized_unary_functor_types(base_type) 
+#else  
+#define using_unary_functor_types(base_type)    \
+  enum { vsize = 1 } ;                         \
+  using_singular_unary_functor_types(base_type) 
+#endif
+
+} ; // end of namespace vspline
+
+#endif // VSPLINE_UNARY_FUNCTOR_H
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git



More information about the debian-science-commits mailing list