[vspline] 10/72: using aggregate_traits throughout, work on pv.cc I'm trying to push some more fixed types out, so now I've worked on vigra::TinyVectors, replacing them with aggregate_traits<T, N>::type. I'd like to move to code where the aggregated types 'collapse' to simple type T if N==1, but I still have trouble getting my code to work with this definition. Anyway, factoring more of the type-fixing out is a good idea. I've also had good fun with pv. I tuned it some more and by now it's running quite smoothly at 50fps, except for some sfml-specific quirks which I can't resolve: some 10 seconds after starting pv, I get a 'does not respond' false error which I have to acknowledge, sometimes resulting in a crash soon after for no apparent reason. Sometimes the false error message even appears twice. Apparently this can be fixed by switching to a newer sfml version, but the 'stable' version 2.4 I downloaded crashed instantly, and compiling from source now also results in an error (which doesn't seem to be sfml's fault but a compiler bug) - so there's an impasse here... Anyway, if pv is running it runs fine, and I've added a caleidoscope function just for fun which modifies the warp array only, so that the resulting caleidoscopic animation is just as fast as the plain image.

Kay F. Jahnke kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:38 UTC 2017


This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit 7528e65d28e66664f28aee01bdbf3dc3faeee962
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Mon Nov 14 12:18:34 2016 +0100

    using aggregate_traits throughout, work on pv.cc
    I'm trying to push some more fixed types out, so now I've worked on
    vigra::TinyVectors, replacing them with aggregate_traits<T,N>::type.
    I'd like to move to code where the aggregated types 'collapse' to
    simple type T if N==1, but I still have trouble getting my code to
    work with this definition. Anyway, factoring more of the type-fixing out
    is a good idea.
    I've also had good fun with pv. I tuned it some more and by now it's
    running quite smoothly at 50fps, except for some sfml-specific quirks which
    I can't resolve: some 10 seconds after starting pv, I get a 'does not respond'
    false error which I have to acknowledge, sometimes resulting in a crash soon
    after for no apparent reason. Sometimes the false error message even appears
    twice. Apparently this can be fixed by switching to a newer sfml version,
    but the 'stable' version 2.4 I downloaded crashed instantly, and compiling
    from source now also results in an error (which doesn't seem to be sfml's
    fault but a compiler bug) - so there's an impasse here...
    Anyway, if pv is running it runs fine, and I've added a caleidoscope
    function just for fun which modifies the warp array only, so that the resulting
    caleidoscopic animation is just as fast as the plain image.
---
 brace.h                     |  40 ++--
 bspline.h                   |   2 +-
 common.h                    | 366 +++++++++++++++++++++++++++++++---
 eval.h                      | 471 +++++++++++++++++++++++++-------------------
 example/impulse_response.cc |  96 ++++++---
 example/roundtrip.cc        |   4 +-
 filter.h                    |  21 +-
 mapping.h                   | 167 ++++++----------
 prefilter.h                 |   8 +-
 remap.h                     | 100 ++++++----
 10 files changed, 829 insertions(+), 446 deletions(-)

diff --git a/brace.h b/brace.h
index e729172..3c91e08 100644
--- a/brace.h
+++ b/brace.h
@@ -207,7 +207,7 @@ struct bracer
 /// is the section of the coeficient array the evaluation code will look at.
 
   static shape_type target_shape ( shape_type source_shape ,
-                            TinyVector < bc_code , dimension > bcv ,
+                            typename aggregate_traits < bc_code , dimension > :: type bcv ,
                             int spline_degree )
   {
     shape_type target_shape ;
@@ -218,20 +218,20 @@ struct bracer
     return target_shape ;
   }
 
-/// convenience variant of the previous routine using the same BC for all axes
-
-  static shape_type target_shape ( shape_type source_shape ,
-                            bc_code bc ,
-                            int spline_degree )
-  {
-    TinyVector < bc_code , dimension > bcv ( bc ) ;
-    return target_shape ( source_shape , bcv , spline_degree ) ;
-  }
+// /// convenience variant of the previous routine using the same BC for all axes
+// 
+//   static shape_type target_shape ( shape_type source_shape ,
+//                             bc_code bc ,
+//                             int spline_degree )
+//   {
+//     typename aggregate_traits < bc_code , dimension > :: type bcv ( bc ) ;
+//     return target_shape ( source_shape , bcv , spline_degree ) ;
+//   }
   
 /// this method gives the left offset to the 'core' subarray (array minus bracing),
 /// given the BC codes and the spline degree
 
-  static shape_type left_corner ( TinyVector < bc_code , dimension > bcv ,
+  static shape_type left_corner ( typename aggregate_traits < bc_code , dimension > :: type bcv ,
                                   int spline_degree )
   {
     shape_type target_offset ;
@@ -243,8 +243,8 @@ struct bracer
 /// this method gives the right offset to the 'core' subarray (array minus bracing),
 /// given the BC codes and the spline degree
 
-  static shape_type right_corner ( TinyVector < bc_code , dimension > bcv ,
-                            int spline_degree )
+  static shape_type right_corner ( typename aggregate_traits < bc_code , dimension > :: type bcv ,
+                                   int spline_degree )
   {
     shape_type target_offset ;
     for ( int d = 0 ; d < dimension ; d++ )
@@ -255,8 +255,8 @@ struct bracer
 /// given a braced array, return the size of it's 'core', the array without applied bracing
 
   static shape_type core_shape ( view_type& a ,
-                          TinyVector < bc_code , dimension > bcv ,
-                          int spline_degree )
+                                 typename aggregate_traits < bc_code , dimension > :: type bcv ,
+                                 int spline_degree )
   {
     return a.subarray (   a.shape()
                         - (   right_corner ( bcv , spline_degree )
@@ -266,8 +266,8 @@ struct bracer
 /// produce a view to the core
 
   static view_type core_view ( view_type& a ,
-                        TinyVector < bc_code , dimension > bc ,
-                        int spline_degree )
+                               typename aggregate_traits < bc_code , dimension > :: type bc ,
+                               int spline_degree )
   {
     return a.subarray ( left_corner ( bc , spline_degree ) ,
                         a.shape() - right_corner ( bc , spline_degree ) ) ;
@@ -498,9 +498,9 @@ struct bracer
 /// This variant of apply braces along all axes in one go.
 
   static void apply ( view_type& a ,          ///< target array, containing the core and (empty) frame
-               TinyVector < bc_code , dimension > bcv ,      ///< boundary condition codes
-               TinyVector < int , dimension > left_corner ,  ///< sizes of left braces
-               TinyVector < int , dimension > right_corner ) ///< sizes of right braces
+               typename aggregate_traits < bc_code , dimension > :: type bcv ,  ///< boundary condition codes
+               typename aggregate_traits < int , dimension > :: type left_corner ,  ///< sizes of left braces
+               typename aggregate_traits < int , dimension > :: type right_corner ) ///< sizes of right braces
   {
     for ( int dim = 0 ; dim < dimension ; dim++ )
       apply ( a , bcv[dim] , left_corner[dim] , right_corner[dim] , dim ) ;
diff --git a/bspline.h b/bspline.h
index 97f5a6a..7b2a79f 100644
--- a/bspline.h
+++ b/bspline.h
@@ -166,7 +166,7 @@ struct bspline
   /// multidimensional index type
   typedef typename view_type::difference_type shape_type ;
   /// nD type for one boundary condition per axis
-  typedef TinyVector < bc_code , dimension > bcv_type ;
+  typedef typename aggregate_traits < bc_code , dimension > :: type bcv_type ;
 
   /// type for pointer to a prefiltering method
   typedef void (*p_solve) ( view_type& ,
diff --git a/common.h b/common.h
index e6e2da1..a645639 100644
--- a/common.h
+++ b/common.h
@@ -73,6 +73,312 @@
 /// use SimdArrays of twice the vector size, because this produces the fastest
 /// evaluation on my system.
 
+template < class T , int _vsize >
+struct masked_pseudo_vector ;
+
+template < class T , int _vsize >
+struct pseudo_vector: public vigra::TinyVector < T , _vsize >
+{
+  typedef pseudo_vector < T , _vsize > this_type ;
+  typedef vigra::TinyVector < T , _vsize > base_type ;
+  typedef T value_type ;
+  typedef T EntryType ;
+  typedef pseudo_vector < int , _vsize > IndexType ;
+  typedef pseudo_vector < bool , _vsize > MaskType ;
+  typedef pseudo_vector < int , _vsize > index_type ;
+  typedef pseudo_vector < bool , _vsize > mask_type ;
+  typedef pseudo_vector < bool , _vsize > Mask ;
+  
+  enum { vsize = _vsize } ;
+  enum { Size = _vsize } ;
+  
+  pseudo_vector()
+  : base_type()
+  {
+  }
+  
+  template < class I >
+  pseudo_vector ( const I & i )
+  : base_type ( i )
+  {} ;
+  
+  pseudo_vector ( const T* mem )
+  {
+    this->load ( mem ) ;
+  }
+  
+  pseudo_vector ( const T* mem , const index_type & indexes )
+  {
+    this->gather ( mem , indexes ) ;
+  }
+  
+  pseudo_vector < bool , _vsize > operator- () const
+  {
+    pseudo_vector < bool , _vsize > result ;
+    for ( int e = 0 ; e < vsize ; e++ )
+      result[e] = - (*this)[e] ;
+    return result ;
+  }
+  
+  pseudo_vector < bool , _vsize > operator< ( const this_type& other ) const
+  {
+    pseudo_vector < bool , _vsize > result ;
+    for ( int e = 0 ; e < vsize ; e++ )
+      result[e] = (*this)[e] < other[e] ;
+    return result ;
+  }
+  
+  pseudo_vector < bool , _vsize > operator< ( const value_type& rhs ) const
+  {
+    pseudo_vector < bool , _vsize > result ;
+    for ( int e = 0 ; e < vsize ; e++ )
+      result[e] = (*this)[e] < rhs ;
+    return result ;
+  }
+  
+  pseudo_vector < bool , _vsize > operator> ( const this_type& other ) const
+  {
+    pseudo_vector < bool , _vsize > result ;
+    for ( int e = 0 ; e < vsize ; e++ )
+      result[e] = (*this)[e] > other[e] ;
+    return result ;
+  }
+  
+  pseudo_vector < bool , _vsize > operator> ( const value_type& rhs ) const
+  {
+    pseudo_vector < bool , _vsize > result ;
+    for ( int e = 0 ; e < vsize ; e++ )
+      result[e] = (*this)[e] > rhs ;
+    return result ;
+  }
+  
+  
+  pseudo_vector < bool , _vsize > operator<= ( const this_type& other ) const
+  {
+    pseudo_vector < bool , _vsize > result ;
+    for ( int e = 0 ; e < vsize ; e++ )
+      result[e] = (*this)[e] <= other[e] ;
+    return result ;
+  }
+  
+  pseudo_vector < bool , _vsize > operator<= ( const value_type& rhs ) const
+  {
+    pseudo_vector < bool , _vsize > result ;
+    for ( int e = 0 ; e < vsize ; e++ )
+      result[e] = (*this)[e] <= rhs ;
+    return result ;
+  }
+  
+  pseudo_vector < bool , _vsize > operator>= ( const this_type& other ) const
+  {
+    pseudo_vector < bool , _vsize > result ;
+    for ( int e = 0 ; e < vsize ; e++ )
+      result[e] = (*this)[e] >= other[e] ;
+    return result ;
+  }
+  
+  pseudo_vector < bool , _vsize > operator>= ( const value_type& rhs ) const
+  {
+    pseudo_vector < bool , _vsize > result ;
+    for ( int e = 0 ; e < vsize ; e++ )
+      result[e] = (*this)[e] >= rhs ;
+    return result ;
+  }
+  
+  friend pseudo_vector < bool , _vsize > asin ( const this_type & arg )
+  {
+    pseudo_vector < bool , _vsize > result ;
+    for ( int e = 0 ; e < vsize ; e++ )
+      result[e] = asin ( arg[e] ) ;
+    return result ;
+  }
+  
+  friend pseudo_vector < bool , _vsize > atan2 ( const this_type & arg1 , const this_type & arg2 )
+  {
+    pseudo_vector < bool , _vsize > result ;
+    for ( int e = 0 ; e < vsize ; e++ )
+      result[e] = atan2 ( arg1[e] , arg2[e] ) ;
+    return result ;
+  }
+  
+//   this_type & operator++ ( int i = 1 )
+//   {
+//     for ( int e = 0 ; e < vsize ; e++ )
+//       ++(this[e]) ;
+//     return *this ;
+//   }
+  
+  friend this_type operator| ( const this_type & a , const this_type & b )
+  {    
+    pseudo_vector < bool , _vsize > result ;
+    for ( int e = 0 ; e < vsize ; e++ )
+      result[e] = a[e] | b[e] ;
+    return result ;
+  }
+  
+  static this_type IndexesFromZero()
+  {
+    this_type result ;
+    for ( int i = 0 ; i < vsize ; i++ )
+      result[i] = i ;
+    return result ;
+  }
+  
+  template < typename align_flag >
+  void load ( const T * const base , const align_flag f = false )
+  {
+    for ( int e = 0 ; e < vsize ; e++ )
+      (*this)[e] = base[e] ;
+  }
+  
+  void gather ( const T * const base , const IndexType & indexes )
+  {
+    for ( int e = 0 ; e < vsize ; e++ )
+      (*this)[e] = base[indexes[e]] ;
+  }
+  
+  void gather ( const T * const base , const IndexType & indexes , const MaskType & mask )
+  {
+    for ( int e = 0 ; e < vsize ; e++ )
+    {
+      if ( mask[e] )
+        (*this)[e] = base[indexes[e]] ;
+    }
+  }
+  
+  template < typename align_flag >
+  void store ( T * base , const align_flag f = false )
+  {
+    for ( int e = 0 ; e < vsize ; e++ )
+      base[e] = (*this)[e] ;
+  }
+  
+  void scatter ( T * base , const IndexType & indexes )
+  {
+    for ( int e = 0 ; e < vsize ; e++ )
+      base[indexes[e]] = (*this)[e] ;
+  }
+  
+  void scatter ( T * base , const IndexType & indexes , const MaskType & mask )
+  {
+    for ( int e = 0 ; e < vsize ; e++ )
+    {
+      if ( mask[e] )
+        base[indexes[e]] = (*this)[e] ;
+    }
+  }
+  
+  masked_pseudo_vector < T , vsize > operator() ( const MaskType& mask )
+  {
+    return masked_pseudo_vector < T , _vsize > ( *this , mask ) ;
+  }
+  
+  bool isFull() const
+  {
+    return this->all() ;
+  }
+
+} ;
+
+template < class T , int _vsize >
+bool any_of ( const pseudo_vector < T , _vsize > & v )
+{
+  return v.any() ;
+}
+
+template < class T , int _vsize >
+bool none_of ( const pseudo_vector < T , _vsize > & v )
+{
+  return ! v.any() ;
+}
+
+template < class T , int _vsize >
+bool all_of ( const pseudo_vector < T , _vsize > & v )
+{
+  return v.all() ;
+}
+
+template < class T , int _vsize >
+pseudo_vector < T , _vsize > floor ( const pseudo_vector < T , _vsize > & v )
+{
+  pseudo_vector < T , _vsize > result ;
+  for ( int e = 0 ; e < _vsize ; e++ )
+    result[e] = floor ( v[e] ) ;
+  return result ;
+}
+
+template < class T , int _vsize >
+struct masked_pseudo_vector
+: public pseudo_vector < T , _vsize >
+{
+  typedef pseudo_vector < T , _vsize > base_type ;
+  typedef masked_pseudo_vector < T , _vsize > this_type ;
+  
+  const typename pseudo_vector < T , _vsize > :: Mask mask ;
+  base_type & target ;
+  
+  masked_pseudo_vector ( base_type & _target ,
+                         const typename base_type :: Mask & _mask )
+  : target ( _target ) ,
+    mask ( _mask )
+  { } ;
+  
+  base_type & operator= ( const base_type & other )
+  {
+    for ( int e = 0 ; e < _vsize ; e++ )
+    {
+      if ( mask[e] )
+        target[e] = other[e] ;
+    }
+    return target ;
+  }
+  
+  base_type & operator-= ( const base_type & other )
+  {
+    for ( int e = 0 ; e < _vsize ; e++ )
+    {
+      if ( mask[e] )
+        target[e] -= other[e] ;
+    }
+    return target ;
+  }
+  
+  base_type & operator+= ( const base_type & other )
+  {
+    for ( int e = 0 ; e < _vsize ; e++ )
+    {
+      if ( mask[e] )
+        target[e] += other[e] ;
+    }
+    return target ;
+  }
+  
+  base_type & operator*= ( const base_type & other )
+  {
+    for ( int e = 0 ; e < _vsize ; e++ )
+    {
+      if ( mask[e] )
+        target[e] *= other[e] ;
+    }
+    return target ;
+  }
+  
+  base_type & operator/= ( const base_type & other )
+  {
+    for ( int e = 0 ; e < _vsize ; e++ )
+    {
+      if ( mask[e] )
+        target[e] /= other[e] ;
+    }
+    return target ;
+  }
+  
+} ;
+
+/// struct vector_traits is the central location to fix vecorization throughout vspline.
+/// Nothe how we use a default vector width twice the size of a standard Vc vector for
+/// the given T. In my tests, this made the code faster. TODO test on other systems
+
 template < class T , int _vsize = 2 * Vc::Vector < T > :: Size >
 struct vector_traits
 {
@@ -81,41 +387,53 @@ struct vector_traits
   // if vsize given is precisely the Size of a Vc::Vector<T>, make simdized_type
   // such a Vc::Vector, otherwise make it a Vc::SimdArray<T,vsize>
   
-//   typedef typename std::conditional < vsize == Vc::Vector < T > :: Size ,
-//                                       Vc::Vector < T > ,
-//                                       Vc::SimdArray < T , vsize > > :: type simdized_type ;
+  typedef typename std::conditional < vsize == Vc::Vector < T > :: Size ,
+                                      Vc::Vector < T > ,
+                                      Vc::SimdArray < T , vsize > > :: type type ;
 
 // to always use Simdarrays, I use
 
-  typedef Vc::SimdArray < T , vsize > simdized_type ;
+//   typedef Vc::SimdArray < T , vsize > simdized_type ;
 } ;
 
-#endif
+/// while I haven't put it to use everywhere, here is a similar struct to fix what I call
+/// aggregation: the grouping together of a fixed number (_channels) of class  T, packed
+/// tightly in memory. This is similar to a SIMD vector, but represents a view to data which
+/// would, if at all, lend itself to vertical vectorization. Types fixed by this traits class
+/// would typically be things like single pixels or nD coordinates.
 
-/// struct aggregate_traits is a traits class to help with aggregation of values
-/// into aggregates of values. The code, which is dimension-agnostic, has to
-/// operate on aggregate types as well as on singular types. With this traits class
-/// we can enable the code to pick the singular type as the 'aggregate type' when
-/// the 'aggregate' size is 1, and a vigra::TinyVector otherwise. This fixes the
-/// formation of small aggregates in one central place and removes unnecessary
-/// aggegations where N==1.
-  
-template < typename T , int N >
+template < class T , int _channels >
 struct aggregate_traits
 {
+  enum { channels = _channels } ;
   typedef T value_type ;
-  enum { size = N } ;
-  typedef vigra::TinyVector < T , N > type ;
+
+// I'd like to use the definition below, but so far I have trouble getting it to work
+// smoothly with the rest of my code. This is due to the fact that at times I have variants
+// of routines which only differ in the type of a parameter which may be an aggregate type.
+// if the 'aggregate of one' type specification is removed by the definition below, the signatures
+// become identical producing a 'can't overload' error.
+
+//   typedef typename std::conditional < channels == 1 ,
+//                                       T ,
+//                                       vigra::TinyVector < T , channels > > :: type type ;
+
+// so for now, it's TinyVectors throughout. This does at least put the aggregation
+// in a single place and keeps the TinyVectors out of the remainder of the code.
+
+  typedef typename vigra::TinyVector < T , channels > type ;
 } ;
 
-template < typename T >
-struct aggregate_traits < T , 1 >
+template < class T , int _vsize = 2 * Vc::Vector < T > :: Size >
+struct ps_vector_traits
 {
-  typedef T value_type ;
-  enum { size = 1 } ;
-  typedef T type ;
+  enum { vsize = _vsize } ;
+
+  typedef pseudo_vector < T , vsize > simdized_type ;
 } ;
 
+#endif
+
 namespace vspline {
 
 /// dimension-mismatch is thrown if two arrays have different dimensions
@@ -417,7 +735,7 @@ struct divide_and_conquer
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < ele_type > :: simdized_type value_v ;
+  typedef typename vector_traits < ele_type > :: type value_v ;
   enum { vsize = value_v::Size } ;
 
   /// now we repeat the pattern for vectorized operation.
@@ -441,8 +759,8 @@ struct divide_and_conquer
 
     if ( data.isUnstrided() )
     {
-      typedef typename vector_traits < int , vsize > :: simdized_type ic_v ;
-      typedef typename aggregate_traits < ic_v , channels > :: type mc_ic_v ;
+      typedef typename vector_traits < int , vsize > :: type ic_v ;
+      typedef typename vigra::TinyVector < ic_v , channels > :: type mc_ic_v ;
 
       // if data is unstrided, we can process it's contents by gather/scatter
       // operations to vectors which we process with the functor vf. This is
diff --git a/eval.h b/eval.h
index 5184b18..f7f0fbc 100644
--- a/eval.h
+++ b/eval.h
@@ -147,20 +147,22 @@ MultiArray < 2 , target_type > calculate_weight_matrix ( int degree , int deriva
 /// function using the same signature. It is not specific to b-splines.
 /// We access the weight functors via a pointer to this base class in the code below.
 
-template < typename rc_type >
+template < typename ele_type , // elementary type of value_type
+           typename rc_type >  // type of real-valued coordinate
 struct weight_functor_base
 {
   // we define two pure virtual overloads for operator(), one for unvectorized
   // and one for vectorized operation. In case the scope of evaluation is extended
   // to other types of values, we'll have to add the corresponding signatures here.
   
-  virtual void operator() ( rc_type* result , const rc_type& delta ) = 0 ;
+  virtual void operator() ( ele_type* result , const rc_type& delta ) = 0 ;
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < rc_type > :: simdized_type rc_type_v ;
+  typedef typename vector_traits < ele_type > :: type ele_v ;
+  typedef typename vector_traits < rc_type , ele_v::Size > :: type rc_v ;
 
-  virtual void operator() ( rc_type_v* result , const rc_type_v& delta ) = 0 ;
+  virtual void operator() ( ele_v* result , const rc_v& delta ) = 0 ;
 
 #endif
 } ;
@@ -172,20 +174,18 @@ struct weight_functor_base
 /// the constructor choose the derivative gives more flexibility and less type
 /// proliferation.
 
-template < typename target_type ,   // type for weights (may be a Vc::Vector)
-           typename rc_type >       // single real value
+template < typename target_type ,   // type for weights (may be a simdized type)
+           typename ele_type ,      // elementary type of value_type
+           typename delta_type >    // type for deltas (may be a simdized type)
 
 struct bspline_derivative_weights
-#ifdef USE_VC
-: public Vc::VectorAlignedBase
-#endif
 {
-  typedef typename MultiArray<2,rc_type>::iterator wm_iter ;
+  typedef typename MultiArray < 2 , ele_type > :: iterator wm_iter ;
 
   // TODO I would make this const, but in vigra, the iterator obtained by calling begin()
   // on a const array is protected (multi_ierator.hxx, 438) why is this so?
 
-  MultiArray<2,rc_type> weight_matrix ;
+  MultiArray < 2 , ele_type > weight_matrix ;
   const int degree ;
   const int derivative ;
   const int columns ;
@@ -193,7 +193,7 @@ struct bspline_derivative_weights
   wm_iter wm_end ;
 
   bspline_derivative_weights ( int _degree , int _derivative = 0 )
-  : weight_matrix ( calculate_weight_matrix<rc_type> ( _degree , _derivative ) ) ,
+  : weight_matrix ( calculate_weight_matrix < ele_type > ( _degree , _derivative ) ) ,
     degree ( _degree ) ,
     derivative ( _derivative ) ,
     columns ( _degree + 1 )
@@ -202,9 +202,9 @@ struct bspline_derivative_weights
     wm_end = weight_matrix.end() ;
   } ;
   
-  void operator() ( target_type* result , const target_type & delta )
+  void operator() ( target_type* result , const delta_type & delta )
   {
-    register target_type power = delta ;
+    register target_type power ( delta ) ;
     register wm_iter factor_it = wm_begin ;
     register const wm_iter factor_end = wm_end ;
 
@@ -228,7 +228,7 @@ struct bspline_derivative_weights
         }
         if ( factor_it == factor_end ) // avoid multiplication if exhausted, break now
           break ;
-        power *= delta ;               // otherwise produce next power(s) of delta(s)
+        power *= target_type ( delta ) ; // otherwise produce next power(s) of delta(s)
       }
     }
   }
@@ -237,20 +237,21 @@ struct bspline_derivative_weights
 /// we derive from the weight functor base class to obtain a (multi-) functor
 /// specifically for (derivatives of) a b-spline :
 
-template < typename rc_type >
+template < typename ele_type , typename rc_type >
 struct bspline_derivative_weight_functor
-: public weight_functor_base < rc_type >
+: public weight_functor_base < ele_type , rc_type >
 {
-  typedef weight_functor_base < rc_type > base_class ;
+  typedef weight_functor_base < ele_type , rc_type > base_class ;
 
   // set up the fully specialized functors to which operator() delegates:
 
-  bspline_derivative_weights < rc_type , rc_type >  weights ;
+  bspline_derivative_weights < ele_type , ele_type , rc_type >  weights ;
 
 #ifdef USE_VC
-  using typename base_class::rc_type_v ;
+  using typename base_class::ele_v ;
+  using typename base_class::rc_v ;
   
-  bspline_derivative_weights < rc_type_v , rc_type >  weights_v ; 
+  bspline_derivative_weights < ele_v , ele_type , rc_v >  weights_v ; 
 #endif
 
   bspline_derivative_weight_functor ( int degree , int d = 0 )
@@ -263,13 +264,13 @@ struct bspline_derivative_weight_functor
   
   // handle the weight calculation by delegation to the functors set up at construction
   
-  virtual void operator() ( rc_type* result , const rc_type& delta )
+  virtual void operator() ( ele_type* result , const rc_type& delta )
   {
     weights ( result , delta ) ;
   }
 
 #ifdef USE_VC
-  virtual void operator() ( rc_type_v* result , const rc_type_v& delta )
+  virtual void operator() ( ele_v* result , const rc_v& delta )
   {
     weights_v ( result , delta ) ;
   }
@@ -462,9 +463,6 @@ template < int dimension ,         ///< dimension of the spline
            class ic_type = int >   ///< singular integral coordinate, currently only int
 
 class evaluator
-#ifdef USE_VC
-: public Vc::VectorAlignedBase
-#endif
 {
 public:
   
@@ -496,8 +494,10 @@ public:
   /// types for a multidimensional real/integral coordinate
   // TODO this is duplicating the typedef in mapping.h
 
-  typedef TinyVector < rc_type , dimension >                       nd_rc_type ;
-  typedef TinyVector < ic_type , dimension >                       nd_ic_type ;
+  typedef typename aggregate_traits < rc_type , dimension > :: type   nd_rc_type ;
+  typedef typename aggregate_traits < ic_type , dimension > :: type   nd_ic_type ;
+
+  typedef typename aggregate_traits < int , dimension > :: type   derivative_spec_type ;
 
   /// type for a 'split' coordinate, mainly used internally in the evaluator, but
   /// also in 'warp arrays' used for remapping
@@ -509,7 +509,7 @@ public:
   typedef typename MultiArray < 1 , ic_type > :: iterator          offset_iterator ;
   typedef MultiArrayView < dimension + 1 , ele_type >              component_view_type ;
   typedef typename component_view_type::difference_type            expanded_shape_type ;
-  typedef TinyVector < ele_type* , channels >                      component_base_type ;
+  typedef typename aggregate_traits < ele_type* , channels > :: type component_base_type ;
 
 #ifdef USE_VC
 
@@ -519,10 +519,9 @@ public:
 
   /// a simdized type of the elementary type of value_type,
   /// which is used for coefficients and results. this is fixed via
-  /// the traits class vector_traits (in common.h) to be a Vc::Vector
-  /// or Vc::SimdArray of ele_type:
-
-  typedef typename vector_traits < ele_type > :: simdized_type ele_v ;
+  /// the traits class vector_traits (in common.h)
+  
+  typedef typename vector_traits < ele_type > :: type ele_v ;
   
   /// element count of Simd data types.
   
@@ -530,27 +529,27 @@ public:
 
   /// compatible-sized simdized type of vsize ic_type
 
-  typedef typename vector_traits < ic_type , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < ic_type , vsize > :: type ic_v ;
 
   /// compatible-sized simdized type of vsize rc_type
 
-  typedef typename vector_traits < rc_type , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < rc_type , vsize > :: type rc_v ;
 
   /// multichannel value as SoA, for pixels etc.
 
-  typedef TinyVector < ele_v , channels > mc_ele_v ;
+  typedef typename aggregate_traits < ele_v , channels > :: type mc_ele_v ;
 
   /// SoA for nD coordinates/components
 
-  typedef TinyVector < rc_v , dimension > nd_rc_v ;
+  typedef typename aggregate_traits < rc_v , dimension > :: type nd_rc_v ;
 
   /// SoA for nD shapes (or multidimensional indices)
 
-  typedef TinyVector < ic_v , dimension > nd_ic_v ;
+  typedef typename aggregate_traits < ic_v , dimension > :: type nd_ic_v ;
 
   /// SoA for multichannel indices (used for gather/scatter operations)
 
-  typedef TinyVector < ic_v , channels >  mc_ic_v ;
+  typedef typename aggregate_traits < ic_v , channels > :: type  mc_ic_v ;
 
 #else
   
@@ -562,19 +561,19 @@ public:
 
   typedef nd_mapping < ic_type , rc_type , dimension , vsize > nd_mapping_type ;
 
-  typedef weight_functor_base < ele_type > weight_functor_base_type ;
+  typedef weight_functor_base < ele_type , rc_type > weight_functor_base_type ;
 
   /// in the context of b-spline calculation, this is the weight generating
   /// functor which will be used:
 
-  typedef bspline_derivative_weight_functor < ele_type > bspline_weight_functor_type ;
+  typedef bspline_derivative_weight_functor < ele_type , rc_type > bspline_weight_functor_type ;
   
   // to try out gaussian weights, one might instead use
   // typedef gaussian_weight_functor < ele_type > bspline_weight_functor_type ;
   
   /// we need one functor per dimension:
     
-  typedef TinyVector < weight_functor_base_type* , dimension > nd_weight_functor ;
+  typedef typename aggregate_traits < weight_functor_base_type* , dimension >  :: type nd_weight_functor ;
   
   // while in the context of B-splines the weight functors are, of course, functors which
   // calculate the weights via the B-spline basis functions, the formulation we use here
@@ -617,7 +616,7 @@ public:
   evaluator ( const array_type& _coefficients ,
               nd_mapping_type _mmap ,
               int _spline_degree ,
-              TinyVector < int , dimension > _derivative = TinyVector < int , dimension >(0) )
+              derivative_spec_type _derivative = derivative_spec_type ( 0 ) )
   : coefficients ( _coefficients ) ,
     spline_degree ( _spline_degree ) ,
     ORDER ( _spline_degree + 1 ) ,
@@ -725,7 +724,7 @@ public:
   /// sufficient to create a suitable evaluator object:
   
   evaluator ( const bspline < value_type , dimension > & bspl ,
-              TinyVector < int , dimension > _derivative = TinyVector < int , dimension >(0) )
+              derivative_spec_type _derivative = derivative_spec_type ( 0 ) )
   : evaluator ( bspl.coeffs ,
                 nd_mapping_type ( bspl.bcv , bspl.spline_degree , bspl.core_shape ) ,
                 bspl.spline_degree ,
@@ -835,44 +834,124 @@ public:
     }
   } ;
 
-  // next are the variants of operator(). there are quite a few, since I've coded for operation
+  // next are evaluation routines. there are quite a few, since I've coded for operation
   // from different starting points and both for vectorized and nonvectorized operation.
   
-  /// variant of operator() which takes an offset and a set of weights. This is the final delegate,
+  /// evaluation variant which takes an offset and a set of weights. This is the final delegate,
   /// calling the recursive _eval method. Having the weights passed in via a const MultiArrayViev &
   /// allows calling code to provide their own weights, together with their shape, in one handy
   /// packet. And in the normal sequence of delegation inside class eval, the next routine 'up'
-  /// can also package the weights nicely in a MultiArrayView.
+  /// can also package the weights nicely in a MultiArrayView. Note that select is now an ic_type,
+  /// a single integral value representing an offset into the coefficient array. We can either use
+  /// a different signature or a different name for the routine to keep it distinct, I chose to
+  /// name it differently (not eval):
   
-  value_type operator() ( const ic_type& select ,
-                          const MultiArrayView < 2 , ele_type > & weight )
+  void eval0 ( const ic_type & select ,
+               const MultiArrayView < 2 , ele_type > & weight ,
+               value_type & result )
   {
     const value_type * base = coefficients.data() + select ;
     offset_iterator ofs = offsets.begin() ;  // offsets reflects the positions inside the subarray
-    return _eval<level,value_type>() ( base , ofs , weight ) ;
+    result = _eval<level,value_type>() ( base , ofs , weight ) ;
+  }
+
+  /// 'penultimate' delegate, taking a multidimensional index 'select' to the beginning of
+  /// the coefficient window to process. Here the multidimensional index is translated
+  /// into an offset from the coefficient array's base address. Carrying the information as
+  /// a reference to a multidimensional index all the way to this point does no harm, it's
+  /// only a reference after all.
+
+  void eval ( const nd_ic_type & select ,
+              const MultiArrayView < 2 , ele_type > & weight ,
+              value_type & result )
+  {
+    eval0 ( sum ( select * coefficients.stride() ) , weight , result ) ;
+  }
+
+  /// evaluation variant taking the components of a split coordinate, first the shape_type
+  /// representing the origin of the coefficient window to process, second the fractional parts
+  /// of the coordinate which are needed to calculate the weights to apply to the coefficient
+  /// window. Note that the weights aren't as many values as the window has coefficients, but
+  /// only ORDER weights per dimension; the 'final' weights would be the tensor product of the
+  /// sets of weights for each dimension, which we don't explicitly use here. Also note that
+  /// we limit the code here to have the same number of weights for each axis, which might be
+  /// changed to accept differently sized tensors.
+  
+  void eval ( const nd_ic_type& select ,
+              const nd_rc_type& tune ,
+              value_type & result )
+  {
+    // To calculate the weights we want efficient storage, being very much inner-loop here,
+    // but at the same time we want adequate packaging for the weights as well. So we obtain
+    // the memory by creating an adequately-sized C++ array right here (cheap, no new needed)
+    // and package it in a MultiArrayView, which is also cheap.
+
+    ele_type weight_data [ workspace_size ] ; // get space for the weights
+    MultiArrayView < 2 , ele_type >           // package them in a MultiArrayView
+     weight ( Shape2 ( ORDER , dimension ) ,  // with dimension sets of ORDER weights
+              weight_data ) ;                 // using the space claimed above
+    
+    // now we call obtain_weights, which will fill in 'weight' with weights for the
+    // given set of fractional coordinate parts in 'tune'
+
+    obtain_weights ( weight , tune ) ;
+    eval ( select , weight , result ) ;   // delegate
   }
 
-  /// alternative implementation of the final delegate. Here, we calculate the tensor
-  /// product of the component vectors in 'weight' and use flat_eval to obtain the
-  /// weighted sum over the coefficient window. Surprisingly enough, this seems to
-  /// be just as fast as the other variant.
-  /// TODO test more thoroughly, would be nice to use instead, with an additional
-  /// operator() taking self-same tensor product.
+  /// this variant of eval() takes a split coordinate:
+  
+  void eval ( const split_coordinate_type & sc , // presplit coordinate
+              value_type & result )
+  {
+    eval ( sc.select , sc.tune , result ) ;
+  }
+
+  /// this variant takes a coordinate which hasn't been mapped yet, so it uses
+  /// the nd_mapping, mmap, and applies it. It's the most convenient form but
+  /// also the slowest, due to the mapping, which is quite expensive computationally.
+  
+  void eval ( const nd_rc_type& c , // unsplit coordinate
+              value_type & result )
+  {
+    split_coordinate_type sc ;
+    mmap ( c , sc ) ;  /// apply the mappings
+    eval ( sc.select , sc.tune , result ) ;
+  }
+  
+  /// variant taking a plain rc_type for a coordinate. only for 1D splines!
+  /// This is to avoid hard-to-track errors which might ensue from allowing
+  /// broadcasting of single rc_types to nd_rc_types for D>1. We convert the
+  /// single coordinate (of rc_type) to an aggregate of 1 to fit the signature
+  /// of the nD code above
+  
+  void eval ( const rc_type& c , // single 1D coordinate
+              value_type & result )
+  {
+    static_assert ( dimension == 1 ,
+                    "evaluation at a single real coordinate is only allowed for 1D splines" ) ;
+    nd_rc_type cc ( c ) ;
+    eval ( cc , result ) ;
+  }
+
+  /// alternative implementation of the last part of evaluation. Here, we calculate the
+  /// tensor product of the component vectors in 'weight' and use flat_eval to obtain the
+  /// weighted sum over the coefficient window.
 
   /// evaluation using the tensor product of the weight component vectors. This is simple,
   /// no need for recursion here, it's simply the sum of all values in the window, weighted
   /// with their corresponding weights. note that there are no checks; if there aren't enough
-  /// weights th code may crash. Unused for now, code is explicitly inlined further down.
-  
-//   template < class dtype , class weight_type >
-//   dtype flat_eval ( const dtype* & pdata ,
-//                     const weight_type * const weight )
-//   {
-//     dtype result = dtype() ;
-//     for ( int i = 0 ; i < window_size ; i++ )
-//       result += weight[i] * pdata[offsets[i]] ;
-//     return result ;
-//   }
+  /// weights the code may crash. Note that this code is formulated as a template and can be
+  /// used both for vectorized and unvectorized operation.
+
+  template < class dtype , class weight_type >
+  void flat_eval ( const dtype * const & pdata ,
+                   const weight_type * const pweight ,
+                   value_type & result )
+  {
+    result = pweight[0] * pdata[offsets[0]] ;
+    for ( int i = 1 ; i < window_size ; i++ )
+      result += pweight[i] * pdata[offsets[i]] ;
+  }
 
   /// calculation of the tensor product of the component vectors of the weights:
 
@@ -938,102 +1017,25 @@ public:
     }
   } ;
 
-//   /// operator() variant which first calculates the final weights and then applies them
-//   /// using a simple summation loop. currently unused, since it is slower.
+  /// evaluation variant which first calculates the final weights and then applies them
+  /// using a simple summation loop. currently unused, since it is slower, but it can be
+  /// used instead of eval() with the same signature.
   
-//   // TODO: try buffering the window of coefficients as well - also on the stack.
-//   // then the final evaluation collapses to a loop like sum += *(a++) + *(b++)
-//   // this might be faster, and the coefficient fetching and the weight calculations
-//   // might even run in parallel since they are totally independent of each other.
-// 
-//   value_type alt_operator ( const ic_type& select ,
-//                             const MultiArrayView < 2 , ele_type > & weight )
-//   {
-//     const value_type * base = coefficients.data() + select ;
-//     ele_type flat_weight [ window_size ] ;
-//     ele_type * iter = flat_weight ;
-//     tensor_product < level , ele_type >() ( iter , weight , 1.0 ) ;
-//     value_type sum ;
-//     for ( int i = 0 ; i < window_size ; i++ )
-//       sum += flat_weight[i] * base[offsets[i]] ;
-//     return sum ;
-//   }
-
-  /// 'penultimate' delegate, taking a multidimensional index 'select' to the beginning of
-  /// the coefficient window to process. Here the multidimensional index is translated
-  /// into an offset from the coefficient array's base adress. Carrying the information as
-  /// a reference to a multidimensional index all the way to this point does no harm, it's
-  /// only a reference after all.
-
-  value_type operator() ( const nd_ic_type& select ,
-                          const MultiArrayView < 2 , ele_type > & weight )
+  void flat_eval ( const ic_type & select ,
+                   const MultiArrayView < 2 , ele_type > & weight ,
+                   value_type & result )
   {
-    // here's the choice of strategy TODO codify to use eiter option
-//     return alt_operator ( sum ( select * coefficients.stride() ) , weight ) ;
-    return operator() ( sum ( select * coefficients.stride() ) , weight ) ;
-  }
-
-  /// variant of operator() taking the components of a split coordinate, first the shape_type
-  /// representing the origin of the coefficient window to process, second the fractional parts
-  /// of the coordinate which are needed to calculate the weights to apply to the coefficient
-  /// window. Note that the weights aren't as many values as the window has coefficients, but
-  /// only ORDER weights per dimension; the 'final' weights would be the tensor product of the
-  /// sets of weights for each dimension, which we don't explicitly use here. Also note that
-  /// we limit the code here to have the same number of weights for each axis, which might be
-  /// changed to accept differently sized tensors.
-  
-  value_type operator() ( const nd_ic_type& select ,
-                          const nd_rc_type& tune )
-  {
-    // calculate the weights. We want to produce efficient storage for the weights, being
-    // very much inner-loop here, but at the same time we want adequate packaging for the
-    // weights as well. So we obtain the memory by creating an adequately-sized C++ array right
-    // here (cheap, no new needed) and package it in a MultiArrayView, which is also cheap.
-
-    ele_type weight_data [ workspace_size ] ; // get space for the weights
-    MultiArrayView < 2 , ele_type >           // package them in a MultiArrayView
-     weight ( Shape2 ( ORDER , dimension ) ,  // with dimension sets of ORDER weights
-              weight_data ) ;                 // using the space claimed above
-    obtain_weights ( weight , tune ) ;        // obtain the weights for given 'tune'
-    return operator() ( select , weight ) ;   // delegate to the final delegate (above)
-  }
-
-  /// TODO: I think the next few operator() overloads should go.
-  /// I'd like the notion of a 'split coordinate' as a separate entity to disappear
-  /// into a pair of parameters passed into class evaluator's operator().
-  /// I leave the code in for now bcause some of my examples use split_types
-  
-  /// this variant of operator() takes a split coordinate:
-  
-  value_type operator() ( const split_coordinate_type& sc )  // presplit coordinate
-  {
-    return operator() ( sc.select , sc.tune ) ;
-  }
-
-  /// this variant take a coordinate which hasn't been mapped yet, so it uses
-  /// the nd_mapping, mmap, and applies it. It's the most convenient form but
-  /// also the slowest, due to the mapping, which is quite expensive computationally.
-  /// TODO variant taking the mapping as a parameter?
-  
-  value_type operator() ( const nd_rc_type& c )      /// coordinate
-  {
-    split_coordinate_type sc ;
-    mmap ( c , sc ) ;  /// apply the mappings
-    return operator() ( sc.select , sc.tune ) ;
-  }
-  
-  /// variant taking a plain rc_type for a coordinate. only for 1D splines!
-  /// This is to avoid hard-to-track errors which might ensue from allowing
-  /// broadcasting of single rc_types to nd_rc_types for D>1. We convert the
-  /// single coordinate (of rc_type) to an aggregate of 1 to fit the signature
-  /// of the nD code above
-  
-  value_type operator() ( const rc_type& c )         /// coordinate
-  {
-    static_assert ( dimension == 1 ,
-                    "evaluation at a single real coordinate is only allowed for 1D splines" ) ;
-    nd_rc_type cc ( c ) ;
-    return operator() ( cc ) ;
+    // get a pointer to the coefficient window's beginning
+    const value_type * base = coefficients.data() + select ;
+    // prepare space for the multiplied-out weights
+    ele_type flat_weight [ window_size ] ;
+    // we need an instance of a pointer to these weights here, passing it in
+    // per reference to be manipulated by the code it's passed to
+    ele_type * iter = flat_weight ;
+    // now we multiply out the weights using tensor_product()
+    tensor_product < level , ele_type >() ( iter , weight , 1.0 ) ;
+    // finally we delegate to flat_eval above
+    flat_eval ( base , flat_weight , result ) ;
   }
 
 #ifdef USE_VC
@@ -1105,27 +1107,31 @@ public:
 
   // vectorized variants of the evaluation routines:
   
-  /// this is the vectorized version of the final delegate calling _v_eval. The first
-  /// argument is avector of offsets to windows of coefficients.
-  /// The second argument is a 2D array of vecorized weights.
+  /// this is the vectorized version of the final delegate, calling _v_eval. The first
+  /// argument is a vector of offsets to windows of coefficients, the vectorized equivalent
+  /// of the single offset in eval0, the unvectorized routine.
+  /// The second argument is a 2D array of vectorized weights, the third a reference to
+  /// the result.
   
-  mc_ele_v operator() ( const ic_v& select ,  // offsets to lower corners of the subarrays
-                        const MultiArrayView < 2 , ele_v > & weight ) // vectorized weights
+  void v_eval0 ( const ic_v& select ,  // offsets to lower corners of the subarrays
+                 const MultiArrayView < 2 , ele_v > & weight , // vectorized weights
+                 mc_ele_v & result )
   {
     // we need an instance of this iterator because it's passed into _v_eval by reference
     // and manipulated by the code there:
     
     offset_iterator ofs = component_offsets.begin() ;
     
-    // now we can call the recursive _v_eval routine
+    // now we can call the recursive _v_eval routine yielding the result
     
-    return _v_eval < mc_ele_v , level >() ( component_base , select , ofs , weight ) ;
+    result = _v_eval < mc_ele_v , level >() ( component_base , select , ofs , weight ) ;
   }
 
   /// in the penultimate delegate, we move from nD coordinates to offsets
 
-  mc_ele_v operator() ( const nd_ic_v& select ,  // lower corners of the subarrays
-                        const MultiArrayView < 2 , ele_v > & weight ) // vectorized weights
+  void v_eval ( const nd_ic_v& select ,  // lower corners of the subarrays
+                const MultiArrayView < 2 , ele_v > & weight , // vectorized weights
+                mc_ele_v & result )
   {
     // here we transform the incoming nD coordinates into offsets into the coefficient
     // array's memory.
@@ -1138,13 +1144,14 @@ public:
     
     // now we delegate to the final delegate
     
-    return operator() ( origin , weight ) ;
+    v_eval0 ( origin , weight , result ) ;
   }
 
   // again, pretty much like the unvectorized version, now using simdized data:
-  
-  mc_ele_v operator() ( const nd_ic_v& select ,  // lower corners of the subarrays
-                        const nd_rc_v& tune )    // fractional parts of the incoming coordinates
+
+  void v_eval ( const nd_ic_v& select ,  // lower corners of the subarrays
+                const nd_rc_v& tune ,    // fractional parts of the incoming coordinates
+                mc_ele_v & result )
   {
     // like in the unvectorized code, the 2D MultiArrayView to the weights is created
     // as a view to data on the stack (weight_data) which is lightweight and fast
@@ -1157,9 +1164,9 @@ public:
   
     obtain_weights ( weight , tune ) ;
     
-    // delegate to operator() above
+    // delegate
 
-    return operator() ( select , weight ) ;
+    v_eval ( select , weight , result ) ;
   }
 
   /// here we take the approach to require the calling function to present pointers to
@@ -1169,8 +1176,8 @@ public:
   /// in contiguous memory. But this is not always the case, for example when the
   /// data are strided. Anyway, it doesn't hurt to have the option.
   
-  void operator() ( const split_coordinate_type* const psc , // pointer to vsize presplit coordinates
-                    value_type* result )                     // pointer to vsize result values
+  void v_eval ( const split_coordinate_type* const psc , // pointer to vsize presplit coordinates
+                value_type * result )                    // pointer to vsize result values
   {
     nd_ic_v select ;
     nd_rc_v tune ;
@@ -1189,9 +1196,8 @@ public:
       }
     }
 
-    // delegate
-    
-    mc_ele_v v_result = operator() ( select , tune ) ;
+    mc_ele_v v_result ;
+    v_eval ( select , tune , v_result ) ;
 
     // and deposit it in the memory the caller provides
     for ( int ch = 0 ; ch < channels ; ch++ )
@@ -1203,9 +1209,9 @@ public:
   /// case where these aspects are stored in different containers. This type of access
   /// is more efficient, since we can use gather operations
   
-  void operator() ( const ic_type * pi ,  // pointer to integral parts of coordinates
-                    const rc_type * pr ,  // pointer to real parts fo coordinates
-                    value_type* result )  // pointer to vsize result values
+  void v_eval ( const ic_type * const pi ,  // pointer to integral parts of coordinates
+                const rc_type * const pr ,  // pointer to real parts of coordinates
+                value_type * result )  // pointer to vsize result values
   {
     nd_ic_v select ;
     nd_rc_v tune ;
@@ -1218,9 +1224,8 @@ public:
       tune[d].gather ( pr , nd_interleave[d] )  ;
     }
 
-    // delegate
-    
-    mc_ele_v v_result = operator() ( select , tune ) ;
+    mc_ele_v v_result ;
+    v_eval ( select , tune , v_result ) ;
 
     // and deposit it in the memory the caller provides
     
@@ -1228,14 +1233,14 @@ public:
       v_result[ch].scatter ( (ele_type*)result , mc_interleave[ch] ) ;
   }
 
-  /// This variant of operator() works directly on vector data (of unsplit coordinates)
+  /// This variant of v_eval() works directly on vector data (of unsplit coordinates)
   /// This burdens the calling code with (de)interleaving the data. But often the calling
   /// code performs a traversal of a large body of data and is therefore in a better position
   /// to perform the (de)interleaving e.g. by a gather/scatter operation, or already receives
   /// the data in simdized form.
   
-  void operator() ( const nd_rc_v & input ,  // number of dimensions * coordinate vectors
-                    mc_ele_v & result )      // number of channels * value vectors
+  void v_eval ( const nd_rc_v & input ,  // number of dimensions * coordinate vectors
+                mc_ele_v & result )      // number of channels * value vectors
   {
     nd_ic_v select ;
     nd_rc_v tune ;
@@ -1246,51 +1251,105 @@ public:
 
     // delegate
 
-    result = operator() ( select , tune ) ;
+    v_eval ( select , tune , result ) ;
   }
 
   /// This variant also operates on unsplit coordinates. Here again we require the calling function
   /// to pass pointers to vsize input and output values in contiguous memory. The data are
   /// (de)interleved in this routine, the actual evaluation is delegated to the variant working
-  /// on vectorized data.
+  /// on vectorized data. If dimension == 1, the coordinate data are obtained with a load instruction
+  /// instead of a gather, which should be faster (TODO check) - ditto for channels and storing.
+  // TODO might try v_eval variants taking rc_v& / ele_v& for 1D / 1 channel case instead of
+  // relying on the optimizer to shed the unnecessary TinyVector<T,1> container
   
-  void operator() ( const nd_rc_type* const pmc ,  // pointer to vsize muli_coordinates
-                    value_type* result )           // pointer to vsize result values
+  void v_eval ( const nd_rc_type * const pmc ,  // pointer to vsize multi_coordinates
+                value_type * result )           // pointer to vsize result values
   {
     nd_rc_v input ;
     mc_ele_v v_result ;
 
     // gather the incoming (interleaved) coordinates
+
+    if ( dimension == 1 )
+    {
+      input[0].load ( (const rc_type* const)pmc ) ;
+    }
+    else
+    {
+      for ( int d = 0 ; d < dimension ; d++ )
+        input[d].gather ( (const rc_type* const)pmc , nd_interleave[d] ) ;
+    }
+
+    // call v_eval() for vectorized data
     
-    for ( int d = 0 ; d < dimension ; d++ )
-      input[d].gather ( (const rc_type* const)pmc , nd_interleave[d] ) ;
+    v_eval ( input , v_result ) ;
+
+    // and deposit it in the memory the caller provides
+    
+    if ( channels == 1 )
+    {
+      v_result[0].store ( (ele_type*)result ) ;
+    }
+    else
+    {
+      for ( int ch = 0 ; ch < channels ; ch++ )
+        v_result[ch].scatter ( (ele_type*)result , mc_interleave[ch] ) ;
+    }
+  }
 
-    // call operator() for vectorized data
+  void v_eval ( const rc_type * const pmc ,  // pointer to vsize 1D coordinates
+                value_type * result )        // pointer to vsize result values
+  {
+    static_assert ( dimension == 1 ,
+                    "this v_eval variant is intended for 1D splines only" ) ;
+
+    nd_rc_v input ;
+    mc_ele_v v_result ;
+
+    // load the incoming 1D coordinates (contiguous, so a plain load suffices)
+    
+    input[0].load ( (const rc_type* const)pmc ) ;
+
+    // call v_eval() for vectorized data
     
-    operator() ( input , v_result ) ;
+    v_eval ( input , v_result ) ;
 
     // and deposit it in the memory the caller provides
     
-    for ( int ch = 0 ; ch < channels ; ch++ )
-      v_result[ch].scatter ( (ele_type*)result , mc_interleave[ch] ) ;
+    if ( channels == 1 )
+    {
+      v_result[0].store ( (ele_type*)result ) ;
+    }
+    else
+    {
+      for ( int ch = 0 ; ch < channels ; ch++ )
+        v_result[ch].scatter ( (ele_type*)result , mc_interleave[ch] ) ;
+    }
   }
 
   /// mixed form, where input is a vectorized coordinate
   /// and output goes to interleaved memory
 
-  void operator() ( const nd_rc_v & input ,  // number of dimensions * coordinate vectors
-                    value_type* result )     // pointer to vsize result values
+  void v_eval ( const nd_rc_v & input ,  // number of dimensions * coordinate vectors
+                value_type * result )    // pointer to vsize result values
   {
     mc_ele_v v_result ;
 
-    // call operator() for vectorized data
+    // call v_eval() for vectorized data
 
-    operator() ( input , v_result ) ;
+    v_eval ( input , v_result ) ;
 
     // and deposit result in the memory the caller provides
 
-    for ( int ch = 0 ; ch < channels ; ch++ )
-      v_result[ch].scatter ( (ele_type*)result , mc_interleave[ch] ) ;
+    if ( channels == 1 )
+    {
+      v_result[0].store ( (ele_type*)result ) ;
+    }
+    else
+    {
+      for ( int ch = 0 ; ch < channels ; ch++ )
+        v_result[ch].scatter ( (ele_type*)result , mc_interleave[ch] ) ;
+    }
   }
 
 #endif // USE_VC
diff --git a/example/impulse_response.cc b/example/impulse_response.cc
index 657ba2b..e41e356 100644
--- a/example/impulse_response.cc
+++ b/example/impulse_response.cc
@@ -47,8 +47,8 @@
 /// 
 /// double ir_5[] = {
 /// -0.0084918610197410 ,
-/// +0.0197221240122632 ,
-/// -0.0458040841912519 ,
+/// +0.0197222540252632 ,
+/// -0.0458040841925519 ,
 /// +0.1063780046433000 ,
 /// -0.2470419274022756 ,
 /// +0.5733258709616592 ,
@@ -58,8 +58,8 @@
 /// +0.5733258709616593 ,
 /// -0.2470419274022757 ,
 /// +0.1063780046433000 ,
-/// -0.0458040841912519 ,
-/// +0.0197221240122632 ,
+/// -0.0458040841925519 ,
+/// +0.0197222540252632 ,
 /// -0.0084918610197410 ,
 ///  } ;
 ///
@@ -72,6 +72,7 @@
 
 #include <iomanip>
 #include <vspline/vspline.h>
+#include <vigra/multi_math.hxx>
 #include <random>
 
 int main ( int argc , char * argv[] )
@@ -82,7 +83,7 @@ int main ( int argc , char * argv[] )
   
 // using the highest-level access to prefiltering, we code:
 
-  vspline::bspline < double , 1 > bsp ( 10001 , degree , vspline::MIRROR ) ; // , vspline::EXPLICIT ) ;
+  vspline::bspline < double , 1 > bsp ( 10001 , degree , vspline::PERIODIC ) ; // , vspline::EXPLICIT ) ;
   auto v1 = bsp.core ;
   v1 [ 5000 ] = 1.0 ;
   bsp.prefilter() ;
@@ -108,38 +109,71 @@ int main ( int argc , char * argv[] )
 //         0.00001 ) ;
 //   f.solve ( v1.begin() ) ;
         
-  std::cout << "double ir_" << degree << "[] = {" << std::endl ;
-  std::cout << std::fixed << std::showpos << std::showpoint << std::setprecision(16);
-  int k ;
-  for ( k = 0 ; k < 10000 ; k++ )
-  {
-    if ( fabs ( v1[k] ) > cutoff )
-      std::cout << v1[k] << " ," << std::endl ;
-  }
-  if ( fabs ( v1[k] ) > cutoff )
-    std::cout << v1[k] ;
-  std::cout << " } ;" << std::endl ;
+//   std::cout << "double ir_" << degree << "[] = {" << std::endl ;
+//   std::cout << std::fixed << std::showpos << std::showpoint << std::setprecision(16);
+//   int k ;
+//   for ( k = 0 ; k < 10000 ; k++ )
+//   {
+//     if ( fabs ( v1[k] ) > cutoff )
+//       std::cout << v1[k] << " ," << std::endl ;
+//   }
+//   if ( fabs ( v1[k] ) > cutoff )
+//     std::cout << v1[k] ;
+//   std::cout << " } ;" << std::endl ;
   
   typedef vspline::evaluator < 1 , double , double > eval_type ; // get the appropriate evaluator type 
   eval_type ev ( bsp ) ;                                   // create the evaluator
 //   std::cout << ev ( 5001.5 ) << std::endl ;
   
-  vigra::MultiArray < 1 , double > coords ( 12 ) ;
-  coords[5] = 5000.1 ;
-  vigra::MultiArray < 1 , double > target ( 12 ) ;
-  typedef vspline::split_type < 1 , int , double > splt ;
-  if ( degree & 1 )
-  {
-    for ( int i = 0 ; i < 12  ; i++ )
-      target[i] = ev ( coords[i] , vspline::odd_mapping_mirror < splt , 1 > ( 12 ) ) ;
-  }
-  else
-  {
-    for ( int i = 0 ; i < 12  ; i++ )
-      target[i] = ev ( coords[i] , vspline::even_mapping_mirror < splt , 1 > ( 12 ) ) ;
-  }
+  vigra::MultiArray < 1 , double > coords ( 25 ) ;
+  coords[2] = 5000.1 ;
+  vigra::MultiArray < 1 , double > target ( 25 ) ;
+  
+//   typedef vspline::split_type < 1 , int , double > splt ;
+//   if ( degree & 1 )
+//   {
+//     for ( int i = 0 ; i < 25  ; i++ )
+//       ev.eval ( coords[i] , target[i] ) ;
+//   }
+//   else
+//   {
+//     for ( int i = 0 ; i < 25  ; i++ )
+//       ev.eval ( coords[i] , target[i] ) ;
+//   }
     
-//   vspline::remap < double , double , 1 , 1 > ( bsp , coords , target ) ;
+//   typedef vigra::TinyVector < double , 1 > monad_t ;
+//   vigra::MultiArray < 1 , monad_t > mcoords ( 25 ) ;
+//   mcoords[5] = { 5000.1 } ;
+//   vigra::MultiArray < 1 , monad_t > mtarget ( 25 ) ;
+  vspline::remap < double , double , 1 , 1 > ( bsp , coords , target ) ;
   for ( auto v : target )
     std::cout << v << std::endl ;
+  
+  typedef vector_traits < double > :: type dvs_t ;
+  typedef aggregate_traits < dvs_t , 1 > :: type dv_t ;
+  
+  dv_t dv ;
+  dv[3] = 5000.1 ;
+  std::cout << dv << std::endl ;
+  
+  dv_t rv ;
+  ev.v_eval ( dv , rv ) ;
+  std::cout << rv << std::endl ;
+  
+  ev.v_eval ( (double*)&(dv) , (double*)&(rv) ) ;
+//   ev.v_eval ( (vigra::TinyVector<double,1>*)&(dv) , (double*)&(rv) ) ;
+  
+//   using namespace vigra::multi_math ;
+//   vigra::TinyVector < double , 12 > xx ;
+//   xx[3] = 1.0 ;
+//   std::cout << sqrt ( xx ) << std::endl ;
+//   pseudo_vector < double , 3 > tv3 ;
+//   pseudo_vector < double , 3 > zero ;
+//   {
+//     auto less = ( tv3 < zero ) ;
+//     for ( int i = 0 ; i < 3 ; i++ )
+//       std::cout << less[i] << std::endl ;
+//   }
+  
+  
 }
\ No newline at end of file
diff --git a/example/roundtrip.cc b/example/roundtrip.cc
index 7bc63d3..86052c9 100644
--- a/example/roundtrip.cc
+++ b/example/roundtrip.cc
@@ -105,7 +105,7 @@ void roundtrip ( view_type & data ,
                  vspline::bc_code bc ,
                  int DEGREE ,
                  bool use_vc ,
-                 int TIMES = 10 )
+                 int TIMES = 16 )
 {
   typedef typename view_type::value_type pixel_type ;
   typedef typename view_type::difference_type Shape;
@@ -251,7 +251,7 @@ void roundtrip ( view_type & data ,
   
 #ifdef PRINT_ELAPSED
   end = std::chrono::system_clock::now();
-  cout << "avg " << TIMES << " x remap1 from pre-split coordinates in warper object: "
+  cout << "avg " << TIMES << " x remap1 using warper object:        "
        << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
        << " ms" << endl ;
 #endif
diff --git a/filter.h b/filter.h
index b08ef3c..94dac75 100644
--- a/filter.h
+++ b/filter.h
@@ -952,6 +952,7 @@ int nonaggregating_filter ( source_type &source ,
 
 #ifdef USE_VC
 
+/* currently unused
 // test if an index vector contains sequential indices
 
 template < typename IT >
@@ -959,6 +960,7 @@ bool sequential ( const IT & indexes )
 {
   return Vc::all_of ( indexes - indexes[0] == IT::IndexesFromZero() ) ;
 }
+*/
 
 /// extended gather and scatter routines taking 'extrusion parameters'
 /// which handle how many times and with which stride the gather/scatter
@@ -1109,16 +1111,16 @@ int ele_aggregating_filter ( source_type &source ,
   typedef typename source_type::value_type ele_type ;    ///< elementary type in source
 
   // Initially I used a plain Vc::Vector as the simdized type, but I found that using a SimdArray
-  // of twice that size performed slightly better. Her's the one place where this type can be fixed,
+  // of twice that size performed slightly better. Here's the one place where this type can be fixed,
   // it's choice 'trickles down' to the remainder of the code, so experimentation is cheap. vsize has
   // to be a multiple of the genuine vector's size, though, because in gather/scatter above we explicitly
   // use aligned load/stores to the buffer. If we use plain load/stores to the buffer there, other
   // vsizes can be used, but performance suffers - we want to use the fastest ops possible after all.
   // TODO: This should be tested with different systems
   
-//   typedef Vc::Vector < ele_type > simdized_type ;        ///< vectorized type, use plain vector
-
-  typedef typename vector_traits < ele_type > :: simdized_type simdized_type ;
+  // we now have a traits class fixing the simdized type (in common.h):
+  
+  typedef typename vector_traits < ele_type > :: type simdized_type ;
 
   const int vsize ( simdized_type::Size ) ;              ///< number of ele_type in a simdized_type
   typedef typename simdized_type::IndexType index_type ; ///< indexes for gather/scatter
@@ -1424,16 +1426,15 @@ public:
   
 #ifdef USE_VC
  
-  // TODO: this is duplicate with the definition in ele_aggregating_filter
-  typedef Vc::SimdArray < ele_type ,
-                          2 * Vc::Vector<ele_type>::Size > simdized_type ; ///< vectorized type
-
+//   typedef vector_traits < ele_type > simdized_type ;
+  const int vsize = vector_traits < ele_type > :: vsize ;
+  
   // if we can use vector code, the number of lanes is multiplied by the
   // number of elements a simdized type inside the vector code can handle
 
   if ( use_vc )
   {
-    lanes *= simdized_type::Size ;
+    lanes *= vsize ;
   }
 
 #endif
@@ -1682,7 +1683,7 @@ template < typename input_array_type ,      // type of array with knot point dat
            typename output_array_type >     // type of array for coefficients (may be the same)
 void filter_nd ( input_array_type & input ,
                  output_array_type & output ,
-                 vigra::TinyVector<bc_code,input_array_type::actual_dimension> bc ,
+                 typename aggregate_traits < bc_code , input_array_type::actual_dimension > :: type bc ,
                  int nbpoles ,
                  const double * pole ,
                  double tolerance ,
diff --git a/mapping.h b/mapping.h
index 1c9da36..49fb480 100644
--- a/mapping.h
+++ b/mapping.h
@@ -116,52 +116,8 @@ namespace vspline {
 using namespace std ;
 using namespace vigra ;
 
-// typedef int default_ic_type ;
-// typedef float default_rc_type ;
-
-// type naming scheme:
-// nd_ : multidimensional
-// mc_ : multichannel
-// ic  : integral coordinate/integral component
-// rc  : real coordinate/real component
-// value : 'payload' type, like pixel
-// ele : elementary type of val
-// _v: vecorized type, single Vc:Vector or SimdArray, or classure thereof
-
-/// nd_ic_type is simply a TinyVector of integral numbers. These are the
-/// integral parts of the incoming real coordinates, which are used to determine the
-/// location of the window into the coefficient array which will be used to produce the
-/// evaluation result for the incoming coordinates.
-
-template < int N ,
-           typename IT >
-struct nd_ic_type
-:public TinyVector < IT , N >
-{
-  typedef IT ic_type ;
-  typedef IT value_type ;
-  enum { dimension = N } ;
-} ;
-
-/// nd_rc_type is a TinyVector of real numbers in the range of [0.0 - 1.0],
-/// in the case of odd splines, or [-0.5 - 0.5] in the case of even splines. They constitute
-/// the fractional part of a real coordinate left over when the integral part is taken away.
-/// They are the 'delta' which is fed into the weight-generating functors producing the
-/// weights of the weighted summation of the coefficients in the window which is defined
-/// by the integral parts of the coordinates.
-
-template < int N ,
-           typename FT >
-struct nd_rc_type
-:public TinyVector < FT , N >
-{
-  typedef FT rc_type ;
-  typedef FT value_type ;
-  enum { dimension = N } ;
-} ;
-
 /// class split_type contains n-dimensional 'split coordinates', consisting of the
-/// integral and fracional part of the original real coordinates, separated so that
+/// integral and fractional parts of the original real coordinates, separated so that
 /// they can be processed by the evaluation routine.
 
 template < int N ,
@@ -173,8 +129,10 @@ struct split_type
   typedef FT rc_type ;
   typedef FT value_type ;
   enum { dimension = N } ;
-  nd_ic_type<N,IT> select ; ///< named select because it selects the range of coefficients
-  nd_rc_type<N,FT> tune ;   ///< named tune because it is the base for the weights 
+  typedef typename aggregate_traits < IT , N > :: type select_type ;
+  select_type select ; ///< named select because it selects the range of coefficients
+  typedef typename aggregate_traits < FT , N > :: type tune_type ;
+  tune_type tune ; ///< named tune because it is the base for the weights 
 } ;
 
 #ifdef USE_VC
@@ -190,7 +148,7 @@ rc_v v_modf ( const rc_v& source ,
   typedef typename rc_v::mask_type mask_type ;
   typedef typename rc_v::EntryType single_type ;
   
-  mask_type negative = Vc::isnegative ( source ) ;
+  mask_type negative ( source < single_type(0) ) ; // Vc::isnegative ( source ) ;
   rc_v help ( source ) ;
   
   // we treat the case that any of the incoming vector components is negative separately
@@ -200,7 +158,7 @@ rc_v v_modf ( const rc_v& source ,
   if ( any_of ( negative ) )
   {
     help(negative) = -source ;
-    (*iptr) = Vc::floor ( help ) ;
+    (*iptr) = floor ( help ) ;
     help -= (*iptr) ;
     (*iptr)(negative) = -(*iptr) ;
     help(negative) = -help ;
@@ -209,7 +167,7 @@ rc_v v_modf ( const rc_v& source ,
   else
   {
     // for all positive components, the operation is trivial: 
-    (*iptr) = Vc::floor ( source ) ;
+    (*iptr) = floor ( source ) ;
     return ( help - *iptr ) ;
   }
 }
@@ -221,7 +179,8 @@ template <typename rc_v>
 rc_v v_fmod ( const rc_v& lhs ,
               const typename rc_v::EntryType rhs )
 {
-  typedef Vc::SimdArray < int , rc_v::Size > ic_v ;
+//   typedef Vc::SimdArray < int , rc_v::Size > ic_v ;
+  typedef typename vector_traits < int , rc_v::Size > :: type ic_v ;
   
   ic_v iv ( lhs / rhs ) ;
   return lhs - rhs * rc_v ( iv ) ;
@@ -288,8 +247,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -320,8 +279,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -375,8 +334,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -420,8 +379,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -474,8 +433,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -532,8 +491,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -615,8 +574,8 @@ public:
 
   // vectorized version of operator()
   
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -686,8 +645,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -753,8 +712,8 @@ public:
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -802,8 +761,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -874,8 +833,8 @@ public:
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -923,8 +882,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -991,8 +950,8 @@ public:
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -1063,8 +1022,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -1144,9 +1103,7 @@ map_func pick_mapping ( bc_code bc , int spline_degree , int M )
       }
       default:
       {
-        // TODO: throw exception instead?
-        cerr << "mapping for BC code " << bc_name[bc] << " is not supported" << endl ;
-        return odd_mapping_reject < ic_t , rc_t , vsize > ( M ) ;
+        throw not_supported ( "mapping for this BC code is not supported" ) ;
         break ;
       }
     }
@@ -1194,9 +1151,7 @@ map_func pick_mapping ( bc_code bc , int spline_degree , int M )
       }
       default:
       {
-        // TODO: throw exception instead?
-        cerr << "mapping for BC code " << bc_name[bc] << " is not supported" << endl ;
-        return even_mapping_reject < ic_t , rc_t , vsize > ( M ) ;
+        throw not_supported ( "mapping for this BC code is not supported" ) ;
         break ;
       }
     }
@@ -1214,8 +1169,8 @@ struct mapping
   map_func map ;
 
 #ifdef USE_VC
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
   typedef std::function < void ( rc_v , ic_v & , rc_v & ) > map_func_v ;
   map_func_v map_v ;
 #endif
@@ -1245,21 +1200,23 @@ template < typename ic_t , typename rc_t , int dimension , int vsize = 1 >
 struct nd_mapping
 {
   typedef mapping < ic_t , rc_t , vsize > mapping_type ;
-  typedef TinyVector < bc_code , dimension > bcv_type ;
-  typedef TinyVector < rc_t , dimension > nd_rc_t ;
-  typedef TinyVector < ic_t , dimension > nd_ic_t ;
+  typedef typename aggregate_traits < mapping_type , dimension > :: type nd_mapping_type ;
+
+  typedef typename aggregate_traits < bc_code , dimension > :: type bcv_type ;
+  typedef typename aggregate_traits < rc_t , dimension > :: type nd_rc_t ;
+  typedef typename aggregate_traits < ic_t , dimension > :: type nd_ic_t ;
   typedef split_type < dimension , ic_t , rc_t > split_t ;
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
-  typedef TinyVector < ic_v , dimension > nd_ic_v ;
-  typedef TinyVector < rc_v , dimension > nd_rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+  typedef typename aggregate_traits < ic_v , dimension > :: type nd_ic_v ;
+  typedef typename aggregate_traits < rc_v , dimension > :: type nd_rc_v ;
 
 #endif
 
-  TinyVector < mapping_type , dimension > map ; // container for the mappings
+  nd_mapping_type map ; // container for the mappings
   bcv_type bcv ;
   int spline_degree ;
   nd_ic_t shape ;
@@ -1282,14 +1239,14 @@ struct nd_mapping
   nd_mapping()
   { } ;
   
-  /// convenience variant taking a single boundary condition code, which is used for all axes
-  
-  nd_mapping ( const bc_code&    bc ,
-               const int&        _spline_degree ,
-               const nd_ic_t&    _shape  )
-  : nd_mapping ( bcv_type ( bc ) , spline_degree , _shape )
-  {
-  } ;
+//   /// convenience variant taking a single boundary condition code, which is used for all axes
+//   
+//   nd_mapping ( const bc_code&    bc ,
+//                const int&        _spline_degree ,
+//                const nd_ic_t&    _shape  )
+//   : nd_mapping ( bcv_type ( bc ) , spline_degree , _shape )
+//   {
+//   } ;
   
   /// apply the mapping along axis 'axis' to coordinate v, resulting in the setting
   /// of the integral part iv and the fraczional part fv
@@ -1365,8 +1322,8 @@ struct nd_mapping
 //       m.map ( v , i , f ) ;
 //       std::cout << "in: " << v << " -> i " << i << " f " << f << std::endl ;
 // #ifdef USE_VC
-//       typedef typename vector_traits < int , vsize > :: simdized_type ic_v ;
-//       typedef typename vector_traits < float , vsize > :: simdized_type rc_v ;
+//       typedef typename vector_traits < int , vsize > :: type ic_v ;
+//       typedef typename vector_traits < float , vsize > :: type rc_v ;
 //       rc_v vv ( v ) ;
 //       ic_v iv ;
 //       rc_v fv ;
diff --git a/prefilter.h b/prefilter.h
index 0b06b47..ed6179a 100644
--- a/prefilter.h
+++ b/prefilter.h
@@ -158,7 +158,7 @@ template < typename input_array_type ,      // type of array with knot point dat
            typename output_array_type >     // type of array for coefficients (may be the same)
 void solve_vigra ( input_array_type & input ,
                    output_array_type & output ,
-                   TinyVector<bc_code,input_array_type::actual_dimension> bcv ,
+                   typename aggregate_traits < bc_code , input_array_type::actual_dimension > :: type bcv ,
                    int degree ,
                    double tolerance ,
                    int nslices = ncores )
@@ -187,7 +187,7 @@ void solve_vigra ( input_array_type & input ,
 {
   filter_nd ( input ,
               output ,
-              TinyVector<bc_code,input_array_type::actual_dimension> ( bc ) ,
+              typename aggregate_traits < bc_code , input_array_type::actual_dimension > :: type ( bc ) ,
               degree / 2 ,
               precomputed_poles [ degree ] ,
               tolerance ,
@@ -201,7 +201,7 @@ template < typename input_array_type ,      // type of array with knot point dat
            typename output_array_type >     // type of array for coefficients (may be the same)
 void solve_vc ( input_array_type & input ,
                 output_array_type & output ,
-                TinyVector<bc_code,input_array_type::actual_dimension> bcv ,
+                typename aggregate_traits < bc_code , input_array_type::actual_dimension > :: type bcv ,
                 int degree ,
                 double tolerance ,
                 int nslices = ncores )
@@ -230,7 +230,7 @@ void solve_vc ( input_array_type & input ,
 {
   filter_nd ( input ,
               output ,
-              TinyVector<bc_code,input_array_type::actual_dimension> ( bc ) ,
+              typename aggregate_traits < bc_code , input_array_type::actual_dimension > :: type ( bc ) ,
               degree / 2 ,
               precomputed_poles [ degree ] ,
               tolerance ,
diff --git a/remap.h b/remap.h
index f9acdd1..9f9ee37 100644
--- a/remap.h
+++ b/remap.h
@@ -109,11 +109,19 @@ struct coordinate_traits
 /// pick the TinyVector's value_type as our rc_type
 
 template < typename T , int N >
-struct coordinate_traits < TinyVector < T , N > >
+struct coordinate_traits < vigra::TinyVector < T , N > >
 {
   typedef typename vigra::TinyVector < T , N >::value_type rc_type ;
 } ;
 
+// TODO: I'd like to write it like this, but it won't compile:
+
+// template < typename T , int N >
+// struct coordinate_traits < typename aggregate_traits < T , N > :: type >
+// {
+//   typedef typename aggregate_traits < T , N > :: type::value_type rc_type ;
+// } ;
+
 /// since remap can also operate with pre-split coordinates, we need another partial
 /// specialization of coordinate_traits for split_type, which also defines a value_type
 
@@ -154,8 +162,6 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
 
 #endif
   
-  // TODO: we blindly assume that we can coiterate in scan order here:
-  
   auto source_it = warp.begin() ;
   auto target_it = output.begin() ;
   
@@ -175,7 +181,7 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
   {
     int aggregates = warp.elementCount() / vsize ;            // number of full vectors
     leftover = warp.elementCount() - aggregates * vsize ;     // any leftover single values
-    coordinate_type * source = warp.data() ;                      // acces to memory
+    coordinate_type * source = warp.data() ;                  // access to memory
     value_type * destination = output.data() ;
 
     if ( warp.isUnstrided() )
@@ -186,7 +192,7 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
         // best case: output array has consecutive memory, no need to buffer
         for ( int a = 0 ; a < aggregates ; a++ , source += vsize , destination += vsize )
         {
-          ev ( source , destination ) ;
+          ev.v_eval ( source , destination ) ;
         }
         target_it += aggregates * vsize ;
       }
@@ -198,7 +204,7 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
         value_type target_buffer [ vsize ] ;
         for ( int a = 0 ; a < aggregates ; a++ , source += vsize )
         {
-          ev ( source , target_buffer ) ;
+          ev.v_eval ( source , target_buffer ) ;
           for ( int e = 0 ; e < vsize ; e++ )
           {
             *target_it = target_buffer[e] ;
@@ -222,7 +228,7 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
             source_buffer[e] = *source_it ;
             ++source_it ;
           }
-          ev ( source_buffer , destination ) ;
+          ev.v_eval ( source_buffer , destination ) ;
         }
         target_it += aggregates * vsize ;
       }
@@ -237,7 +243,7 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
             source_buffer[e] = *source_it ;
             ++source_it ;
           }
-          ev ( source_buffer , target_buffer ) ;
+          ev.v_eval ( source_buffer , target_buffer ) ;
           for ( int e = 0 ; e < vsize ; e++ )
           {
             *target_it = target_buffer[e] ;
@@ -254,7 +260,7 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
   while ( leftover-- )
   {
     // process leftovers with single-value evaluation
-    *target_it = ev ( *source_it ) ;
+    ev.eval ( *source_it , *target_it ) ;
     ++source_it ;
     ++target_it ;
   }
@@ -307,7 +313,7 @@ int remap ( const bspline < value_type , dim_in > & bspl ,
 /// one-shot remaps where the spline isn't reused.
 
 template < int dimension >
-using bcv_type = TinyVector < bc_code , dimension > ;
+using bcv_type = typename aggregate_traits < bc_code , dimension > :: type ;
 
 template < class value_type ,      // like, float, double, complex<>, pixels, TinyVectors etc.
            class coordinate_type , // usually TinyVector of float for coordinates
@@ -358,8 +364,8 @@ class warper
 {
 public:
   
-  typedef vigra::TinyVector < ic_type , coordinate_dimension > nd_ic_type ;
-  typedef vigra::TinyVector < rc_type , coordinate_dimension > nd_rc_type ;
+  typedef typename aggregate_traits < ic_type , coordinate_dimension > :: type nd_ic_type ;
+  typedef typename aggregate_traits < rc_type , coordinate_dimension > :: type nd_rc_type ;
 
   vigra::MultiArray < dimension , nd_ic_type > _select ;
   vigra::MultiArray < dimension , nd_rc_type > _tune ;
@@ -451,7 +457,7 @@ int st_wremap ( const bspline < value_type , dim_in > & bspl ,
            a < aggregates ;
            a++ , pselect += vsize , ptune += vsize , destination += vsize )
       {
-        ev ( (ic_type*) pselect , (rc_type*) ptune , destination ) ;
+        ev.v_eval ( (ic_type*) pselect , (rc_type*) ptune , destination ) ;
       }
       target_it += aggregates * vsize ;
     }
@@ -463,7 +469,7 @@ int st_wremap ( const bspline < value_type , dim_in > & bspl ,
       value_type target_buffer [ vsize ] ;
       for ( int a = 0 ; a < aggregates ; a++ , pselect += vsize , ptune += vsize )
       {
-        ev ( (ic_type*) pselect , (rc_type*) ptune , target_buffer ) ;
+        ev.v_eval ( (ic_type*) pselect , (rc_type*) ptune , target_buffer ) ;
         for ( int e = 0 ; e < vsize ; e++ )
         {
           *target_it = target_buffer[e] ;
@@ -481,7 +487,7 @@ int st_wremap ( const bspline < value_type , dim_in > & bspl ,
   while ( leftover-- )
   {
     // process leftovers with single-value evaluation
-    *target_it = ev ( *select_it , *tune_it ) ;
+    ev.eval ( *select_it , *tune_it , *target_it ) ;
     ++select_it ;
     ++tune_it ;
     ++target_it ;
@@ -490,6 +496,8 @@ int st_wremap ( const bspline < value_type , dim_in > & bspl ,
   return 0 ;
 }
 
+/// overload of remap() using presplit coordinates packaged in a warper object
+
 template < class value_type ,      // like, float, double, complex<>, pixels, TinyVectors etc.
            class ic_type ,         // integral coordinate part
            class rc_type ,         // real coordinate part
@@ -543,8 +551,8 @@ template < class rc_type ,       // elementary type for coordinates, usually flo
            int dim_in ,          // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >         // number of dimensions of output array
 using transform
-= std::function < void ( const TinyVector < rc_type , dim_out > & ,
-                               TinyVector < rc_type , dim_in > & ) > ;
+= std::function < void ( const typename aggregate_traits < rc_type , dim_out > :: type & ,
+                               typename aggregate_traits < rc_type , dim_in > :: type & ) > ;
 
 #ifdef USE_VC
 
@@ -557,7 +565,7 @@ using rc_v
 = typename
    vector_traits < rc_type ,
                    vector_traits < typename ExpandElementResult < value_type > :: type > :: vsize
-                 > :: simdized_type ;
+                 > :: type ;
                  
 /// This type alias defines a vectorized coordinate transformation.
 /// it is the equivalent of the unvectorized type above, taking TinyVectors
@@ -568,8 +576,8 @@ template < class rc_v ,            // simdized type of vsize real coordinates
            int dim_out >           // number of dimensions of output array
 using vtransform
 = std::function < void
-                   ( const TinyVector < rc_v , dim_out > & ,
-                     TinyVector < rc_v , dim_in > & ) > ;
+                   ( const typename aggregate_traits < rc_v , dim_out > :: type & ,
+                           typename aggregate_traits < rc_v , dim_in > :: type & ) > ;
 
 #endif
 
@@ -616,16 +624,16 @@ class transformation
 
   transform_type tf ;   /// functor for single-value coordinate transformation
 
-  typedef TinyVector < rc_type , dim_in > rc_in_type ;
-  typedef TinyVector < rc_type , dim_out > rc_out_type ;
+  typedef typename aggregate_traits < rc_type , dim_in > :: type rc_in_type ;
+  typedef typename aggregate_traits < rc_type , dim_out > :: type rc_out_type ;
   
 #ifdef USE_VC
 
   typedef rc_v < rc_type , value_type > rc_v_type ;
   const int vsize = rc_v_type::Size ;
   
-  typedef TinyVector < rc_v_type , dim_in > rc_v_in_type ;
-  typedef TinyVector < rc_v_type , dim_out > rc_v_out_type ;
+  typedef typename aggregate_traits < rc_v_type , dim_in > :: type rc_v_in_type ;
+  typedef typename aggregate_traits < rc_v_type , dim_out > :: type rc_v_out_type ;
 
   typedef vtransform < rc_v_type , dim_in , dim_out > vtransform_type ;
   
@@ -731,11 +739,11 @@ template < class _rc_v , int _dimension >
 struct coordinate_iterator_v
 {
   enum { dimension = _dimension } ;
-  typedef vigra::TinyVector < int , dimension > shape_type ;
+  typedef typename aggregate_traits < int , dimension > :: type shape_type ;
 
   typedef _rc_v rc_v ;
   enum { vsize = rc_v::Size } ;
-  typedef vigra::TinyVector < rc_v , dimension > nd_rc_v ; // vectorized n-dimensional coordinate
+  typedef typename aggregate_traits < rc_v , dimension > :: type nd_rc_v ; // vectorized n-dimensional coordinate
 
   shape_type start ;
   shape_type end ;
@@ -779,7 +787,8 @@ public:
       c[0] ( mask ) -= shape[0] ; // fold back to range
       for ( int d = 1 ; d < dimension ; d++ )
       {
-        c[d] ( mask ) ++ ; // increase next-higher dimension's index
+//         c[d] ( mask ) ++ ; // increase next-higher dimension's index
+        c[d] ( mask ) += 1 ; // increase next-higher dimension's index
         mask = ( c[d] >= end[d] ) ; // check for range overflow
         // resultant mask is either empty or the same as before
         if ( none_of ( mask ) )         // if no overflow, we're done
@@ -835,8 +844,11 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
   typedef typename evaluator_type::mc_ele_v mc_ele_v ;
   const int vsize = evaluator_type::vsize ;
 
-  TinyVector < rc_v , dim_out > c_in ; // incoming coordinates (into output, which has dim_out dims)
-  TinyVector < rc_v , dim_in > c_out ; // transformed coordinates (into input, which has dim_in dims)
+  // incoming coordinates (into output, which has dim_out dims)
+  typename aggregate_traits < rc_v , dim_out > :: type c_in ;
+
+  // transformed coordinates (into input, which has dim_in dims)
+  typename aggregate_traits < rc_v , dim_in > :: type c_out ;
   mc_ele_v result ;                    // result, as struct of vectors
   
 #endif
@@ -863,13 +875,13 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
       if ( output.isUnstrided() )
       {
         // finally, here we evaluate the spline
-        ev ( c_out , destination ) ;
+        ev.v_eval ( c_out , destination ) ;
         destination += vsize ;
       }
       else
       {
         // alternative evaluation if we can't write straight to destination 
-        ev ( c_out , target_buffer ) ;
+        ev.v_eval ( c_out , target_buffer ) ;
         for ( int e = 0 ; e < vsize ; e++ )
         {
           target_it.get<1>() = target_buffer[e] ;
@@ -883,8 +895,8 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
   
 #endif // USE_VC
 
-  TinyVector < rc_type , dim_out > cs_in ;
-  TinyVector < rc_type , dim_in > cs_out ;
+  typename aggregate_traits < rc_type , dim_out > :: type cs_in ;
+  typename aggregate_traits < rc_type , dim_in > :: type cs_out ;
   
   // process leftovers, if any - if vc isn't used, this loop does all the processing
   while ( leftover-- )
@@ -892,7 +904,7 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
     // process leftovers with single-value evaluation of transformed coordinate
     cs_in = target_it.get<0>() + offset ;
     tf ( cs_in , cs_out ) ;
-    target_it.get<1>() = ev ( cs_out ) ;
+    ev.eval ( cs_out , target_it.get<1>() ) ;
     ++target_it ;
   }
 
@@ -982,7 +994,8 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            int dim_out >           // number of dimensions of output array
 int st_make_warp_array ( transformation < value_type , rc_type , dim_in , dim_out > tf ,
                          MultiArrayView < dim_out ,
-                                          TinyVector < rc_type , dim_in > > & warp_array ,
+                                          typename aggregate_traits < rc_type , dim_in > :: type >
+                           & warp_array ,
                          typename MultiArrayView < dim_out , value_type > :: difference_type offset ,
                          bool use_vc = true
                        )
@@ -991,12 +1004,12 @@ int st_make_warp_array ( transformation < value_type , rc_type , dim_in , dim_ou
   /// generated by vigra's CoupledIterator or, in the vectorized code,
   /// by coordinate_iterator_v
 
-  typedef TinyVector < rc_type , dim_out > nd_rc_in ;
+  typedef typename aggregate_traits < rc_type , dim_out > :: type nd_rc_in ;
 
   /// transformed coordinates (into input, which has dim_in dims)
   /// populating the resulting warp array
 
-  typedef TinyVector < rc_type , dim_in > nd_rc_out ;
+  typedef typename aggregate_traits < rc_type , dim_in > :: type nd_rc_out ;
 
   nd_rc_in cs_in ;
   nd_rc_out cs_out ;
@@ -1007,15 +1020,15 @@ int st_make_warp_array ( transformation < value_type , rc_type , dim_in , dim_ou
 #ifdef USE_VC
 
   typedef typename ExpandElementResult<value_type>::type ele_type ;
-  typedef typename vector_traits < ele_type > :: simdized_type ele_v ;
+  typedef typename vector_traits < ele_type > :: type ele_v ;
   const int vsize = ele_v::Size ;
-  typedef typename vector_traits < rc_type , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < rc_type , vsize > :: type rc_v ;
 
   /// vecorized incoming coordinates (into output, which has dim_out dims)
-  typedef TinyVector < rc_v , dim_out > nd_rc_in_v ;
+  typedef typename aggregate_traits < rc_v , dim_out > :: type nd_rc_in_v ;
 
   /// vectorized transformed coordinates (into input, which has dim_in dims)
-  typedef TinyVector < rc_v , dim_in > nd_rc_out_v ;
+  typedef typename aggregate_traits < rc_v , dim_in > :: type nd_rc_out_v ;
 
   if ( use_vc )
   {
@@ -1045,7 +1058,7 @@ int st_make_warp_array ( transformation < value_type , rc_type , dim_in , dim_ou
   // process leftovers, if any - if vc isn't used, this loop does all the processing
   while ( leftover-- )
   {
-    // process leftovers with single-value evaluation of transformed coordinate
+    // process leftovers
     cs_in = target_it.get<0>() + offset ;
     tf ( cs_in , cs_out ) ;
     target_it.get<1>() = cs_out ;
@@ -1063,7 +1076,8 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            int dim_out >           // number of dimensions of output array
 int make_warp_array ( transformation < value_type , rc_type , dim_in , dim_out > tf ,
                       MultiArrayView < dim_out ,
-                                       TinyVector < rc_type , dim_in > > warp_array ,
+                                       typename aggregate_traits < rc_type , dim_in > :: type >
+                        warp_array ,
                       bool use_vc = true ,
                       int nthreads = ncores
                     )

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git



More information about the debian-science-commits mailing list