[vspline] 07/72: moved allocation of workspace to evaluator, moved vector type fixing to traits class with the vector type now fixed in a traits class, I managed to globally change the simdized type to a SimdArray with twice the Vc::Vector size, which resulted in ca. 15% overall eval performance gain. And I moved the allocation of the 'workspace' into class evaluator, which makes it more like the functor I have in mind when I rework the remap code.

Sun Jul 2 09:02:37 UTC 2017

This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit 12f848538e1a00c8e887231c8a92df33011bf574
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Tue Nov 1 12:00:59 2016 +0100

    moved allocation of workspace to evaluator, moved vector type fixing to traits class
    with the vector type now fixed in a traits class, I managed to globally change
    the simdized type to a SimdArray with twice the Vc::Vector size, which resulted in
    ca. 15% overall eval performance gain. And I moved the allocation of the 'workspace'
    into class evaluator, which makes it more like the functor I have in mind when I
    rework the remap code.
---
 bspline.h                   |  43 ++++++++++--
 common.h                    |  49 +++++++++++++-
 doxy.h                      |  12 +++-
 eval.h                      | 161 ++++++++++++++++++++++++++------------------
 example/impulse_response.cc |  90 ++++++++++++++++++++-----
 example/roundtrip.cc        |   3 +-
 filter.h                    |  22 +++---
 mapping.h                   | 153 +++++++++++++++++++++--------------------
 prefilter.h                 |  52 ++++++++++++--
 remap.h                     |  85 ++++++++++++++++-------
 vspline.doxy                |   2 +-
 11 files changed, 464 insertions(+), 208 deletions(-)

diff --git a/bspline.h b/bspline.h
index 301c0c1..97f5a6a 100644
--- a/bspline.h
+++ b/bspline.h
@@ -296,22 +296,32 @@ public:
   // multiple of the Vc:Vector's Size for the given data type to have better-aligned
   // access. This may or may not help, has to be tested. We might also want to position
   // the origin of the brace to an aligned position, since evaluation is based there.
-  
+
   bspline ( shape_type _core_shape ,  ///< shape of knot point data
             int _spline_degree = 3 , ///< spline degree with reasonable default
             bcv_type _bcv = bcv_type ( MIRROR ) ,   ///< boundary conditions and common default
             prefilter_strategy _strategy = BRACED , ///< default strategy is the 'implicit' scheme
             int _horizon = -1 ,                     ///< width of frame for explicit scheme
             view_type _space = view_type() ,        ///< coefficient storage to 'adopt'
-            double _tolerance = .0000001            ///< acceptable error (relative to unit pulse)
+            double _tolerance = -1.0                ///< acceptable error (relative to unit pulse)
           )
   : core_shape ( _core_shape ) ,
     spline_degree ( _spline_degree ) ,
     bcv ( _bcv ) ,
-    strategy ( _strategy ) ,
-    tolerance ( _tolerance )
+    strategy ( _strategy )
   {
-    // heuristic horizon for reasonable precision - we assume that no one in their right
+    if ( _tolerance < 0.0 )
+    {
+      // heuristic: 'reasonable' defaults
+      if ( std::is_same < real_type , float > :: value )
+        tolerance = .000001 ;
+      else if ( std::is_same < real_type , double > :: value )
+        tolerance = .0000000000001 ;
+      else
+        tolerance = 0.0 ;
+    }
+
+    // heuristic: horizon for reasonable precision - we assume that no one in their right
     // minds would want a negative horizon ;)
 
     if ( _horizon < 0 )
@@ -480,6 +490,29 @@ public:
     }
   }
 
+  /// overloaded constructor for 1D splines. This is useful because if we don't
+  /// provide it, the caller would have to pass TinyVector < T , 1 > instead of T
+  /// for the shape and the boundary condition.
+  
+  bspline ( long _core_shape ,                      ///< shape of knot point data
+            int _spline_degree = 3 ,                ///< spline degree with reasonable default
+            bc_code _bcv = bcv_type ( MIRROR ) ,    ///< boundary conditions and common default
+            prefilter_strategy _strategy = BRACED , ///< default strategy is the 'implicit' scheme
+            int _horizon = -1 ,                     ///< width of frame for explicit scheme
+            view_type _space = view_type() ,        ///< coefficient storage to 'adopt'
+            double _tolerance = -1.0                ///< acceptable error (relative to unit pulse)
+          )
+  :bspline ( TinyVector < long , 1 > ( _core_shape ) ,
+             _spline_degree ,
+             bcv_type ( _bcv ) ,
+             _strategy ,
+             _horizon ,
+             _space ,
+             _tolerance )
+  {
+    static_assert ( _dimension == 1 , "bspline: 1D constructor only usable for 1D splines" ) ;
+  } ;
+
   /// shift will change the interpretation of the data in a bspline object.
   /// d is taken as a difference to add to the current spline degree. The coefficients
   /// remain the same, but creating an evaluator from the shifted spline will make
diff --git a/common.h b/common.h
index 4e1b578..8f9e720 100644
--- a/common.h
+++ b/common.h
@@ -46,7 +46,50 @@
 #include <thread>
 
 #ifdef USE_VC
+
 #include <Vc/Vc>
+
+// initially, I used Vc::SimdArray in many places, but I figured that having a central
+// place to fix a simdized type would be a good idea, so here's a traits class. With this
+// central definition of the simdized type, it's also easy to make more of an effort to get
+// that simdized type which is most suited for the situation, so I introduced a conditional
+// which picks plain Vc::Vector<T> if vsize is the right size, and SimdArray only if it is not.
+
+// TODO maybe add types for masks and index vectors?
+
+// TODO: I'd like to increase the default _vsize, like to twice the value used now,
+// just to see what happens, but I can't yet get this modification to trickle down
+// without a bunch of errors I can't even relate to the change...
+
+/// traits class vector_traits picks the appropriate simdized type for a given
+/// elementary type T and a vector length of _vsize.
+/// if vsize isn't given, it is taken to be a Vc::Vector<T>'s Size.
+/// the definition used here assures that whenever possible, actual Vc::Vector<T>
+/// is used as the simdized type, and SimdArray<T,_vsize> is only used if it can't be
+/// avoided. Why so? I have the suspicion that the code generated for SimdArrays
+/// may be less efficient than the code for Vc::Vectors.
+/// hmmm... the difference is, if at all, small, and the extra finessing produces
+/// problems further down, so I stick with sall SimdArrays for now, but I always
+/// use SimdArrays of twice the vector size, because this produces the fastest
+/// evaluation on my system.
+
+template < class T , int _vsize = 2 * Vc::Vector < T > :: Size >
+struct vector_traits
+{
+  enum { vsize = _vsize } ;
+
+  // if vsize given is precisely the Size of a Vc::Vector<T>, make simdized_type
+  // such a Vc::Vector, otherwise make it a Vc::SimdArray<T,vsize>
+  
+//   typedef typename std::conditional < vsize == Vc::Vector < T > :: Size ,
+//                                       Vc::Vector < T > ,
+//                                       Vc::SimdArray < T , vsize > > :: type simdized_type ;
+
+// to always use Simdarrays, I use
+
+  typedef Vc::SimdArray < T , vsize > simdized_type ;
+} ;
+
 #endif
 
 namespace vspline {
@@ -80,7 +123,7 @@ struct not_supported
   : std::invalid_argument ( msg ) { }  ;
 } ;
 
-/// exception which is thrown by mapping mode REJECT for out-of-bounds coordinates
+/// out_of_bounds is thrown by mapping mode REJECT for out-of-bounds coordinates
 /// this exception is left without a message, it only has a very specific application,
 /// and there it may be thrown often, so we don't want anything slowing it down.
 
@@ -350,7 +393,7 @@ struct divide_and_conquer
   
 #ifdef USE_VC
 
-  typedef Vc::Vector < ele_type > value_v ;
+  typedef typename vector_traits < ele_type > :: simdized_type value_v ;
   enum { vsize = value_v::Size } ;
 
   /// now we repeat the pattern for vectorized operation.
@@ -374,7 +417,7 @@ struct divide_and_conquer
 
     if ( data.isUnstrided() )
     {
-      typedef Vc::SimdArray < int , vsize > ic_v ;
+      typedef typename vector_traits < int , vsize > :: simdized_type ic_v ;
       typedef vigra::TinyVector < ic_v , channels > mc_ic_v ;
 
       // if data is unstrided, we can process it's contents by gather/scatter
diff --git a/doxy.h b/doxy.h
index 3735170..56c77dd 100644
--- a/doxy.h
+++ b/doxy.h
@@ -81,7 +81,7 @@ On the evaluation side I provide
 
  \section compile_sec Compilation
  
- While your distro's packages may be sufficient to geth vspline up and running, you may need newer versions of VIGRA and Vc. At the time of this writing the latest versions commonly available were Vc 1.2.0 and VIGRA 1.11.0; I compiled Vc and VIGRA from source, using up-to-date pulls from their respective repositories.
+ While your distro's packages may be sufficient to get vspline up and running, you may need newer versions of VIGRA and Vc. At the time of this writing the latest versions commonly available were Vc 1.2.0 and VIGRA 1.11.0; I compiled Vc and VIGRA from source, using up-to-date pulls from their respective repositories.
  
  To compile software using vspline, I use this g++ call:
  
@@ -150,8 +150,16 @@ On the evaluation side I provide
  
  Now obviously many things have been done by default here: The default spline degree was used - it's 3, for a cubic spline. Also, boundary treatment mode 'MIRROR' was used per default. Further default parameters cause the spline to be 'braced' so that it can be evaluated with vspline's evaluation routines, Vc (if compiled in) was used for prefiltering, and the code used as many threads as your system has physical cores. You could have passed different values for all the parameters I have [...]
  
- while the sequence of operations indicated here looks a bit verbose (why not create the bspline object by a call like bspl ( a ) ?), in 'real' code you can use bspl.core straight away as the space to contain your data - you might get the data from a file or by some other process.
+ while the sequence of operations indicated here looks a bit verbose (why not create the bspline object by a call like bspl(a) ?), in 'real' code you can use bspl.core straight away as the space to contain your data - you might get the data from a file or by some other process or do something like this  where the bspline object provides the array and you interface it via a view to it's 'core':
+   
+ vspline::bspline < double , 1 > bsp ( 10001 , degree , vspline::MIRROR ) ;
  
+ auto v1 = bsp.core ; // get a view to the bspline's 'core'
+ 
+ v1 [ 5000 ] = 1.0 ; // assign some value (the core is zero-initialized)
+ 
+ bsp.prefilter() ; // perform the prefiltering
+
  Next you may want to evaluate the spline at some pair of coordinates x, y.
  To do so, you can use this idiom:
  
diff --git a/eval.h b/eval.h
index e14590a..7ad20c6 100644
--- a/eval.h
+++ b/eval.h
@@ -163,7 +163,7 @@ struct weight_functor_base
   
 #ifdef USE_VC
 
-  typedef Vc::Vector < rc_type >       rc_type_v ;
+  typedef typename vector_traits < rc_type > :: simdized_type rc_type_v ;
 
   virtual void operator() ( rc_type_v* result , const rc_type_v& delta ) = 0 ;
 
@@ -509,7 +509,9 @@ public:
   /// a vector of the elementary type of value_type,
   /// which is used for coefficients and results:
 
-  typedef Vc::Vector < ele_type > ele_v ;
+  typedef typename vector_traits < ele_type > :: simdized_type ele_v ;
+//   typedef typename vector_traits < ele_type > :: simdized_type evt ;
+//   typedef typename vector_traits < ele_type , 2 * evt::Size > :: simdized_type ele_v ;
   
   /// element count of Simd data types, always the same as the number of elements in a Vc::Vector
   /// of ele_type and used throughout. If elementary types of a size different from ele_type's
@@ -517,13 +519,13 @@ public:
   
   enum { vsize = ele_v::Size } ;
 
-  /// compatible-sized SimdArray of vsize ic_type
+  /// compatible-sized simdized type of vsize ic_type
 
-  typedef Vc::SimdArray < ic_type , vsize > ic_v ;
+  typedef typename vector_traits < ic_type , vsize > :: simdized_type ic_v ;
 
-  /// compatible-sized SimdArray of vsize rc_type
+  /// compatible-sized simdized type of vsize rc_type
 
-  typedef Vc::SimdArray < rc_type , vsize > rc_v ;
+  typedef typename vector_traits < rc_type , vsize > :: simdized_type rc_v ;
 
   /// multichannel value as SoA, for pixels etc.
 
@@ -572,6 +574,14 @@ public:
   
 private:
   
+  ele_type * p_workspace ;
+
+#ifdef USE_VC
+  
+  ele_v * p_workspace_v ;
+
+#endif
+
   nd_weight_functor fweight ;       ///< set of pointers to weight functors, one per dimension
   const array_type& coefficients ;  ///< b-spline coefficient array
   const expanded_shape_type expanded_stride ;        ///< strides in terms of expanded value_type
@@ -606,6 +616,11 @@ public:
     mmap ( _mmap )               // I enforce the passing in of a mapping, even though it
                                  // may not be used at all. TODO: can I do better?
   {
+    p_workspace = new ele_type [ ORDER * dimension ] ;
+#ifdef USE_VC
+    p_workspace_v = new ele_v [ ORDER * dimension ] ;
+#endif
+
     // initalize the weight functors. In this constructor, we use only bspline weight
     // functors, even though the evaluator can operate with all weight functors
     // filling in the right number of basis values given a delta. To make use of this
@@ -702,7 +717,7 @@ public:
   evaluator ( const bspline < value_type , dimension > & bspl ,
               TinyVector < int , dimension > _derivative = TinyVector < int , dimension >(0) )
   : evaluator ( bspl.coeffs ,
-                nd_mapping_type ( bspl ) ,
+                nd_mapping_type ( bspl.bcv , bspl.spline_degree , bspl.core_shape ) ,
                 bspl.spline_degree ,
                 _derivative )
   {
@@ -741,16 +756,24 @@ public:
   /// the axis, so now weights for axis 0 are first etc.. This results in a slighly funny-looking
   /// initial call to _eval, but the confusion from reverse-filling the workspace was probably worse.
   
-  template < typename dtype ,
-             typename nd_rc_type >
-  void fill_multi_basis ( dtype * p_workspace ,
-                          const nd_rc_type& c )
+  template < typename nd_rc_type >
+  void fill_multi_basis ( const nd_rc_type& c )
   {
     auto ci = c.cbegin() ;
     for ( int axis = 0 ; axis < dimension ; ++ci , ++axis )
       (*(fweight[axis])) ( p_workspace + axis * ORDER , *ci ) ;
   }
 
+#ifdef USE_VC
+  template < typename nd_rc_type >
+  void fill_multi_basis_v ( const nd_rc_type& c )
+  {
+    auto ci = c.cbegin() ;
+    for ( int axis = 0 ; axis < dimension ; ++ci , ++axis )
+      (*(fweight[axis])) ( p_workspace_v + axis * ORDER , *ci ) ;
+  }
+#endif
+
   /// _eval is the workhorse routine and implements the recursive arithmetic needed to
   /// evaluate the spline. First the 'basis' for the current dimension is obtained
   /// from the basis iterator. Once the basis has been obtained, it's values are
@@ -775,15 +798,15 @@ public:
   {
     dtype operator() ( const dtype* & pdata ,
                        offset_iterator & ofs ,
-                       const ele_type * p_workspace ,
+                       const ele_type * p_weights ,
                        const int& ORDER
                      )
     {
       dtype result = dtype() ;
       for ( int i = 0 ; i < ORDER ; i++ )
       {
-        result +=   p_workspace[i]
-                  * _eval < level - 1 , dtype >() ( pdata , ofs , p_workspace - ORDER , ORDER ) ;
+        result +=   p_weights[i]
+                  * _eval < level - 1 , dtype >() ( pdata , ofs , p_weights - ORDER , ORDER ) ;
       }
       return result ;
     }
@@ -800,14 +823,14 @@ public:
   {
     dtype operator() ( const dtype* & pdata ,
                        offset_iterator & ofs ,
-                       const ele_type * p_workspace ,
+                       const ele_type * p_weights ,
                        const int& ORDER
                      )
     {
       dtype result = dtype() ;
       for ( int i = 0 ; i < ORDER ; i++ )
       {
-        result += pdata [ *ofs ] * p_workspace[i] ;
+        result += pdata [ *ofs ] * p_weights[i] ;
         ++ofs ;
       }
       return result ;
@@ -838,7 +861,7 @@ public:
     dtype operator() ( const component_base_type& base , ///< base adresses of components
                        const ic_v& origin ,              ///< offsets to evaluation window origins
                        offset_iterator & ofs ,           ///< offsets to coefficients inside this window
-                       const ele_v * p_workspace ,       ///< vectorized weights
+                       const ele_v * p_weights ,       ///< vectorized weights
                        const int& ORDER )
     {
       dtype sum = dtype() ;    ///< to accumulate the result
@@ -846,10 +869,10 @@ public:
 
       for ( int i = 0 ; i < ORDER ; i++ )
       {
-        subsum = _v_eval < dtype , level - 1 >() ( base , origin , ofs , p_workspace - ORDER , ORDER );
+        subsum = _v_eval < dtype , level - 1 >() ( base , origin , ofs , p_weights - ORDER , ORDER );
         for ( int ch = 0 ; ch < channels ; ch++ )
         {
-          sum[ch] += p_workspace[i] * subsum[ch] ;
+          sum[ch] += p_weights[i] * subsum[ch] ;
         }
       }
       return sum ;
@@ -864,7 +887,7 @@ public:
     dtype operator() ( const component_base_type& base , ///< base adresses of components
                        const ic_v& origin ,              ///< offsets to evaluation window origins
                        offset_iterator & ofs ,           ///< offsets to coefficients in this window
-                       const ele_v * p_workspace ,       ///< vectorized weights
+                       const ele_v * p_weights ,       ///< vectorized weights
                        const int& ORDER )
     {
       dtype sum = dtype() ;
@@ -873,7 +896,7 @@ public:
       {
         for ( int ch = 0 ; ch < channels ; ch++ )
         {
-          sum[ch] += p_workspace[i] * ele_v ( base[ch] , origin + *ofs ) ;
+          sum[ch] += p_weights[i] * ele_v ( base[ch] , origin + *ofs ) ;
         }
         ++ofs ;
       }
@@ -893,8 +916,7 @@ public:
   /// *decreases* p_workspace by ORDER, etc. until for processing along the 0 (or x) axis
   /// the workspace pointer is equal again to what this here eval receives as p_workspace.
 
-  value_type eval ( const shape_type& s ,     // lower corner of the subarray
-                    ele_type * p_workspace )  // precalculated multi_basis
+  value_type eval ( const shape_type& s )     // lower corner of the subarray
   {
     const value_type * base = & coefficients [ s ] ;
     auto ofs = offsets.begin() ;                // offsets reflects the positions inside the subarray
@@ -903,53 +925,62 @@ public:
 
   /// this variant of eval takes a split coordinate:
   
-  value_type operator() ( const split_coordinate_type& sc ,  // presplit coordinate
-                          ele_type * p_workspace )           // space for weights
+  value_type operator() ( const split_coordinate_type& sc )  // presplit coordinate
   {
-    fill_multi_basis ( p_workspace , sc.tune ) ;
-    return eval ( sc.select , p_workspace ) ;
+    fill_multi_basis ( sc.tune ) ;
+    return eval ( sc.select ) ;
   }
 
   /// this variant take a coordinate which hasn't been mapped yet, so it uses
   /// the nd_mapping, mmap, and applies it. It's the most convenient form but
   /// also the slowest, due to the mapping, which is quite expensive computationally.
   
-  value_type operator() ( const nd_rc_type& c ,      /// coordinate
-                          ele_type * p_workspace )   /// space for weights
+  value_type operator() ( const nd_rc_type& c )      /// coordinate
   {
     split_coordinate_type sc ;
     mmap ( c , sc ) ;  /// apply the mappings
-    fill_multi_basis ( p_workspace , sc.tune ) ;
-    return eval ( sc.select , p_workspace ) ;
+    fill_multi_basis ( sc.tune ) ;
+    return eval ( sc.select ) ;
   }
 
-  /// variant to use if the caller doesn't provide a workspace
-  /// this is less efficient than the above version, so for mass evaluations, like in the
-  /// remap routine, it isn't used, but it's nice to have for ad-hoc use where one doesn't
-  /// want to look into the workings of the code.
-
-  value_type operator() ( const nd_rc_type& c )   // coordinate
+  /// variant taking a plain rc_type for a coordinate. only for 1D splines!
+  
+  value_type operator() ( const rc_type& c )         /// coordinate
   {
-    ele_type p_workspace[workspace_size()] ;      // provide space for weights
-    return operator() ( c , p_workspace ) ;       // delegate to above routine
+    static_assert ( dimension == 1 ,
+                    "evaluation at a single real coordinate is only allowed for 1D splines" ) ;
+    split_coordinate_type sc ;
+    mmap ( nd_rc_type(c) , sc ) ;  /// apply the mappings - need a nd_rc_type here
+    fill_multi_basis ( sc.tune ) ;
+    return eval ( sc.select ) ;
   }
 
-  /* currently unused:
-  /// we also allow passing in of a single real value which is used to
-  /// construct a nd_rc_type. beware! might cause hard-to-track bugs
-
-  value_type operator() ( const rc_type& c )
-  {
-    return operator() ( nd_rc_type(c) ) ;
-  }
-  */
+//   /// variant to use if the caller doesn't provide a workspace
+//   /// this is less efficient than the above version, so for mass evaluations, like in the
+//   /// remap routine, it isn't used, but it's nice to have for ad-hoc use where one doesn't
+//   /// want to look into the workings of the code.
+// 
+//   value_type operator() ( const nd_rc_type& c )   // coordinate
+//   {
+//     ele_type p_workspace[workspace_size()] ;      // provide space for weights
+//     return operator() ( c , p_workspace ) ;       // delegate to above routine
+//   }
+
+//   /// we also allow passing in of a single real value which is used to
+//   /// construct a nd_rc_type. this is only allowed for 1D to avoid hard-to-track bugs
+// 
+//   value_type operator() ( const rc_type& c )
+//   {
+//     static_assert ( dimension == 1 ,
+//                     "evaluation at a single real coordinate is only allowed for 1D splines" ) ;
+//     return operator() ( nd_rc_type(c) ) ;
+//   }
 
 #ifdef USE_VC
 
   /// vectorized variants of the previous routines
   
-  mc_ele_v operator() ( nd_ic_v& s ,           // lower corners of the subarrays
-                        ele_v * p_workspace )  // precalculated weights
+  mc_ele_v operator() ( nd_ic_v& s )           // lower corners of the subarrays
   {
     // first we sum up the coordinates in all dimensions into origin values. this is analogous
     // to forming origin = & coefficients [ ... ] - coefficients.data() in unvectorized code
@@ -969,7 +1000,7 @@ public:
     // now we can call the recursive _eval routine
     
     return _v_eval < mc_ele_v , level >() ( component_base , origin , ofs ,
-                                            p_workspace + level * ORDER , ORDER ) ;
+                                            p_workspace_v + level * ORDER , ORDER ) ;
   }
 
   /// here we take the approach to require the calling function to present pointers to
@@ -980,8 +1011,7 @@ public:
   /// data are strided.
   
   void operator() ( const split_coordinate_type* const psc , // pointer to vsize presplit coordinates
-                    value_type* result ,                     // pointer to vsize result values
-                    ele_v * p_workspace )                    // space for weight vectors
+                    value_type* result )                     // pointer to vsize result values
   {
     nd_ic_v select ;
     nd_rc_v tune ;
@@ -1002,9 +1032,9 @@ public:
 
     // calculate the result
     
-    fill_multi_basis ( p_workspace , tune ) ;
+    fill_multi_basis_v ( tune ) ;
 
-    mc_ele_v v_result = operator() ( select , p_workspace ) ;
+    mc_ele_v v_result = operator() ( select ) ;
 
     // and deposit it in the memory the caller provides
     for ( int ch = 0 ; ch < channels ; ch++ )
@@ -1017,8 +1047,7 @@ public:
   /// to perform the (de)interleaving e.g. by a gather/scatter operation.
   
   void operator() ( const nd_rc_v & input ,  // number of dimensions * coordinate vectors
-                    mc_ele_v & result ,      // number of channels * value vectors
-                    ele_v * p_workspace )    // space for weight vectors
+                    mc_ele_v & result )      // number of channels * value vectors
   {
     nd_ic_v select ;
     nd_rc_v tune ;
@@ -1029,11 +1058,11 @@ public:
 
     // calculate the weights
 
-    fill_multi_basis ( p_workspace , tune ) ;
+    fill_multi_basis_v ( tune ) ;
 
     // delegate
 
-    result = operator() ( select , p_workspace ) ;
+    result = operator() ( select ) ;
   }
 
   /// This variant operates on unsplit coordinates. Here again we require the calling function
@@ -1042,8 +1071,7 @@ public:
   /// on vectorized data.
   
   void operator() ( const nd_rc_type* const pmc ,  // pointer to vsize muli_coordinates
-                    value_type* result ,           // pointer to vsize result values
-                    ele_v * p_workspace )          // space for weight vectors
+                    value_type* result )           // pointer to vsize result values
   {
     nd_rc_v input ;
     mc_ele_v v_result ;
@@ -1053,7 +1081,7 @@ public:
       input[d].gather ( (const rc_type* const)pmc , nd_interleave[d] ) ;
 
     // call operator() for vectorized data
-    operator() ( input , v_result , p_workspace ) ;
+    operator() ( input , v_result ) ;
 
     // and deposit it in the memory the caller provides
     for ( int ch = 0 ; ch < channels ; ch++ )
@@ -1064,14 +1092,13 @@ public:
   /// and output goes to interleaved memory
 
   void operator() ( const nd_rc_v & input ,  // number of dimensions * coordinate vectors
-                    value_type* result ,     // pointer to vsize result values
-                    ele_v * p_workspace )    // space for weight vectors
+                    value_type* result )     // pointer to vsize result values
   {
     mc_ele_v v_result ;
 
     // call operator() for vectorized data
 
-    operator() ( input , v_result , p_workspace ) ;
+    operator() ( input , v_result ) ;
 
     // and deposit result in the memory the caller provides
 
@@ -1089,6 +1116,10 @@ public:
       if ( fweight[d] != &wfd0 )
         delete fweight[d] ;
     }
+    delete[] p_workspace ;
+#ifdef USE_VC
+    delete[] p_workspace_v ;
+#endif
   }
 } ;
 
diff --git a/example/impulse_response.cc b/example/impulse_response.cc
index d444ae1..791f24a 100644
--- a/example/impulse_response.cc
+++ b/example/impulse_response.cc
@@ -29,45 +29,105 @@
 /*                                                                      */
 /************************************************************************/
 
-/// impulse_response.cc
+/// \file impulse_response.cc
 ///
+/// \brief get the impulse response of a b-spline prefilter
+/// 
 /// filter a unit pulse with a b-spline prefilter of a given degree
 /// and display the central section of the result
 ///
 /// compile with:
 /// g++ -std=c++11 -o impulse_response -O3 -pthread -DUSE_VC=1 impulse_response.cc -lVc
 ///
-/// to get the central section with values byond +/- 0.00042 of a degree 7 b-spline:
+/// to get the central section with values beyond +/- 0.0042 of a degree 5 b-spline:
 ///
-/// impulse_response 7 .00042
+/// impulse_response 5 .0042
+///
+/// producing this output:
+/// 
+/// double ir_5[] = {
+/// -0.0084918610197410 ,
+/// +0.0197221240122632 ,
+/// -0.0458040841912519 ,
+/// +0.1063780046433000 ,
+/// -0.2470419274022756 ,
+/// +0.5733258709616592 ,
+/// -1.3217294729875093 ,
+/// +2.8421709220216247 ,
+/// -1.3217294729875098 ,
+/// +0.5733258709616593 ,
+/// -0.2470419274022757 ,
+/// +0.1063780046433000 ,
+/// -0.0458040841912519 ,
+/// +0.0197221240122632 ,
+/// -0.0084918610197410 ,
+///  } ;
+///
+/// which, when used as a convolution kernel, will have the same effect on a signal
+/// as applying the recursive filter itself, but with lessened precision due to windowing.
+///
+/// note how three different ways of getting the result are given, the variants
+/// using lower-level access to the filter are commented out.
+
 
 #include <iomanip>
-#include <vspline/prefilter.h>
+#include <vspline/vspline.h>
 #include <random>
 
 int main ( int argc , char * argv[] )
 {
   int degree = std::atoi ( argv[1] ) ;
   double cutoff = std::atof ( argv[2] ) ;
+  assert ( degree >= 0 && degree < 25 ) ;
   
-  vigra::MultiArray < 1 , double > array1 ( 10001 ) ;
-  vigra::MultiArrayView < 1 , double > v1 ( array1 ) ;
-  vigra::TinyVector<vspline::bc_code, 1> bcv ( vspline::MIRROR ) ;
+// using the highest-level access to prefiltering, we code:
+
+  vspline::bspline < double , 1 > bsp ( 10001 , degree , vspline::MIRROR ) ; // , vspline::EXPLICIT ) ;
+  auto v1 = bsp.core ;
+  v1 [ 5000 ] = 1.0 ;
+  bsp.prefilter() ;
+
+// using slightly lower-level access to the prefiltering code, we could achieve
+// the same result with:
+
+//   vigra::MultiArray < 1 , double > v1 ( 10001 ) ;
+//   v1[5000] = 1.0 ;
+//   vspline::solve_vc ( v1 , v1 , vspline::MIRROR , degree , 0.00001 ) ;
   
-  v1 = 0.0 ;
-  v1[5000] = 1.0 ;
-  vspline::solve_vc ( v1 , v1 , bcv , degree , 0.00001 ) ;
+// and, going yet one level lower, this code also produces the same result:
   
+//   vigra::MultiArray < 1 , double > v1 ( 10001 ) ;
+//   v1[5000] = 1.0 ;
+//   typedef decltype ( v1.begin() ) iter_type ;
+//   vspline::filter < iter_type , iter_type , double >
+//     f ( v1.size() ,
+//         vspline::overall_gain ( degree / 2 , precomputed_poles [ degree ] ) ,
+//         vspline::MIRROR ,
+//         degree / 2 ,
+//         precomputed_poles [ degree ] ,
+//         0.00001 ) ;
+//   f.solve ( v1.begin() ) ;
+        
   std::cout << "double ir_" << degree << "[] = {" << std::endl ;
-//   std::cout << std::fixed << std::showpos << std::showpoint << std::setprecision(16);
-  std::cout << std::showpos ;
+  std::cout << std::fixed << std::showpos << std::showpoint << std::setprecision(16);
   int k ;
   for ( k = 0 ; k < 10000 ; k++ )
   {
-    if ( fabs ( v1[k] > cutoff ) )
+    if ( fabs ( v1[k] ) > cutoff )
       std::cout << v1[k] << " ," << std::endl ;
   }
-  if ( fabs ( v1[k] > cutoff ) )
+  if ( fabs ( v1[k] ) > cutoff )
     std::cout << v1[k] ;
   std::cout << " } ;" << std::endl ;
-}
+  
+  typedef vspline::evaluator < 1 , double > eval_type ; // get the appropriate evaluator type 
+  eval_type ev ( bsp ) ;                                   // create the evaluator
+  std::cout << ev ( 5001.5 ) << std::endl ;
+  
+  vigra::MultiArray < 1 , double > coords ( 12 ) ;
+  coords[5] = 5000.1 ;
+  vigra::MultiArray < 1 , double > target ( 12 ) ;
+  vspline::remap < double , double , 1 , 1 > ( bsp , coords , target ) ;
+  for ( auto v : target )
+    std::cout << v << std::endl ;
+}
\ No newline at end of file
diff --git a/example/roundtrip.cc b/example/roundtrip.cc
index 8feec87..1e3ef3c 100644
--- a/example/roundtrip.cc
+++ b/example/roundtrip.cc
@@ -114,7 +114,8 @@ void roundtrip ( view_type & data ,
 
 #ifdef USE_VC
   
-  const int vsize = Vc::Vector < real_type > :: Size ;
+//   const int vsize = Vc::Vector < real_type > :: Size ;
+  const int vsize = vector_traits < real_type > :: vsize ;
   
 #else
   
diff --git a/filter.h b/filter.h
index 23b33e4..b08ef3c 100644
--- a/filter.h
+++ b/filter.h
@@ -456,7 +456,6 @@ value_type icc_safe_beyond ( IT c , int k )
     icc += p * icc + c[i] ;
   }
   
-  std::cout << "icc: " << icc << std::endl ;
   return icc ;
 }
 
@@ -468,7 +467,6 @@ value_type iacc_safe_beyond ( out_iter c , int k )
 
   for ( int i = z - 1 ; i >= 0 ; i-- )
     iacc = p * ( iacc - c [ M - 1 + i ] ) ; 
-  std::cout << "iacc: " << iacc << std::endl ;
   return iacc ;
 }
 
@@ -484,8 +482,6 @@ value_type iacc_safe_beyond ( out_iter c , int k )
 // 
 // value_type iacc_guess ( out_iter c , int k )
 // {
-//   for ( int i = 0 ; i < M ; i++ )
-//     std::cout << "c[" << i << "] = " << c[i] << std::endl ;
 //   return c[M-1] * ( pole[k] / ( pole[k] * pole[k] - 1.0 ) ) ;
 // }
 
@@ -1122,8 +1118,7 @@ int ele_aggregating_filter ( source_type &source ,
   
 //   typedef Vc::Vector < ele_type > simdized_type ;        ///< vectorized type, use plain vector
 
-  typedef Vc::SimdArray < ele_type ,
-                          2 * Vc::Vector<ele_type>::Size > simdized_type ; ///< vectorized type
+  typedef typename vector_traits < ele_type > :: simdized_type simdized_type ;
 
   const int vsize ( simdized_type::Size ) ;              ///< number of ele_type in a simdized_type
   typedef typename simdized_type::IndexType index_type ; ///< indexes for gather/scatter
@@ -1320,7 +1315,7 @@ int aggregating_filter ( source_type &source ,
 /// Now we have the routines which perform the buffering and filtering for a chunk of data,
 /// We add code for multithreading. This is done by using utility code from common.h, namely
 /// the routine 'divide_and_conquer_2', which splits two arrays in the same way and applies
-/// a functor to each chunk in a separate thread.
+/// a functor to each pair of chunks in a separate thread.
 /// filter_1d, which is the routine processing nD arrays along a specific axis, might as well
 /// be a function. But functions can't be patially specialized (at least not with my compiler)
 /// so I use a functor instead, which, as a class, can be partially specialized. We want a
@@ -1443,13 +1438,13 @@ public:
 
 #endif
 
-  std::cout << "have " << lanes << " lanes" << std::endl ;
+//   std::cout << "have " << lanes << " lanes" << std::endl ;
   
   // we give the filter some space to run up to precision
   // TODO we may want to use the proper horizon values here
   // TODO and make sure 64 is enough.
 
-  std::cout << "tolerance: " << tolerance << std::endl ;
+//   std::cout << "tolerance: " << tolerance << std::endl ;
   
   if ( tolerance <= 0.0 )
   {
@@ -1469,6 +1464,7 @@ public:
     runup = horizon ;
     
     // the absolute minimum to successfully run the fake 2D filter is this:
+    // TODO we might rise the threshold, min_length, here
     
     int min_length = 4 * runup * lanes + 2 * runup ;
     
@@ -1476,7 +1472,7 @@ public:
     
     if ( input.shape(0) < min_length )
     {
-      std::cout << "input is too short for fake 2D processing with the given tolerance" << std::endl ;
+//       std::cout << "input too short for fake 2D with the given tolerance" << std::endl ;
       lanes = 1 ;
     }
     else
@@ -1511,7 +1507,7 @@ public:
 
   if ( lanes == 1 )
   {
-    std::cout << "using simple 1D filter" << std::endl ;
+//     std::cout << "using simple 1D filter" << std::endl ;
     // this is a simple single-threaded implementation
     filter_type solver ( input.shape(0) ,
                          gain ,
@@ -1525,7 +1521,7 @@ public:
   
   // the input qualifies for fake 2D processing.
 
-  std::cout << "input qualifies for fake 2D, using " << lanes << " lanes" << std::endl ;
+//   std::cout << "input qualifies for fake 2D, using " << lanes << " lanes" << std::endl ;
   
   // we want as many chunks as we have lanes. There may be some data left
   // beyond the chunks (tail_size of value_type)
@@ -1535,8 +1531,6 @@ public:
   core_size = lanes * chunk_size ;
   int tail_size = input.shape(0) - core_size ;
   
-  std::cout << "tail size: " << tail_size << std::endl ;
-  
   // just doublecheck
 
   assert ( core_size + tail_size == input.shape(0) ) ;
diff --git a/mapping.h b/mapping.h
index 23c4379..5cbb85b 100644
--- a/mapping.h
+++ b/mapping.h
@@ -289,8 +289,10 @@ public:
   
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  // we fix the simdized type by looking at vector_traits (in common.h)
+
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -338,8 +340,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -347,7 +349,7 @@ public:
   {
     rc_v fl_i ;
     fv = v_modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
-    iv = fl_i  ;      // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;      // set integer part from float representing it
   }
 
 #endif
@@ -375,8 +377,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -386,7 +388,7 @@ public:
     v += real_t ( 0.5 ) ;
     fv = v_modf ( v , &fl_i ) ;         // split v into integral and remainder in [-0.5 ... 0.5]
     fv -= real_t ( 0.5 ) ;
-    iv = fl_i  ;              // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;              // set integer part from float representing it
   }
 
 #endif
@@ -435,8 +437,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -448,7 +450,7 @@ public:
     
     rc_v fl_i ;
     fv = v_modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
-    iv = fl_i  ;      // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;      // set integer part from float representing it
   }
 
 #endif
@@ -485,8 +487,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -500,7 +502,7 @@ public:
     v += real_t(0.5) ;
     fv = v_modf ( v , &fl_i ) ;         // split v into integral and remainder in [-0.5 ... 0.5]
     fv -= real_t(0.5) ;
-    iv = fl_i  ;              // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;              // set integer part from float representing it
   }
 
 #endif
@@ -544,8 +546,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -561,13 +563,14 @@ public:
     {
       // v has some out-of-bounds values
       fv = v_modf ( v , &fl_i ) ;    // split v into integral and remainder from [0...1[
-      iv = fl_i  ;         // set integer part from float representing it
-      iv ( mask ) = int_t ( -1 ) ;   // set iv to -1 at out-of-bounds values
+      iv = ic_v ( fl_i )  ;         // set integer part from float representing it
+      // set iv to -1 at out-of-bounds values. mask cast is safe as sizes match
+      iv ( typename ic_v::mask_type ( mask ) ) = int_t ( -1 ) ;
       throw out_of_bounds() ;
     }
     
     fv = v_modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
-    iv = fl_i  ;      // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;      // set integer part from float representing it
   }
 
 #endif
@@ -606,8 +609,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -623,15 +626,16 @@ public:
     {
       // v has some out-of-bounds values
       fv = v_modf ( v , &fl_i ) ;    // split v into integral and remainder from [0...1[
-      iv = fl_i  ;         // set integer part from float representing it
-      iv ( mask ) = int_t ( -1 ) ;   // set iv to -1 at out-of-bounds values
+      iv = ic_v ( fl_i )  ;         // set integer part from float representing it
+      // set iv to -1 at out-of-bounds values. mask cast is safe as sizes match.
+      iv ( typename ic_v::mask_type ( mask ) ) = int_t ( -1 ) ;
       throw out_of_bounds() ;
     }
     // now we are sure that v is safely inside [ 0 : _ceiling [
     v += real_t(0.5) ;
     fv = v_modf ( v , &fl_i ) ;           // split v into integral and remainder in [-0.5 ... 0.5]
     fv -= real_t(0.5) ;
-    iv = fl_i  ;              // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;              // set integer part from float representing it
   }
 
 #endif
@@ -693,8 +697,8 @@ public:
 
   // vectorized version of operator()
   
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -711,12 +715,12 @@ public:
       v = abs ( v ) ;                     // map to half period
       v = _ceiling - v ;                  // flip
       fv = v_modf ( v , &fl_i ) ;         // split v into integral and remainder from [0...1[
-      iv = fl_i  ;              // set integer part from float representing it
+      iv = ic_v ( fl_i )  ;              // set integer part from float representing it
       return ;
     }
     // here we are sure that v is safely inside [ 0 : _ceiling ]
     fv = v_modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
-    iv = fl_i  ;      // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;      // set integer part from float representing it
   }
 
 #endif
@@ -769,8 +773,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -790,7 +794,7 @@ public:
     v += real_t(0.5) ;
     fv = v_modf ( v , &fl_i ) ;         // split v into integral and remainder in [-0.5 ... 0.5]
     fv -= real_t(0.5) ;
-    iv = fl_i  ;              // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;              // set integer part from float representing it
   }
 
 #endif
@@ -841,8 +845,8 @@ public:
   
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -855,7 +859,7 @@ public:
       v ( v < real_t(0) ) += _ceiling ;   // shift values below zero up one period
     }
     fv = v_modf ( v , &fl_i ) ;        // split v into integral and remainder from [0...1[
-    iv = fl_i  ;             // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;             // set integer part from float representing it
   }
 
 #endif
@@ -895,8 +899,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -911,7 +915,7 @@ public:
     v += real_t(0.5) ;
     fv = v_modf ( v , &fl_i ) ;        // split v into integral and remainder in [-0.5 ... 0.5]
     fv -= real_t(0.5) ;
-    iv = fl_i  ;             // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;             // set integer part from float representing it
   }
 
 #endif
@@ -972,8 +976,8 @@ public:
   
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -983,7 +987,7 @@ public:
     v ( v < real_t ( 0 ) ) = real_t ( 0 ) ;
     v ( v > _ceiling ) = _ceiling ;
     fv = v_modf ( v , &fl_i ) ;         // split v into integral and remainder from [0...1[
-    iv = fl_i  ;              // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;              // set integer part from float representing it
   }
 
 #endif
@@ -1026,8 +1030,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -1039,7 +1043,7 @@ public:
     v += real_t(0.5) ;
     fv = v_modf ( v , &fl_i ) ;        // split v into integral and remainder in [-0.5 ... 0.5]
     fv -= real_t(0.5) ;
-    iv = fl_i  ;             // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;             // set integer part from float representing it
   }
 
 #endif
@@ -1099,8 +1103,8 @@ public:
   
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -1116,7 +1120,7 @@ public:
     }
     v += real_t(0.5) ;
     fv = v_modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
-    iv = fl_i  ;      // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;      // set integer part from float representing it
   }
 
 #endif
@@ -1176,8 +1180,8 @@ public:
 
 #ifdef USE_VC
 
-  typedef Vc::SimdArray < int_t , vsize > ic_v ;
-  typedef Vc::SimdArray < real_t , vsize > rc_v ;
+  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -1192,11 +1196,12 @@ public:
       v ( v > _ceiling ) = _ceilx2 - v ;
     }
     fv = v_modf ( v , &fl_i ) - real_t(0.5) ;
-    iv = fl_i  ;      // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;      // set integer part from float representing it
     auto mask = ( v >= _ceiling ) ;  // guard against special == case; defensively use >= instead of ==
     if ( any_of ( mask ) )
     {
-      iv ( mask ) = _ceiling - 1 ;
+      // mask cast is safe as sizes match
+      iv ( typename ic_v::mask_type ( mask ) ) = _ceiling - 1 ;
       fv ( mask ) = real_t ( 0.5 ) ;
     }
   }
@@ -1346,15 +1351,11 @@ public:
 
 #endif
 
-private:
-  
   TinyVector < mapping_type* , dimension > map ; // container for the mappings
   bcv_type bcv ;
   int spline_degree ;
   shape_type shape ;
   
-public:
-  
   /// 'standard' constructor for a nd_mapping. This constructor takes the
   /// values which a nd_mapping requires for it's operation directly, and the
   /// other constructors delegate to it.
@@ -1384,29 +1385,29 @@ public:
   {
   } ;
   
-  /// convenience variant constructing a nd_mapping from a bspline object
-  /// A bspline object has all components needed to construct a nd_mapping:
-  /// It has a set of boundary condition codes (those the spline was constructed with),
-  /// the spline's degree and the core shape of it's coefficient array, giving us the limits
-  /// for the mapping. Creating the mapping like this keeps us from using mapping modes
-  /// RAW, LIMIT and REJECT, since these are only codes for mappings, not for boundary
-  /// conditions a spline can be constructed with. But all the BC codes used for spline
-  /// construction, like MIRROR, REFLECT, NATURAL and PERIODIC have corresponding
-  /// mappings.
-
-  template < class bspline >
-  nd_mapping ( const bspline & bspl )
-  : nd_mapping ( bspl.bcv ,
-                 bspl.spline_degree ,
-                 bspl.core_shape )
-  { } ;
-  
   /// since we have members which need delete, we need a copy constructor
   
   nd_mapping ( const nd_mapping& other )
   : nd_mapping ( other.bcv , other.spline_degree , other.shape )
   { } ;
   
+//   /// convenience variant constructing a nd_mapping from a bspline object
+//   /// A bspline object has all components needed to construct a nd_mapping:
+//   /// It has a set of boundary condition codes (those the spline was constructed with),
+//   /// the spline's degree and the core shape of it's coefficient array, giving us the limits
+//   /// for the mapping. Creating the mapping like this keeps us from using mapping modes
+//   /// RAW, LIMIT and REJECT, since these are only codes for mappings, not for boundary
+//   /// conditions a spline can be constructed with. But all the BC codes used for spline
+//   /// construction, like MIRROR, REFLECT, NATURAL and PERIODIC have corresponding
+//   /// mappings.
+// 
+//   template < class bspline >
+//   nd_mapping ( const bspline & bspl )
+//   : nd_mapping ( bspl.bcv ,
+//                  bspl.spline_degree ,
+//                  bspl.core_shape )
+//   { } ;
+  
   /// assignment operator, for the same reason
   
   nd_mapping& operator= ( const nd_mapping& other )
@@ -1476,9 +1477,13 @@ public:
   /// finally, operation on Vc vectors of incoming coordinates. Again, first the
   /// 1D version with the 'axis' parameter
   
-  void operator() ( Vc::Vector < real_t > v ,
-                    Vc::SimdArray < int_t , Vc::Vector<real_t>::size() >& iv ,
-                    Vc::Vector < real_t >& fv ,
+//   void operator() ( Vc::Vector < real_t > v ,
+//                     Vc::SimdArray < int_t , Vc::Vector<real_t>::size() >& iv ,
+//                     Vc::Vector < real_t >& fv ,
+//                     const int & axis )
+  void operator() ( real_v v ,
+                    int_v & iv ,
+                    real_v & fv ,
                     const int & axis )
   {
     ( * ( map [ axis ] ) ) ( v , iv , fv ) ;
diff --git a/prefilter.h b/prefilter.h
index ccdddd9..0b06b47 100644
--- a/prefilter.h
+++ b/prefilter.h
@@ -173,19 +173,41 @@ void solve_vigra ( input_array_type & input ,
               nslices ) ;
 }
 
-#ifdef USE_VC
+/// overload of solve_vigra taking a single boundary condition code which is used
+/// for all axes. This also saves us a specific 1D version.
 
 template < typename input_array_type ,      // type of array with knot point data
            typename output_array_type >     // type of array for coefficients (may be the same)
-void solve_vc ( input_array_type & input ,
+void solve_vigra ( input_array_type & input ,
                    output_array_type & output ,
-                   TinyVector<bc_code,input_array_type::actual_dimension> bcv ,
+                   bc_code bc ,
                    int degree ,
                    double tolerance ,
                    int nslices = ncores )
 {
   filter_nd ( input ,
               output ,
+              TinyVector<bc_code,input_array_type::actual_dimension> ( bc ) ,
+              degree / 2 ,
+              precomputed_poles [ degree ] ,
+              tolerance ,
+              false ,
+              nslices ) ;
+}
+
+#ifdef USE_VC
+
+template < typename input_array_type ,      // type of array with knot point data
+           typename output_array_type >     // type of array for coefficients (may be the same)
+void solve_vc ( input_array_type & input ,
+                output_array_type & output ,
+                TinyVector<bc_code,input_array_type::actual_dimension> bcv ,
+                int degree ,
+                double tolerance ,
+                int nslices = ncores )
+{
+  filter_nd ( input ,
+              output ,
               bcv ,
               degree / 2 ,
               precomputed_poles [ degree ] ,
@@ -194,6 +216,28 @@ void solve_vc ( input_array_type & input ,
               nslices ) ;
 }
 
+/// overload of solve_vc taking a single boundary condition code which is used
+/// for all axes. This also saves us a specific 1D version.
+
+template < typename input_array_type ,      // type of array with knot point data
+           typename output_array_type >     // type of array for coefficients (may be the same)
+void solve_vc ( input_array_type & input ,
+                output_array_type & output ,
+                bc_code bc ,
+                int degree ,
+                double tolerance ,
+                int nslices = ncores )
+{
+  filter_nd ( input ,
+              output ,
+              TinyVector<bc_code,input_array_type::actual_dimension> ( bc ) ,
+              degree / 2 ,
+              precomputed_poles [ degree ] ,
+              tolerance ,
+              true ,
+              nslices ) ;
+}
+
 #endif
 
 /// An interlude: restoration of the original knot point data from the spline coefficients.
@@ -205,7 +249,7 @@ void solve_vc ( input_array_type & input ,
 /// border treatment manifest in the brace is used, and splines with arbitrary border conditions
 /// can be verified.
 /// TODO: bit rough and ready - restoration from braced should really only produce the inner
-/// part, not the area covered by the brace.
+/// part, not the area covered by the brace. And it's neither multithreaded nor vectorized...
 
 template < class array >
 void restore_from_braced ( array &spline , array& target , int SplineDegree )
diff --git a/remap.h b/remap.h
index 40dad3b..f958a74 100644
--- a/remap.h
+++ b/remap.h
@@ -93,6 +93,36 @@ namespace vspline {
 using namespace std ;
 using namespace vigra ;
 
+/// in st_remap() we want to derive the elementary type of a coordinate from
+/// the template argument 'coordinate_type'. We use a traits class for this purpose.
+/// per default, whatever the coordinate_type may be, take it to be the rc_type as well.
+/// This works for simple 1D coordinates like float or double.
+
+template < class coordinate_type >
+struct coordinate_traits
+{
+  typedef coordinate_type rc_type ;
+} ;
+
+/// here we have a partial specialization of class coordinate_traits for coordinates
+/// coming as vigra TinyVectors, which is used throughout for nD coordinates. Here we
+/// pick the TinyVector's value_type as our rc_type
+
+template < typename T , int N >
+struct coordinate_traits < TinyVector < T , N > >
+{
+  typedef typename vigra::TinyVector < T , N >::value_type rc_type ;
+} ;
+
+/// since remap can also operate with pre-split coordinates, we need another partial
+/// specialization of coordinate_traits for split_type, which also defines a value_type
+
+template < int N , typename IT , typename FT >
+struct coordinate_traits < split_type < N , IT , FT > >
+{
+  typedef typename split_type < N , IT , FT >::value_type rc_type ;
+} ;
+
 template < class value_type ,      // like, float, double, complex<>, pixels, TinyVectors etc.
            class coordinate_type , // usually TinyVector of real for coordinates
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
@@ -104,7 +134,7 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
            )
 {
   typedef bspline < value_type , dim_in > spline_type ;
-  typedef typename coordinate_type::value_type rc_type ;
+  typedef typename coordinate_traits < coordinate_type > :: rc_type rc_type ;
   typedef evaluator < dim_in , value_type , rc_type , int > evaluator_type ;
   typedef typename evaluator_type::ele_type ele_type ;
 
@@ -117,14 +147,14 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
 
   evaluator_type ev ( bspl ) ;
   
-  ele_type * p_workspace = new ele_type [ ev.workspace_size() ] ;
+//   ele_type * p_workspace = new ele_type [ ev.workspace_size() ] ;
 
 #ifdef USE_VC
 
   typedef typename evaluator_type::ele_v ele_v ;
   const int vsize = evaluator_type::vsize ;
 
-  ele_v * p_workspace_v = new ele_v [ ev.workspace_size() ] ;
+//   ele_v * p_workspace_v = new ele_v [ ev.workspace_size() ] ;
 
 #endif
   
@@ -160,7 +190,7 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
         // best case: output array has consecutive memory, no need to buffer
         for ( int a = 0 ; a < aggregates ; a++ , source += vsize , destination += vsize )
         {
-          ev ( source , destination , p_workspace_v ) ;
+          ev ( source , destination ) ; // , p_workspace_v ) ;
         }
         target_it += aggregates * vsize ;
       }
@@ -172,7 +202,7 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
         value_type target_buffer [ vsize ] ;
         for ( int a = 0 ; a < aggregates ; a++ , source += vsize )
         {
-          ev ( source , target_buffer , p_workspace_v ) ;
+          ev ( source , target_buffer ) ; // , p_workspace_v ) ;
           for ( int e = 0 ; e < vsize ; e++ )
           {
             *target_it = target_buffer[e] ;
@@ -196,7 +226,7 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
             source_buffer[e] = *source_it ;
             ++source_it ;
           }
-          ev ( source_buffer , destination , p_workspace_v ) ;
+          ev ( source_buffer , destination ) ; // , p_workspace_v ) ;
         }
         target_it += aggregates * vsize ;
       }
@@ -211,7 +241,7 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
             source_buffer[e] = *source_it ;
             ++source_it ;
           }
-          ev ( source_buffer , target_buffer , p_workspace_v ) ;
+          ev ( source_buffer , target_buffer ) ; // , p_workspace_v ) ;
           for ( int e = 0 ; e < vsize ; e++ )
           {
             *target_it = target_buffer[e] ;
@@ -220,7 +250,7 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
         }
       }
     }
-    delete[] p_workspace_v ;
+//     delete[] p_workspace_v ;
   }
   
 #endif // USE_VC
@@ -229,12 +259,12 @@ int st_remap ( const bspline < value_type , dim_in > & bspl ,
   while ( leftover-- )
   {
     // process leftovers with single-value evaluation
-    *target_it = ev ( *source_it , p_workspace ) ;
+    *target_it = ev ( *source_it ) ; // , p_workspace ) ;
     ++source_it ;
     ++target_it ;
   }
 
-  delete[] p_workspace ;
+//   delete[] p_workspace ;
   return 0 ;
 }
 
@@ -307,6 +337,7 @@ int remap ( MultiArrayView < dim_in , value_type > input ,
   }
 
   // create the bspline object
+  // TODO may want to specify tolerance here instead of using default
   bspline < value_type , dim_in > bsp ( input.shape() , degree , bcv ) ;
   // copy in the data
   bsp.core = input ;
@@ -339,18 +370,20 @@ using transform
 
 /// We use this type alias, since the type is rather unwieldy
 /// what we need to define is a SimdArray of rc_type, which has just as
-/// many elements as a Vc::Vector of value_type's elementary type:
+/// many elements as a simdized_type of value_type's elementary type:
 
 template < class rc_type , class value_type >
 using rc_v
-= Vc::SimdArray < rc_type ,
-                  Vc::Vector < typename ExpandElementResult < value_type >::type > ::Size > ;
-
+= typename
+   vector_traits < rc_type ,
+                   vector_traits < typename ExpandElementResult < value_type > :: type > :: vsize
+                 > :: simdized_type ;
+                 
 /// This type alias defines a vectorized coordinate transformation.
 /// it is the equivalent of the unvectorized type above, taking TinyVectors
 /// of the appropriate SimdArray objects instead of singular values.
 
-template < class rc_v ,            // SimdArray of vsize real coordinates
+template < class rc_v ,            // simdized type of vsize real coordinates
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
 using vtransform
@@ -509,6 +542,10 @@ public :
 /// a few values does no harm.
 // TODO this is utility code and might be placed into another header, probably next to
 // the routine producing successive gather indices for vectorized array traversal
+// TODO when experimenting with coordinate iterators, I found that vectorized code for
+// creating gather/scatter indexes performed worse than simply filling the index vector
+// with successive single indices generated by vigra, and using vigra's facilities
+// also simplifies matters, see the use in filter.h, ele_aggregating_filter.
 
 template < class _rc_v , int _dimension >
 struct coordinate_iterator_v
@@ -610,7 +647,7 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
   typedef evaluator < dim_in , value_type , rc_type , int > evaluator_type ;
   typedef typename evaluator_type::ele_type ele_type ;
   evaluator_type ev ( bspl ) ;
-  ele_type * p_workspace = new ele_type [ ev.workspace_size() ] ;
+//   ele_type * p_workspace = new ele_type [ ev.workspace_size() ] ;
 
 #ifdef USE_VC
 
@@ -619,7 +656,7 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
   typedef typename evaluator_type::mc_ele_v mc_ele_v ;
   const int vsize = evaluator_type::vsize ;
 
-  ele_v * p_workspace_v = new ele_v [ ev.workspace_size() ] ;
+//   ele_v * p_workspace_v = new ele_v [ ev.workspace_size() ] ;
 
   TinyVector < rc_v , dim_out > c_in ; // incoming coordinates (into output, which has dim_out dims)
   TinyVector < rc_v , dim_in > c_out ; // transformed coordinates (into input, which has dim_in dims)
@@ -649,13 +686,13 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
       if ( output.isUnstrided() )
       {
         // finally, here we evaluate the spline
-        ev ( c_out , destination , p_workspace_v ) ;
+        ev ( c_out , destination ) ; // , p_workspace_v ) ;
         destination += vsize ;
       }
       else
       {
         // alternative evaluation if we can't write straight to destination 
-        ev ( c_out , target_buffer , p_workspace_v ) ;
+        ev ( c_out , target_buffer ) ; // , p_workspace_v ) ;
         for ( int e = 0 ; e < vsize ; e++ )
         {
           target_it.get<1>() = target_buffer[e] ;
@@ -663,7 +700,7 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
         }
       }
     }
-    delete[] p_workspace_v ;
+//     delete[] p_workspace_v ;
     if ( output.isUnstrided() )
       target_it += aggregates * vsize ;
   }
@@ -679,11 +716,11 @@ int st_tf_remap ( const bspline < value_type , dim_in > & bspl ,
     // process leftovers with single-value evaluation of transformed coordinate
     cs_in = target_it.get<0>() + offset ;
     tf ( cs_in , cs_out ) ;
-    target_it.get<1>() = ev ( cs_out , p_workspace ) ;
+    target_it.get<1>() = ev ( cs_out ) ; // , p_workspace ) ;
     ++target_it ;
   }
 
-  delete[] p_workspace ;
+//   delete[] p_workspace ;
   return 0 ;
 }
 
@@ -795,9 +832,9 @@ int st_make_warp_array ( transformation < value_type , rc_type , dim_in , dim_ou
 #ifdef USE_VC
 
   typedef typename ExpandElementResult<value_type>::type ele_type ;
-  typedef Vc::Vector < ele_type > ele_v ;
+  typedef typename vector_traits < ele_type > :: simdized_type ele_v ;
   const int vsize = ele_v::Size ;
-  typedef Vc::SimdArray < rc_type , vsize > rc_v ;
+  typedef typename vector_traits < rc_type , vsize > :: simdized_type rc_v ;
 
   /// vecorized incoming coordinates (into output, which has dim_out dims)
   typedef TinyVector < rc_v , dim_out > nd_rc_in_v ;
diff --git a/vspline.doxy b/vspline.doxy
index 5f4edf5..60b89fd 100644
--- a/vspline.doxy
+++ b/vspline.doxy
@@ -769,7 +769,7 @@ FILE_PATTERNS          =
 # be searched for input files as well.
 # The default value is: NO.
 
-RECURSIVE              = NO
+RECURSIVE              = YES
 
 # The EXCLUDE tag can be used to specify files and/or directories that should be
 # excluded from the INPUT source files. This way you can easily exclude a

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git