[vspline] 09/72: more work on evaluation, class warper introduced I did some more work on class evaluator, adding code to work with separate sources of integral and real components of split coordinates. This made it possible to use a new class 'warper' in remap.h, which contains two MultiArray(View)s, one to nD integral coordinates, one to nD real remainders. I wrote the corresponding remap function (for now requiring the arrays in the warper to be unstrided) and tested, seems like all is well, but only the unvectorized code seems to profit from the new scheme.

Kay F. Jahnke kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:37 UTC 2017


This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit 25947232de3677db042a78017e4200132b15815f
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Tue Nov 8 11:31:43 2016 +0100

    more work on evaluation, class warper introduced
    I did some more work on class evaluator, adding code to work with
    separate sources of integral and real components of split coordinates.
    This made it possible to use a new class 'warper' in remap.h, which contains
    two MultiArray(View)s, one to nD integral coordinates, one to nD real remainders.
    I wrote the corresponding remap function (for now requiring the arrays in
    the warper to be unstrided) and tested, seems like all is well, but only the
    unvectorized code seems to profit from the new scheme.
---
 brace.h                     |   8 +-
 common.h                    | 116 +++++-
 eval.h                      | 335 ++++++++++++----
 example/impulse_response.cc |  18 +-
 example/roundtrip.cc        |  28 +-
 mapping.h                   | 942 ++++++++++++++++++--------------------------
 remap.h                     | 186 +++++++++
 7 files changed, 997 insertions(+), 636 deletions(-)

diff --git a/brace.h b/brace.h
index 25895e3..e729172 100644
--- a/brace.h
+++ b/brace.h
@@ -278,15 +278,17 @@ struct bracer
   /// the margin's width. But to compile, we also have to give a procedure
   /// for the other cases (not 2D), so this is first:
   
-  template < typename value_type > shift_assign ( value_type target , value_type source )
+  template < typename value_type >
+  void shift_assign ( value_type target , value_type source )
   {
     // should not ever get used, really...
   }
 
   /// specialized routine for the 2D case (the slice itself is 1D)
 
-  template < typename value_type > shift_assign ( MultiArrayView < 1 , value_type > target ,
-                                                  MultiArrayView < 1 , value_type > source )
+  template < typename value_type >
+  void shift_assign ( MultiArrayView < 1 , value_type > target ,
+                      MultiArrayView < 1 , value_type > source )
   {
     // bit sloppy here, with pathological data (very small source.size()) this will
     // be imprecise for odd sizes, for even sizes it's always fine. But then full
diff --git a/common.h b/common.h
index 8f9e720..e6e2da1 100644
--- a/common.h
+++ b/common.h
@@ -92,6 +92,30 @@ struct vector_traits
 
 #endif
 
+/// struct aggregate_traits is a traits class to help with aggregation of values
+/// into aggregates of values. The code, which is dimension-agnostic, has to
+/// operate on aggregate types as well as on singular types. With this traits class
+/// we can enable the code to pick the singular type as the 'aggregate type' when
+/// the 'aggregate' size is 1, and a vigra::TinyVector otherwise. This fixes the
+/// formation of small aggregates in one central place and removes unnecessary
+/// aggegations where N==1.
+  
+template < typename T , int N >
+struct aggregate_traits
+{
+  typedef T value_type ;
+  enum { size = N } ;
+  typedef vigra::TinyVector < T , N > type ;
+} ;
+
+template < typename T >
+struct aggregate_traits < T , 1 >
+{
+  typedef T value_type ;
+  enum { size = 1 } ;
+  typedef T type ;
+} ;
+
 namespace vspline {
 
 /// dimension-mismatch is thrown if two arrays have different dimensions
@@ -418,7 +442,7 @@ struct divide_and_conquer
     if ( data.isUnstrided() )
     {
       typedef typename vector_traits < int , vsize > :: simdized_type ic_v ;
-      typedef vigra::TinyVector < ic_v , channels > mc_ic_v ;
+      typedef typename aggregate_traits < ic_v , channels > :: type mc_ic_v ;
 
       // if data is unstrided, we can process it's contents by gather/scatter
       // operations to vectors which we process with the functor vf. This is
@@ -518,7 +542,6 @@ template < class view_type ,   // type of first array
            class view_type_2 > // type of second array
 struct divide_and_conquer_2
 {
-  typedef typename view_type::value_type value_type ;
   enum { dim = view_type::actual_dimension } ;
   typedef typename view_type::difference_type shape_type ;
 
@@ -590,6 +613,95 @@ struct divide_and_conquer_2
   }
 } ;
 
+/// divide_and_conquer_3 works just like divide_and_conquer_2, but operates on
+/// three arrays synchronously.
+
+template < class view_type ,   // type of first array
+           class view_type_2 , // type of second array
+           class view_type_3 >
+struct divide_and_conquer_3
+{
+  enum { dim = view_type::actual_dimension } ;
+  typedef typename view_type::difference_type shape_type ;
+
+  // run() takes two MultiArrayViews of the same shape, splits them
+  // in the same way, and applies a functor to each resulting pair of chunks. This is
+  // needed for functions like remap().
+  // TODO: we might even formulate a general manifold by accepting a vector of arrays,
+  // but currently there is no code in vspline which uses more than two views together.
+
+  typedef std::function < void ( view_type & ,   // chunk of data to process
+                                 view_type_2 & , // chunk of data2 to process
+                                 view_type_3 & , // chunk of data3 to process
+                                 shape_type )    // offset of chunk in 'whole' data set
+                        > chunk_func_3 ;
+
+  static void run ( view_type & data ,   // array of data to divide (into chunks)
+                    view_type_2 & data2 , // second array to coprocess with data
+                    view_type_3 & data3 , // third array to coprocess with data
+                    chunk_func_3 func ,  // functor to apply to each pair of chunks
+                    int forbid = -1 ,    // axis that should not be split (default: no restriction)
+                    int nthreads = ncores )
+  {
+    if (    view_type_2::actual_dimension != dim
+         || view_type_3::actual_dimension != dim
+       )
+      throw ( dimension_mismatch ( "all views must have the same dimension" ) ) ;
+
+    // make sure that data, data2 and data3 are shape-compatible
+    // we silently assume that we can coiterate both arrays in scan order.
+    if (    data.shape() != data2.shape()
+         || data.shape() != data3.shape() )
+    {
+      throw shape_mismatch ( "all views must have the same shape" ) ;
+    }
+
+    if ( nthreads > 1 )
+    {
+      // we split both arrays into equal chunks
+      // to coprocess them by one thread each
+
+      std::vector<view_type> dn ; // space for chunks of data
+      nthreads = split_array_to_chunks <view_type> ( data , dn , nthreads , forbid ) ;
+
+      std::vector<view_type_2> d2n ; // space for chunks of data2
+      nthreads = split_array_to_chunks <view_type_2> ( data2 , d2n , nthreads , forbid ) ;
+
+      std::vector<view_type_3> d3n ; // space for chunks of data3
+      nthreads = split_array_to_chunks <view_type_3> ( data3 , d3n , nthreads , forbid ) ;
+
+      int new_threads = nthreads - 1 ;
+      
+      std::thread * t[new_threads] ;
+      shape_type offset ;
+
+      int s ;
+      for ( s = 0 ; s < new_threads ; s++ )
+      {
+        t[s] = new std::thread ( func ,
+                                 std::ref(dn[s]) , std::ref(d2n[s]), std::ref(d3n[s]) ,
+                                 offset ) ;
+        for ( int d = 0 ; d < dim ; d ++ )
+        {
+          if ( dn[s].shape ( d ) != data.shape(d) )
+            offset[d] += dn[s].shape(d) ;
+        }
+      }
+      // process the last chunk in this thread
+      func ( std::ref(dn[s]) , std::ref(d2n[s]) , std::ref(d3n[s]), offset ) ;
+      for ( int s = 0 ; s < new_threads ; s++ )
+      {
+        t[s]->join() ;
+        delete t[s] ;
+      }
+    }
+    else // if we're single-threaded we call func in this thread
+    {
+      func ( data , data2 , data3 , shape_type() ) ;
+    }
+  }
+} ;
+
 } ; // end of namespace vspline
 
 #endif // VSPLINE_COMMON
\ No newline at end of file
diff --git a/eval.h b/eval.h
index 1fab204..5184b18 100644
--- a/eval.h
+++ b/eval.h
@@ -33,8 +33,8 @@
 
     \brief code to evaluate uniform b-splines
 
-    This body of code contains the central class evaluator and auxilliary classes
-    which are needed for it's smooth operation.
+    This body of code contains class evaluator and auxilliary classes which are
+    needed for it's smooth operation.
 
     The evaluation is a reasonably straightforward process: A subset of the coefficient
     array, containing coefficients 'near' the point of interest, is picked out, and
@@ -59,18 +59,6 @@
 #ifndef VSPLINE_EVAL_H
 #define VSPLINE_EVAL_H
 
-#include <thread>
-#include <math.h>
-#include <complex>
-#include <cmath>
-#include <iostream>
-#include <array>
-#include <assert.h>
-
-#include <vigra/multi_array.hxx>
-#include <vigra/multi_iterator.hxx>
-
-#include "basis.h"
 #include "mapping.h"
 #include "bspline.h"
 
@@ -79,6 +67,8 @@ namespace vspline {
 using namespace std ;
 using namespace vigra ;
 
+// TODO describe the use of the 'weight matrix' rather than technical details
+
 /// The routine 'calculate_weight_matrix' originates from vigra. I took the original
 /// routine BSplineBase<ORDER, T>::calculateWeightMatrix() from vigra and changed it
 /// in several ways:
@@ -192,6 +182,9 @@ struct bspline_derivative_weights
 {
   typedef typename MultiArray<2,rc_type>::iterator wm_iter ;
 
+  // TODO I would make this const, but in vigra, the iterator obtained by calling begin()
+  // on a const array is protected (multi_ierator.hxx, 438) why is this so?
+
   MultiArray<2,rc_type> weight_matrix ;
   const int degree ;
   const int derivative ;
@@ -443,17 +436,25 @@ struct alt_bspline_derivative_weight_functor
 /// The code in class evaluator begins with a sizeable constructor which sets up as much
 /// as possible to support the evaluation code to do it's job as fast as possible. Next follows
 /// the unvectorized code, finally the vectorized code.
-/// It's possible to not use class evaluator's built-in weight generation funtions, which are
-/// specific to b-spline evaluation, and instead run the core evaluation process with other
-/// weights generated in outside, presenting something like a generalized interpolator routine
-/// for separable problems with equally-sized weight vectors for the dimensions. To use this
-/// facility, calling code should use the operator() overloads taking
-/// const MultiArrayView < 2 , ... > & weight
-/// which are the final delegate before the recursive weighted sum calculation is called.
+/// There is a great variety of overloads of class evaluator's operator() to make it as versatile
+/// as possible. The strategy is to have all dependencies of the evaluation except for the actual
+/// coordinates taken care of by the constructor - and immutable for the evaluator's lifetime.
+/// The resulting object has no state which is modified after construction, making it thread-safe.
+/// The operator() variants take only coordinates (in their various guises) as arguments, relying
+/// on the state of the evaluator fixed at it's construction. The operator() overloads also
+/// form a hierarchy, as evaluation progresses from accepting unsplit real coordinates to
+/// split coordinates and finally offsets and weights. This offers calling code to handle
+/// parts of the delegation hierarchy itself, only using class evaluator at a specific level.
+/// By providing the evaluation in this way, it becomes easy for calling code to integrate
+/// the evaluation into more complex functors by using std::bind. Consider, for example, code
+/// which generates coordinates with a functor, then evaluates a b-spline at these coordinates,
+/// and finally subjects the resultant values to some postprocessing. All these processing
+/// steps can be bound into a single functor, and the calling code can be reduced to polling
+/// this functor until it has obtained the desired number of output values.
 
 // TODO: if evaluator objects are only ever used one per thread, the 'workspace' for the
-// weights might as well be made a member of class evaluator. The current implementation is
-// thread-safe, though, and would allow to use the same evaluator object from several threads.
+// weights can be made a member of class evaluator. The current implementation is thread-safe, 
+// though, and does allow to use the same evaluator object from several threads.
 
 template < int dimension ,         ///< dimension of the spline
            class value_type ,      ///< type of spline coefficients/result (pixels etc.)
@@ -492,9 +493,11 @@ public:
 
   typedef typename array_type::difference_type                     shape_type ;
 
-  /// type for a multidimensional real coordinate
+  /// types for a multidimensional real/integral coordinate
+  // TODO this is duplicating the typedef in mapping.h
 
   typedef TinyVector < rc_type , dimension >                       nd_rc_type ;
+  typedef TinyVector < ic_type , dimension >                       nd_ic_type ;
 
   /// type for a 'split' coordinate, mainly used internally in the evaluator, but
   /// also in 'warp arrays' used for remapping
@@ -549,9 +552,15 @@ public:
 
   typedef TinyVector < ic_v , channels >  mc_ic_v ;
 
+#else
+  
+  enum { vsize = 1 } ;
+  
 #endif // USE_VC
   
-  typedef nd_mapping < split_coordinate_type , dimension , vsize > nd_mapping_type ;
+  /// object used to map arbitrary incoming coordinates into the defined range
+
+  typedef nd_mapping < ic_type , rc_type , dimension , vsize > nd_mapping_type ;
 
   typedef weight_functor_base < ele_type > weight_functor_base_type ;
 
@@ -580,7 +589,7 @@ private:
   
   nd_weight_functor fweight ;       ///< set of pointers to weight functors, one per dimension
   const array_type& coefficients ;  ///< b-spline coefficient array
-  const expanded_shape_type expanded_stride ;        ///< strides in terms of expanded value_type
+  const shape_type expanded_stride ;                 ///< strides in terms of expanded value_type
   MultiArray < 1 , ic_type > offsets ;               ///< offsets in terms of value_type
   MultiArray < 1 , ic_type > component_offsets ;     ///< offsets in terms of ele_type, for vc op
   component_base_type component_base ;
@@ -590,6 +599,7 @@ private:
   const int spline_degree ;
   const int ORDER ;
   const int workspace_size ;
+  const int window_size ;
 
 public:
 
@@ -612,11 +622,12 @@ public:
     spline_degree ( _spline_degree ) ,
     ORDER ( _spline_degree + 1 ) ,
     component_view ( _coefficients.expandElements ( 0 ) ) ,
-    expanded_stride ( _coefficients.expandElements ( 0 ).stride() ) ,
+    expanded_stride ( channels * _coefficients.stride() ) ,
     wfd0 ( _spline_degree , 0 ) ,
     mmap ( _mmap ) ,             // I enforce the passing in of a mapping, even though it
                                  // may not be used at all. TODO: can I do better?
-    workspace_size ( ( _spline_degree + 1 ) * dimension )
+    workspace_size ( ( _spline_degree + 1 ) * dimension ) ,
+    window_size ( pow ( ORDER , dimension ) )
   {
     // initalize the weight functors. In this constructor, we use only bspline weight
     // functors, even though the evaluator can operate with all weight functors
@@ -646,12 +657,12 @@ public:
     // window by means of stride/shape arithmetic. Coding the window in this fashion
     // also makes it easy to vectorize the code.
     
-    int noffsets = ORDER ;
-    for ( int exp = 1 ; exp < dimension ; exp++ )
-      noffsets *= ORDER ;
+//     int noffsets = ORDER ;
+//     for ( int exp = 1 ; exp < dimension ; exp++ )
+//       noffsets *= ORDER ;
     
-    offsets = MultiArray < 1 , ptrdiff_t > ( noffsets ) ;
-    component_offsets = MultiArray < 1 , ptrdiff_t > ( noffsets ) ;
+    offsets = MultiArray < 1 , ptrdiff_t > ( window_size ) ;
+    component_offsets = MultiArray < 1 , ptrdiff_t > ( window_size ) ;
     
     // we fill the offset array in a simple fashion: we do a traversal of the window
     // and calculate the pointer difference for every element reached. We use the
@@ -773,7 +784,14 @@ public:
   /// currently the bracing takes care of the boundary conditions.
   ///  
   /// I'd write a plain function template and partially specialize it for 'level', but that's
-  /// not allowed, so I use a functor instead:
+  /// not allowed, so I use a functor instead.
+  // TODO might add code here which receives a set of weights, one per value in the window,
+  // to apply in sequence (zip with window values). The set of weights would be the tensor
+  // product of the weights along each axis. Even more involved code is thinkable where some
+  // of the tensor product is produced and some remains as residual tensor(s). The 'done'
+  // part of the product would then be applied to as many windowed values as it holds and
+  // the weighted sum fed upwards into a recursion where it's treated with a factor from
+  // the tensor for 'one level up'.
   
   template < int level , class dtype >
   struct _eval
@@ -820,29 +838,152 @@ public:
   // next are the variants of operator(). there are quite a few, since I've coded for operation
   // from different starting points and both for vectorized and nonvectorized operation.
   
-  /// variant of operator() which takes a shape and a set of weights. This is the final delegate,
+  /// variant of operator() which takes an offset and a set of weights. This is the final delegate,
   /// calling the recursive _eval method. Having the weights passed in via a const MultiArrayViev &
   /// allows calling code to provide their own weights, together with their shape, in one handy
   /// packet. And in the normal sequence of delegation inside class eval, the next routine 'up'
   /// can also package the weights nicely in a MultiArrayView.
   
-  value_type eval ( const shape_type& select ,
-                    const MultiArrayView < 2 , ele_type > & weight )
+  value_type operator() ( const ic_type& select ,
+                          const MultiArrayView < 2 , ele_type > & weight )
   {
-    const value_type * base = & coefficients [ select ] ;
+    const value_type * base = coefficients.data() + select ;
     offset_iterator ofs = offsets.begin() ;  // offsets reflects the positions inside the subarray
     return _eval<level,value_type>() ( base , ofs , weight ) ;
   }
 
+  /// alternative implementation of the final delegate. Here, we calculate the tensor
+  /// product of the component vectors in 'weight' and use flat_eval to obtain the
+  /// weighted sum over the coefficient window. Surprisingly enough, this seems to
+  /// be just as fast as the other variant.
+  /// TODO test more thoroughly, would be nice to use instead, with an additional
+  /// operator() taking self-same tensor product.
+
+  /// evaluation using the tensor product of the weight component vectors. This is simple,
+  /// no need for recursion here, it's simply the sum of all values in the window, weighted
+  /// with their corresponding weights. note that there are no checks; if there aren't enough
+  /// weights th code may crash. Unused for now, code is explicitly inlined further down.
+  
+//   template < class dtype , class weight_type >
+//   dtype flat_eval ( const dtype* & pdata ,
+//                     const weight_type * const weight )
+//   {
+//     dtype result = dtype() ;
+//     for ( int i = 0 ; i < window_size ; i++ )
+//       result += weight[i] * pdata[offsets[i]] ;
+//     return result ;
+//   }
+
+  /// calculation of the tensor product of the component vectors of the weights:
+
+  template < int _level , class dtype >
+  struct tensor_product
+  {
+    void operator() ( dtype * & target ,
+                      const MultiArrayView < 2 , dtype > & weight ,
+                      dtype factor
+                    )
+    {
+      if ( _level == level ) // both are template args, this conditional has no runtime cost
+      {
+        // if _level == level this is the outermost call, and factor is certainly
+        // 1.0. hence we can omit it:
+        for ( int i = 0 ; i < weight.shape(0) ; i++ )
+        {
+          tensor_product < _level - 1 , dtype >() ( target ,
+                                                    weight ,
+                                                    weight [ Shape2 ( i , level ) ] ) ;
+        }
+      }
+      else
+      {
+        // otherwise we need it.
+        for ( int i = 0 ; i < weight.shape(0) ; i++ )
+        {
+          tensor_product < _level - 1 , dtype >() ( target ,
+                                                    weight ,
+                                                    weight [ Shape2 ( i , level ) ] * factor ) ;
+        }
+      }
+    }
+  } ;
+  
+  template < class dtype >
+  struct tensor_product < 0 , dtype >
+  {
+    void operator() ( dtype * & target ,
+                      const MultiArrayView < 2 , dtype > & weight ,
+                      dtype factor
+                    )
+    {
+     if ( level == 0 )
+      {
+        // if level == 0, this is a 1D scenario, and all that is left is copying the weights
+        // verbatim to target. TODO: for this special case (only occuring in 1D)
+        // we could avoid this copy and use the data from 'weight' directly
+        for ( int i = 0 ; i < weight.shape(0) ; i++ )
+        {
+          *target = weight [ Shape2 ( i , 0 ) ] ;
+          ++target ;
+        }
+      }
+      else
+      {
+        for ( int i = 0 ; i < weight.shape(0) ; i++ )
+        {
+          *target = weight [ Shape2 ( i , 0 ) ] * factor ;
+          ++target ;
+        }
+      }
+    }
+  } ;
+
+//   /// operator() variant which first calculates the final weights and then applies them
+//   /// using a simple summation loop. currently unused, since it is slower.
+  
+//   // TODO: try buffering the window of coefficients as well - also on the stack.
+//   // then the final evaluation collapses to a loop like sum += *(a++) + *(b++)
+//   // this might be faster, and the coefficient fetching and the weight calculations
+//   // might even run in parallel since they are totally independent of each other.
+// 
+//   value_type alt_operator ( const ic_type& select ,
+//                             const MultiArrayView < 2 , ele_type > & weight )
+//   {
+//     const value_type * base = coefficients.data() + select ;
+//     ele_type flat_weight [ window_size ] ;
+//     ele_type * iter = flat_weight ;
+//     tensor_product < level , ele_type >() ( iter , weight , 1.0 ) ;
+//     value_type sum ;
+//     for ( int i = 0 ; i < window_size ; i++ )
+//       sum += flat_weight[i] * base[offsets[i]] ;
+//     return sum ;
+//   }
+
+  /// 'penultimate' delegate, taking a multidimensional index 'select' to the beginning of
+  /// the coefficient window to process. Here the multidimensional index is translated
+  /// into an offset from the coefficient array's base adress. Carrying the information as
+  /// a reference to a multidimensional index all the way to this point does no harm, it's
+  /// only a reference after all.
+
+  value_type operator() ( const nd_ic_type& select ,
+                          const MultiArrayView < 2 , ele_type > & weight )
+  {
+    // here's the choice of strategy TODO codify to use eiter option
+//     return alt_operator ( sum ( select * coefficients.stride() ) , weight ) ;
+    return operator() ( sum ( select * coefficients.stride() ) , weight ) ;
+  }
+
   /// variant of operator() taking the components of a split coordinate, first the shape_type
   /// representing the origin of the coefficient window to process, second the fractional parts
   /// of the coordinate which are needed to calculate the weights to apply to the coefficient
   /// window. Note that the weights aren't as many values as the window has coefficients, but
   /// only ORDER weights per dimension; the 'final' weights would be the tensor product of the
-  /// sets of weights for each dimension, which we don't explicitly use here.
+  /// sets of weights for each dimension, which we don't explicitly use here. Also note that
+  /// we limit the code here to have the same number of weights for each axis, which might be
+  /// changed to accept differently sized tensors.
   
-  value_type eval ( const shape_type& select ,
-                    const nd_rc_type& tune )
+  value_type operator() ( const nd_ic_type& select ,
+                          const nd_rc_type& tune )
   {
     // calculate the weights. We want to produce efficient storage for the weights, being
     // very much inner-loop here, but at the same time we want adequate packaging for the
@@ -853,15 +994,20 @@ public:
     MultiArrayView < 2 , ele_type >           // package them in a MultiArrayView
      weight ( Shape2 ( ORDER , dimension ) ,  // with dimension sets of ORDER weights
               weight_data ) ;                 // using the space claimed above
-    obtain_weights ( weight , tune ) ;        // obtain the weights for given 'select' and 'tune'
-    return eval ( select , weight ) ;         // delegate to the final delegate (above)
+    obtain_weights ( weight , tune ) ;        // obtain the weights for given 'tune'
+    return operator() ( select , weight ) ;   // delegate to the final delegate (above)
   }
 
-  /// this variant of eval takes a split coordinate:
+  /// TODO: I think the next few operator() overloads should go.
+  /// I'd like the notion of a 'split coordinate' as a separate entity to disappear
+  /// into a pair of parameters passed into class evaluator's operator().
+  /// I leave the code in for now bcause some of my examples use split_types
+  
+  /// this variant of operator() takes a split coordinate:
   
   value_type operator() ( const split_coordinate_type& sc )  // presplit coordinate
   {
-    return eval ( sc.select , sc.tune ) ;
+    return operator() ( sc.select , sc.tune ) ;
   }
 
   /// this variant take a coordinate which hasn't been mapped yet, so it uses
@@ -873,20 +1019,21 @@ public:
   {
     split_coordinate_type sc ;
     mmap ( c , sc ) ;  /// apply the mappings
-    return eval ( sc.select , sc.tune ) ;
+    return operator() ( sc.select , sc.tune ) ;
   }
-
+  
   /// variant taking a plain rc_type for a coordinate. only for 1D splines!
   /// This is to avoid hard-to-track errors which might ensue from allowing
-  /// broadcasting of single rc_types to nd_rc_types for D>1.
+  /// broadcasting of single rc_types to nd_rc_types for D>1. We convert the
+  /// single coordinate (of rc_type) to an aggregate of 1 to fit the signature
+  /// of the nD code above
   
   value_type operator() ( const rc_type& c )         /// coordinate
   {
     static_assert ( dimension == 1 ,
                     "evaluation at a single real coordinate is only allowed for 1D splines" ) ;
-    split_coordinate_type sc ;
-    mmap ( nd_rc_type(c) , sc ) ;  /// apply the mappings - need a nd_rc_type here
-    return eval ( sc.select , sc.tune ) ;
+    nd_rc_type cc ( c ) ;
+    return operator() ( cc ) ;
   }
 
 #ifdef USE_VC
@@ -958,39 +1105,60 @@ public:
 
   // vectorized variants of the evaluation routines:
   
-  /// this is the vectorized version of the final delegate calling _v_eval.
+  /// this is the vectorized version of the final delegate calling _v_eval. The first
+  /// argument is avector of offsets to windows of coefficients.
+  /// The second argument is a 2D array of vecorized weights.
   
+  mc_ele_v operator() ( const ic_v& select ,  // offsets to lower corners of the subarrays
+                        const MultiArrayView < 2 , ele_v > & weight ) // vectorized weights
+  {
+    // we need an instance of this iterator because it's passed into _v_eval by reference
+    // and manipulated by the code there:
+    
+    offset_iterator ofs = component_offsets.begin() ;
+    
+    // now we can call the recursive _v_eval routine
+    
+    return _v_eval < mc_ele_v , level >() ( component_base , select , ofs , weight ) ;
+  }
+
+  /// in the penultimate delegate, we move from nD coordinates to offsets
+
   mc_ele_v operator() ( const nd_ic_v& select ,  // lower corners of the subarrays
                         const MultiArrayView < 2 , ele_v > & weight ) // vectorized weights
   {
-    // first we sum up the coordinates in all dimensions into origin values. this is analogous
-    // to forming origin = & coefficients [ ... ] - coefficients.data() in unvectorized code
-    // but since we can't do vectorized adress arithmetic (TODO can we?) ...
+    // here we transform the incoming nD coordinates into offsets into the coefficient
+    // array's memory.
     // note that we use both the strides and offsets appropriate for an expanded array,
-    // and component_base has pointers to the component type.
+    // and component_base has pointers to the element type.
     
-    ic_v origin = select[0] * ic_type ( expanded_stride [ 1 ] ) ;
+    ic_v origin = select[0] * ic_type ( expanded_stride [ 0 ] ) ;
     for ( int d = 1 ; d < dimension ; d++ )
-      origin += select[d] * ic_type ( expanded_stride [ d + 1 ] ) ;
+      origin += select[d] * ic_type ( expanded_stride [ d ] ) ;
     
-    // to iterate over the positions of all coefficients in the window over which the weighted sum
-    // is formed, we use this iterator:
+    // now we delegate to the final delegate
     
-    offset_iterator ofs = component_offsets.begin() ;
-    
-    // now we can call the recursive _eval routine
-    
-    return _v_eval < mc_ele_v , level >() ( component_base , origin , ofs , weight ) ;
+    return operator() ( origin , weight ) ;
   }
 
   // again, pretty much like the unvectorized version, now using simdized data:
   
   mc_ele_v operator() ( const nd_ic_v& select ,  // lower corners of the subarrays
-                        const nd_rc_v& tune ) 
+                        const nd_rc_v& tune )    // fractional parts of the incoming coordinates
   {
+    // like in the unvectorized code, the 2D MultiArrayView to the weights is created
+    // as a view to data on the stack (weight_data) which is lightweight and fast
+    
     ele_v weight_data [ workspace_size ] ;
     MultiArrayView < 2 , ele_v > weight ( Shape2 ( ORDER , dimension ) , weight_data ) ;
+    
+    // obtain_weights is the same code for vectorized operation, the arguments
+    // suffice to pick the right template argiments
+  
     obtain_weights ( weight , tune ) ;
+    
+    // delegate to operator() above
+
     return operator() ( select , weight ) ;
   }
 
@@ -1030,6 +1198,36 @@ public:
       v_result[ch].scatter ( (ele_type*)result , mc_interleave[ch] ) ;
   }
 
+  /// this variant is roughly the same as the one above, but it receives separate
+  /// pointers for the integral and real parts of the coordinates. This is for the
+  /// case where these aspects are stored in different containers. This type of access
+  /// is more efficient, since we can use gather operations
+  
+  void operator() ( const ic_type * pi ,  // pointer to integral parts of coordinates
+                    const rc_type * pr ,  // pointer to real parts fo coordinates
+                    value_type* result )  // pointer to vsize result values
+  {
+    nd_ic_v select ;
+    nd_rc_v tune ;
+
+    // gather the integral and real coordinate parts from the pointer passed in
+    
+    for ( int d = 0 ; d < dimension ; d++ )
+    {
+      select[d].gather ( pi , nd_interleave[d] )  ;
+      tune[d].gather ( pr , nd_interleave[d] )  ;
+    }
+
+    // delegate
+    
+    mc_ele_v v_result = operator() ( select , tune ) ;
+
+    // and deposit it in the memory the caller provides
+    
+    for ( int ch = 0 ; ch < channels ; ch++ )
+      v_result[ch].scatter ( (ele_type*)result , mc_interleave[ch] ) ;
+  }
+
   /// This variant of operator() works directly on vector data (of unsplit coordinates)
   /// This burdens the calling code with (de)interleaving the data. But often the calling
   /// code performs a traversal of a large body of data and is therefore in a better position
@@ -1051,7 +1249,7 @@ public:
     result = operator() ( select , tune ) ;
   }
 
-  /// This variant operates on unsplit coordinates. Here again we require the calling function
+  /// This variant also operates on unsplit coordinates. Here again we require the calling function
   /// to pass pointers to vsize input and output values in contiguous memory. The data are
   /// (de)interleved in this routine, the actual evaluation is delegated to the variant working
   /// on vectorized data.
@@ -1063,13 +1261,16 @@ public:
     mc_ele_v v_result ;
 
     // gather the incoming (interleaved) coordinates
+    
     for ( int d = 0 ; d < dimension ; d++ )
       input[d].gather ( (const rc_type* const)pmc , nd_interleave[d] ) ;
 
     // call operator() for vectorized data
+    
     operator() ( input , v_result ) ;
 
     // and deposit it in the memory the caller provides
+    
     for ( int ch = 0 ; ch < channels ; ch++ )
       v_result[ch].scatter ( (ele_type*)result , mc_interleave[ch] ) ;
   }
diff --git a/example/impulse_response.cc b/example/impulse_response.cc
index 791f24a..657ba2b 100644
--- a/example/impulse_response.cc
+++ b/example/impulse_response.cc
@@ -120,14 +120,26 @@ int main ( int argc , char * argv[] )
     std::cout << v1[k] ;
   std::cout << " } ;" << std::endl ;
   
-  typedef vspline::evaluator < 1 , double > eval_type ; // get the appropriate evaluator type 
+  typedef vspline::evaluator < 1 , double , double > eval_type ; // get the appropriate evaluator type 
   eval_type ev ( bsp ) ;                                   // create the evaluator
-  std::cout << ev ( 5001.5 ) << std::endl ;
+//   std::cout << ev ( 5001.5 ) << std::endl ;
   
   vigra::MultiArray < 1 , double > coords ( 12 ) ;
   coords[5] = 5000.1 ;
   vigra::MultiArray < 1 , double > target ( 12 ) ;
-  vspline::remap < double , double , 1 , 1 > ( bsp , coords , target ) ;
+  typedef vspline::split_type < 1 , int , double > splt ;
+  if ( degree & 1 )
+  {
+    for ( int i = 0 ; i < 12  ; i++ )
+      target[i] = ev ( coords[i] , vspline::odd_mapping_mirror < splt , 1 > ( 12 ) ) ;
+  }
+  else
+  {
+    for ( int i = 0 ; i < 12  ; i++ )
+      target[i] = ev ( coords[i] , vspline::even_mapping_mirror < splt , 1 > ( 12 ) ) ;
+  }
+    
+//   vspline::remap < double , double , 1 , 1 > ( bsp , coords , target ) ;
   for ( auto v : target )
     std::cout << v << std::endl ;
 }
\ No newline at end of file
diff --git a/example/roundtrip.cc b/example/roundtrip.cc
index 1e3ef3c..7bc63d3 100644
--- a/example/roundtrip.cc
+++ b/example/roundtrip.cc
@@ -164,7 +164,7 @@ void roundtrip ( view_type & data ,
   typedef typename eval_type::nd_rc_type coordinate_type ;
   typedef typename eval_type::split_coordinate_type split_type ;
   
-  typedef vspline::nd_mapping < split_type , 2 , vsize > mmap ;
+  typedef vspline::nd_mapping < int , rc_type , 2 , vsize > mmap ;
   
   // obtain the mapping from the evaluator:
   mmap map = ev.get_mapping() ;
@@ -174,6 +174,8 @@ void roundtrip ( view_type & data ,
   typedef vigra::MultiArray<2, split_type> warp_array ;
   typedef vigra::MultiArrayView<2, split_type> warp_view ;
 
+  typedef vspline::warper < int , rc_type , 2 , 2 > warper_type ;
+
   int Tx = Nx ;
   int Ty = Ny ;
 
@@ -182,7 +184,8 @@ void roundtrip ( view_type & data ,
   // (fwarp) and the other storing pre-split coordinates. Also create a target array
   // to contain the result.
 
-  coordinate_array fwarp ( Shape(Tx,Ty) ) ;
+  coordinate_array fwarp ( Shape ( Tx , Ty ) ) ;
+  warper_type wwarp ( Shape ( Tx , Ty ) ) ;
   warp_array _warp ( Shape(Tx+5,Ty+4) ) ;
   warp_view warp = _warp.subarray ( Shape(2,1) , Shape(-3,-3) ) ;
   array_type target ( Shape(Tx,Ty) ) ;
@@ -205,6 +208,9 @@ void roundtrip ( view_type & data ,
         // and apply the mapping to (fx, fy), storing the result to warp[x,y]
         map ( fx , warp [ Shape ( x , y ) ] , 0 ) ;
         map ( fy , warp [ Shape ( x , y ) ] , 1 ) ;
+
+        map ( fx , wwarp.select [ Shape ( x , y ) ] [ 0 ] , wwarp.tune [ Shape ( x , y ) ] [ 0 ] , 0 ) ;
+        map ( fy , wwarp.select [ Shape ( x , y ) ] [ 1 ] , wwarp.tune [ Shape ( x , y ) ] [ 1 ] , 1 ) ;
       }
     }
   }
@@ -239,6 +245,24 @@ void roundtrip ( view_type & data ,
 #endif
   
   for ( int times = 0 ; times < TIMES ; times++ )
+    vspline::remap<pixel_type,int,rc_type,2,2>
+      ( bsp , wwarp , target , use_vc ) ;
+
+  
+#ifdef PRINT_ELAPSED
+  end = std::chrono::system_clock::now();
+  cout << "avg " << TIMES << " x remap1 from pre-split coordinates in warper object: "
+       << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / float(TIMES)
+       << " ms" << endl ;
+#endif
+       
+  check_diff<view_type> ( target , data ) ;
+  
+#ifdef PRINT_ELAPSED
+  start = std::chrono::system_clock::now();
+#endif
+  
+  for ( int times = 0 ; times < TIMES ; times++ )
     vspline::remap<pixel_type,coordinate_type,2,2>
       ( bsp , fwarp , target , use_vc ) ;
 
diff --git a/mapping.h b/mapping.h
index 5cbb85b..1c9da36 100644
--- a/mapping.h
+++ b/mapping.h
@@ -72,7 +72,7 @@
     in the evaluation: the first one determines the origin of the subarray of the braced spline
     which will be processed into the result, and the second component constitutes a
     'nd_fraction', which is needed to calculate the evaluator's weights. So we go with
-    the struct of arrays approach instead an array of structs. A slight disadvantage here is
+    the class of arrays approach instead an array of classs. A slight disadvantage here is
     that 1D split coordinates are also formulated as manifolds (TinyVector < ... , 1  >).
 
     Next we define 'mappings'. Coordinates at which the spline is to be evaluated come in
@@ -116,8 +116,8 @@ namespace vspline {
 using namespace std ;
 using namespace vigra ;
 
-typedef int default_ic_type ;
-typedef float default_rc_type ;
+// typedef int default_ic_type ;
+// typedef float default_rc_type ;
 
 // type naming scheme:
 // nd_ : multidimensional
@@ -126,15 +126,15 @@ typedef float default_rc_type ;
 // rc  : real coordinate/real component
 // value : 'payload' type, like pixel
 // ele : elementary type of val
-// _v: vecorized type, single Vc:Vector or SimdArray, or structure thereof
+// _v: vecorized type, single Vc:Vector or SimdArray, or classure thereof
 
 /// nd_ic_type is simply a TinyVector of integral numbers. These are the
 /// integral parts of the incoming real coordinates, which are used to determine the
 /// location of the window into the coefficient array which will be used to produce the
 /// evaluation result for the incoming coordinates.
 
-template < int N = 1 ,
-           typename IT = default_ic_type >
+template < int N ,
+           typename IT >
 struct nd_ic_type
 :public TinyVector < IT , N >
 {
@@ -150,8 +150,8 @@ struct nd_ic_type
 /// weights of the weighted summation of the coefficients in the window which is defined
 /// by the integral parts of the coordinates.
 
-template < int N = 1 ,
-           typename FT = default_rc_type >
+template < int N ,
+           typename FT >
 struct nd_rc_type
 :public TinyVector < FT , N >
 {
@@ -160,20 +160,20 @@ struct nd_rc_type
   enum { dimension = N } ;
 } ;
 
-/// struct split_type contains n-dimensional 'split coordinates', consisting of the
+/// class split_type contains n-dimensional 'split coordinates', consisting of the
 /// integral and fracional part of the original real coordinates, separated so that
 /// they can be processed by the evaluation routine.
 
-template < int N = 1 ,
-           typename IT = default_ic_type ,
-           typename FT = default_rc_type >
+template < int N ,
+           typename IT ,
+           typename FT >
 struct split_type
 {
   typedef IT ic_type ;
   typedef FT rc_type ;
   typedef FT value_type ;
   enum { dimension = N } ;
-  nd_ic_type<N,IT>   select ; ///< named select because it selects the range of coefficients
+  nd_ic_type<N,IT> select ; ///< named select because it selects the range of coefficients
   nd_rc_type<N,FT> tune ;   ///< named tune because it is the base for the weights 
 } ;
 
@@ -183,15 +183,15 @@ struct split_type
 /// to simply iterate over the vector's components but write 'proper' vector code to make this
 /// operation as efficient as possible.
 
-template <class real_v>
-real_v v_modf ( const real_v& source ,
-                real_v * const iptr )
+template <typename rc_v>
+rc_v v_modf ( const rc_v& source ,
+              rc_v * const iptr )
 {
-  typedef typename real_v::mask_type mask_type ;
-  typedef typename real_v::EntryType single_type ;
+  typedef typename rc_v::mask_type mask_type ;
+  typedef typename rc_v::EntryType single_type ;
   
   mask_type negative = Vc::isnegative ( source ) ;
-  real_v help ( source ) ;
+  rc_v help ( source ) ;
   
   // we treat the case that any of the incoming vector components is negative separately
   // to avoid the corresponding code altogether in the - hopefully - most common case
@@ -217,15 +217,19 @@ real_v v_modf ( const real_v& source ,
 /// for now, I'm taking the easy road to a vectorized fmod by using apply and a lambda expression
 /// TODO: handcoding this might increase performance
 
-template <class real_v>
-real_v v_fmod ( const real_v& lhs ,
-                const typename real_v::EntryType rhs )
+template <typename rc_v>
+rc_v v_fmod ( const rc_v& lhs ,
+              const typename rc_v::EntryType rhs )
 {
-  return lhs.apply ( [&rhs] ( typename real_v::EntryType lhs )
-                     {
-                       return fmod ( lhs , rhs ) ;
-                     }
-                   ) ;
+  typedef Vc::SimdArray < int , rc_v::Size > ic_v ;
+  
+  ic_v iv ( lhs / rhs ) ;
+  return lhs - rhs * rc_v ( iv ) ;
+//   return lhs.apply ( [&rhs] ( typename rc_v::EntryType lhs )
+//                      {
+//                        return fmod ( lhs , rhs ) ;
+//                      }
+//                    ) ;
 }
 
 #endif // USE_VC
@@ -262,86 +266,30 @@ real_v v_fmod ( const real_v& lhs ,
 /// I feel the sacrifice of one slice's worth of memory is worth the performance gain and increased
 /// code transparency.
 
-template < typename split_type , int vsize = 1 >
-class mapping
-{
-public:
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-
-public:
-  
-  /// we define a virtual operator(), this is what each specific mapping
-  /// has to provide. Currently all operator() variants are pure virtual
-  /// so that they fail to compile if they aren't defined 'further down'.
-  
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv ) = 0 ;
-  
-  /// operator() for this parameter signature is delegated to the first one.
-  /// while the first one operates on single values, here we have a split_type
-  /// (which is n-D), and pick out the split parts for a specific dimension
-  
-  void operator() ( real_t v , split_type& s , const int & dim )
-  {
-    operator() ( v , s.select[dim] , s.tune[dim] ) ;
-  }
-  
-#ifdef USE_VC
-
-  // we fix the simdized type by looking at vector_traits (in common.h)
-
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
-
-  /// this is the operator() for vectorized operation
-
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv ) = 0 ;
-
-#endif
-
-  /// produce the spline coordinates expessed as a floating point value.
-  /// this is to test the mappings and make sure they do what they're supposed to do.
-  
-  real_t test ( real_t v )
-  {
-    int_t iv ;
-    real_t fv ;
-    this->operator() ( v , iv , fv ) ;
-    return real_t(iv) + real_t(fv) ;
-  }
-
-} ;
-
 /// the simplest and fastest mapping is the 'raw' mapping. Here, only the split is performed,
 /// come what may. The user is responsible for the suitability of incoming coordinates, it is
 /// silently assumed that 0.0 <= v <= M-1; if this is not true, the results may be false or
 /// the program may crash.
 
-template < typename split_type , int vsize = 1 >
-class odd_mapping_raw: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class odd_mapping_raw
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-  
 public:
   
   odd_mapping_raw ( int M )
     { } ;
     
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
-    real_t fl_i ;
+    rc_t fl_i ;
     fv = modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
-    iv = int_t ( fl_i ) ;     // set integer part from float representing it
+    iv = ic_t ( fl_i ) ;     // set integer part from float representing it
   }
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -355,39 +303,34 @@ public:
 #endif
 } ;
 
-template < typename split_type , int vsize = 1 >
-class even_mapping_raw: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class even_mapping_raw
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-    
 public:
-  
+
   even_mapping_raw ( int M )
     { } ;
   
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
-    real_t fl_i ;
-    fv = modf ( v + real_t ( 0.5 ) , &fl_i ) - real_t ( 0.5 ) ;
-    iv = int_t ( fl_i ) ;     // set integer part from float representing it
+    rc_t fl_i ;
+    fv = modf ( v + rc_t ( 0.5 ) , &fl_i ) - rc_t ( 0.5 ) ;
+    iv = ic_t ( fl_i ) ;     // set integer part from float representing it
   }
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
   virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
-    v += real_t ( 0.5 ) ;
+    v += rc_t ( 0.5 ) ;
     fv = v_modf ( v , &fl_i ) ;         // split v into integral and remainder in [-0.5 ... 0.5]
-    fv -= real_t ( 0.5 ) ;
+    fv -= rc_t ( 0.5 ) ;
     iv = ic_v ( fl_i )  ;              // set integer part from float representing it
   }
 
@@ -400,15 +343,10 @@ public:
 /// the next mapping mode is LIMIT. Here any out-of-bounds coordinates are set to the
 /// nearest valid value.
 
-template < typename split_type , int vsize = 1 >
-class odd_mapping_limit: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class odd_mapping_limit
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-  
-  const real_t _ceiling ;
+  const rc_t _ceiling ;
   
 public:
   
@@ -416,7 +354,7 @@ public:
   : _ceiling ( M - 1 )
     { } ;
     
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     if ( v < 0.0 )
     {
@@ -426,19 +364,19 @@ public:
     }
    else if ( v > _ceiling )
     {
-      iv = int_t ( _ceiling ) ;
+      iv = ic_t ( _ceiling ) ;
       fv = 0.0 ;
       return ;
     }    
-    real_t fl_i ;
+    rc_t fl_i ;
     fv = modf ( v , &fl_i ) ;   // split v into integral and remainder from [0...1[
-    iv = int_t ( fl_i ) ;       // set integer part from float representing it
+    iv = ic_t ( fl_i ) ;       // set integer part from float representing it
   }
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -446,7 +384,7 @@ public:
   {
     v ( v > _ceiling ) = _ceiling ;
     
-    v ( v < real_t ( 0.0 ) ) = real_t ( 0.0 ) ;
+    v ( v < rc_t ( 0.0 ) ) = rc_t ( 0.0 ) ;
     
     rc_v fl_i ;
     fv = v_modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
@@ -456,15 +394,10 @@ public:
 #endif
 } ;
 
-template < typename split_type , int vsize = 1 >
-class even_mapping_limit: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class even_mapping_limit
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-    
-  const real_t _ceiling ;
+  const rc_t _ceiling ;
   
 public:
   
@@ -472,23 +405,23 @@ public:
   : _ceiling ( M - 1 )
     { } ;
   
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     if ( v > _ceiling )
-      v = real_t ( _ceiling ) ;
+      v = rc_t ( _ceiling ) ;
 
     else if ( v < 0 )
-      v = real_t ( 0.0 ) ;
+      v = rc_t ( 0.0 ) ;
 
-    real_t fl_i ;
-    fv = modf ( v + real_t(0.5) , &fl_i ) - real_t(0.5) ;
-    iv = int_t ( fl_i ) ;
+    rc_t fl_i ;
+    fv = modf ( v + rc_t(0.5) , &fl_i ) - rc_t(0.5) ;
+    iv = ic_t ( fl_i ) ;
   }
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -496,12 +429,12 @@ public:
   {
     v ( v > _ceiling ) = _ceiling ;
 
-    v ( v < real_t ( 0.0 ) ) = real_t ( 0.0 ) ;
+    v ( v < rc_t ( 0.0 ) ) = rc_t ( 0.0 ) ;
 
     rc_v fl_i ;
-    v += real_t(0.5) ;
+    v += rc_t(0.5) ;
     fv = v_modf ( v , &fl_i ) ;         // split v into integral and remainder in [-0.5 ... 0.5]
-    fv -= real_t(0.5) ;
+    fv -= rc_t(0.5) ;
     iv = ic_v ( fl_i )  ;              // set integer part from float representing it
   }
 
@@ -517,15 +450,10 @@ public:
 /// TODO: We might simply throw the exception and not bother with the -1 masking.
 /// The exception object might be made to contain the offending value.
 
-template < typename split_type , int vsize = 1 >
-class odd_mapping_reject: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class odd_mapping_reject
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-  
-  const real_t _ceiling ;
+  const rc_t _ceiling ;
   
 public:
   
@@ -533,21 +461,21 @@ public:
   : _ceiling ( M - 1 )
     { } ;
     
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     if ( v < 0.0 || v > _ceiling )    // reject out-of-bounds values
       throw out_of_bounds() ;
 
     // now we are sure that v is safely inside [ 0 : _ceiling ]
-    real_t fl_i ;
+    rc_t fl_i ;
     fv = modf ( v , &fl_i ) ;   // split v into integral and remainder from [0...1[
-    iv = int_t ( fl_i ) ;       // set integer part from float representing it
+    iv = ic_t ( fl_i ) ;       // set integer part from float representing it
   }
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -555,8 +483,8 @@ public:
   {
     rc_v fl_i ;
 
-    auto too_large = ( v > real_t ( _ceiling ) ) ;
-    auto too_small = ( v < real_t ( 0.0 ) ) ;
+    auto too_large = ( v > rc_t ( _ceiling ) ) ;
+    auto too_small = ( v < rc_t ( 0.0 ) ) ;
     auto mask = too_small | too_large ;
     
     if ( any_of ( mask ) )
@@ -565,7 +493,7 @@ public:
       fv = v_modf ( v , &fl_i ) ;    // split v into integral and remainder from [0...1[
       iv = ic_v ( fl_i )  ;         // set integer part from float representing it
       // set iv to -1 at out-of-bounds values. mask cast is safe as sizes match
-      iv ( typename ic_v::mask_type ( mask ) ) = int_t ( -1 ) ;
+      iv ( typename ic_v::mask_type ( mask ) ) = ic_t ( -1 ) ;
       throw out_of_bounds() ;
     }
     
@@ -580,15 +508,10 @@ public:
 /// one might argue that coordinate values from -0.5 to 0.0 and M - 1.0 to M - 0.5
 /// can be processed meaningfully and hence should not be rejected.
 
-template < typename split_type , int vsize = 1 >
-class even_mapping_reject: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class even_mapping_reject
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-    
-  const real_t _ceiling ;
+  const rc_t _ceiling ;
 
 public:
   
@@ -596,21 +519,21 @@ public:
   : _ceiling ( M - 1 )
     { } ;
   
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
-    real_t fl_i ;
+    rc_t fl_i ;
     if ( v < 0.0 || v > _ceiling )  // test range
       throw out_of_bounds() ;
 
     // split v into integral and remainder in [-0.5 ... 0.5]
-    fv = modf ( v + real_t(0.5) , &fl_i ) - real_t(0.5) ;
-    iv = int_t ( fl_i ) ;     // set integer part from float representing it
+    fv = modf ( v + rc_t(0.5) , &fl_i ) - rc_t(0.5) ;
+    iv = ic_t ( fl_i ) ;     // set integer part from float representing it
   }
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -619,7 +542,7 @@ public:
     rc_v fl_i ;
     
     auto too_large = ( v > _ceiling ) ;
-    auto too_small = ( v < real_t ( 0.0 ) ) ;
+    auto too_small = ( v < rc_t ( 0.0 ) ) ;
     auto mask = too_small | too_large ;
     
     if ( any_of ( mask ) )
@@ -628,13 +551,13 @@ public:
       fv = v_modf ( v , &fl_i ) ;    // split v into integral and remainder from [0...1[
       iv = ic_v ( fl_i )  ;         // set integer part from float representing it
       // set iv to -1 at out-of-bounds values. mask cast is safe as sizes match.
-      iv ( typename ic_v::mask_type ( mask ) ) = int_t ( -1 ) ;
+      iv ( typename ic_v::mask_type ( mask ) ) = ic_t ( -1 ) ;
       throw out_of_bounds() ;
     }
     // now we are sure that v is safely inside [ 0 : _ceiling [
-    v += real_t(0.5) ;
+    v += rc_t(0.5) ;
     fv = v_modf ( v , &fl_i ) ;           // split v into integral and remainder in [-0.5 ... 0.5]
-    fv -= real_t(0.5) ;
+    fv -= rc_t(0.5) ;
     iv = ic_v ( fl_i )  ;              // set integer part from float representing it
   }
 
@@ -654,16 +577,11 @@ public:
 ///
 /// f ( (M-1) + x ) == f ( (M-1) - x )
 
-template < typename split_type , int vsize = 1 >
-class odd_mapping_mirror: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class odd_mapping_mirror
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-  
-  const real_t _ceiling ;
-  const real_t _ceilx2 ;
+  const rc_t _ceiling ;
+  const rc_t _ceilx2 ;
   
 public:
   
@@ -675,9 +593,9 @@ public:
   // with odd splines we have to be careful not to access the coefficient matrix
   // with iv == M - 1 and this results in extra safeguarding code.
     
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
-    real_t fl_i ;
+    rc_t fl_i ;
     if ( v < 0.0 )              // apply mirror left boundary condition
       v = -v ;
     if ( v > _ceiling )
@@ -690,15 +608,15 @@ public:
     }
     // now we are sure that v is safely inside [ 0 : _ceiling ]
     fv = modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
-    iv = int_t ( fl_i ) ;     // set integer part from float representing it
+    iv = ic_t ( fl_i ) ;     // set integer part from float representing it
   }
 
 #ifdef USE_VC
 
   // vectorized version of operator()
   
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -733,16 +651,11 @@ public:
 /// for even splines is just as large as the one for the next higher odd spline -
 /// makes them less commonly used.
 
-template < typename split_type , int vsize = 1 >
-class even_mapping_mirror: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class even_mapping_mirror
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-    
-  const real_t _ceiling ;
-  const real_t _ceilx2 ;
+  const rc_t _ceiling ;
+  const rc_t _ceilx2 ;
 
 public:
   
@@ -752,9 +665,9 @@ public:
     { } ;
   
   
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
-    real_t fl_i ;
+    rc_t fl_i ;
     if ( v < 0.0 )               // apply mirror left boundary condition
       v = -v ;
     if ( v > _ceiling )
@@ -767,14 +680,14 @@ public:
     }
     // now v is <= _ceiling.
     // split v into integral and remainder in [-0.5 ... 0.5]
-    fv = modf ( v + real_t(0.5) , &fl_i ) - real_t(0.5) ;
-    iv = int_t ( fl_i ) ;     // set integer part from float representing it
+    fv = modf ( v + rc_t(0.5) , &fl_i ) - rc_t(0.5) ;
+    iv = ic_t ( fl_i ) ;     // set integer part from float representing it
   }
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
@@ -791,9 +704,9 @@ public:
       v = _ceiling - v ;                // flip
     }
     // now we are sure that v is safely inside [ 0 : _ceiling ]
-    v += real_t(0.5) ;
+    v += rc_t(0.5) ;
     fv = v_modf ( v , &fl_i ) ;         // split v into integral and remainder in [-0.5 ... 0.5]
-    fv -= real_t(0.5) ;
+    fv -= rc_t(0.5) ;
     iv = ic_v ( fl_i )  ;              // set integer part from float representing it
   }
 
@@ -809,28 +722,23 @@ public:
 /// Which fits nicely with an even spline, which has a wider defined range due to
 /// the smaller support. In that case we could possibly even save ourselves the last coefficient.
 /// The first period finishes one unit spacing beyond the location of the last knot
-/// point, so if the spline is constructed over N values, the mapping has to be constructed
+/// point, so if the spline is conclassed over N values, the mapping has to be conclassed
 /// with parameter M = N + 1. The bracing applied with the periodic bracer makes sure that
 /// coordinates up to M can be processed.
 
-template < typename split_type , int vsize = 1 >
-class odd_mapping_periodic: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class odd_mapping_periodic
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-  
-  const real_t _ceiling ;
+  const rc_t _ceiling ;
   
 public:
   
   odd_mapping_periodic ( int M )
   : _ceiling ( M - 1 ) { } ;
   
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
-    real_t fl_i ;
+    rc_t fl_i ;
     if ( v < 0.0 )
     {
       v = _ceiling + fmod ( v , _ceiling ) ; // force to period - 1 and shift to first period
@@ -840,23 +748,23 @@ public:
       v = fmod ( v , _ceiling ) ; // force to first period (this also results in v <= _ceiling)
     }
     fv = modf ( v , &fl_i ) ;   // split v into integral and remainder from [0...1[
-    iv = int_t ( fl_i ) ;       // set integer part from float representing it
+    iv = ic_t ( fl_i ) ;       // set integer part from float representing it
   }
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
   virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
-    if ( any_of ( v < real_t(0) ) || any_of ( v >= _ceiling ) )
+    if ( any_of ( v < rc_t(0) ) || any_of ( v >= _ceiling ) )
     {
       v = v_fmod ( v , _ceiling ) ;       // apply modulo (this also results in v <= _ceiling)
-      v ( v < real_t(0) ) += _ceiling ;   // shift values below zero up one period
+      v ( v < rc_t(0) ) += _ceiling ;   // shift values below zero up one period
     }
     fv = v_modf ( v , &fl_i ) ;        // split v into integral and remainder from [0...1[
     iv = ic_v ( fl_i )  ;             // set integer part from float representing it
@@ -865,24 +773,19 @@ public:
 #endif
 } ;
 
-template < typename split_type , int vsize = 1 >
-class even_mapping_periodic: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class even_mapping_periodic
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-    
-  const real_t _ceiling ;
+  const rc_t _ceiling ;
 
 public:
   
   even_mapping_periodic ( int M )
   : _ceiling ( M - 1 ) { } ;
   
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
-    real_t fl_i ;
+    rc_t fl_i ;
     
     if ( v < 0.0 )
     {
@@ -893,28 +796,28 @@ public:
       v = fmod ( v , _ceiling ) ; // force to first period
     }
     // split v into integral and remainder in [-0.5 ... 0.5]
-    fv = modf ( v + real_t(0.5) , &fl_i ) - real_t(0.5) ;
-    iv = int_t ( fl_i ) ;     // set integer part from float representing it
+    fv = modf ( v + rc_t(0.5) , &fl_i ) - rc_t(0.5) ;
+    iv = ic_t ( fl_i ) ;     // set integer part from float representing it
   }
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
   virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
-    if ( any_of ( v < real_t(0) ) || any_of ( v >= _ceiling ) )
+    if ( any_of ( v < rc_t(0) ) || any_of ( v >= _ceiling ) )
     {
       v = v_fmod ( v , _ceiling ) ;     // apply modulo
-      v ( v < real_t(0) ) += _ceiling ; // shift values below zero up one period
+      v ( v < rc_t(0) ) += _ceiling ; // shift values below zero up one period
     }
-    v += real_t(0.5) ;
+    v += rc_t(0.5) ;
     fv = v_modf ( v , &fl_i ) ;        // split v into integral and remainder in [-0.5 ... 0.5]
-    fv -= real_t(0.5) ;
+    fv -= rc_t(0.5) ;
     iv = ic_v ( fl_i )  ;             // set integer part from float representing it
   }
 
@@ -940,24 +843,19 @@ public:
 /// TODO: this is silly, since it's not really specific to natural BCs.
 /// Might call it 'clamp' instead.
 
-template < typename split_type , int vsize = 1 >
-class odd_mapping_constant: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class odd_mapping_constant
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-  
-  real_t _ceiling ;
+  const rc_t _ceiling ;
   
 public:
   
   odd_mapping_constant ( int M )
   : _ceiling ( M - 1 ) { } ;
   
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
-    real_t fl_i ;
+    rc_t fl_i ;
     if ( v < 0.0 )
     {
       iv = 0 ;               // if v is below 0.0, we pass the value for v == 0.0
@@ -966,25 +864,25 @@ public:
     }
    else if ( v > _ceiling )
     {
-      iv = int_t ( _ceiling ) ;
+      iv = ic_t ( _ceiling ) ;
       fv = 0.0 ;
       return ;
     }
     fv = modf ( v , &fl_i ) ;   // split v into integral and remainder from [0...1[
-    iv = int_t ( fl_i ) ;     // set integer part from float representing it
+    iv = ic_t ( fl_i ) ;     // set integer part from float representing it
   }
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
   virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
-    v ( v < real_t ( 0 ) ) = real_t ( 0 ) ;
+    v ( v < rc_t ( 0 ) ) = rc_t ( 0 ) ;
     v ( v > _ceiling ) = _ceiling ;
     fv = v_modf ( v , &fl_i ) ;         // split v into integral and remainder from [0...1[
     iv = ic_v ( fl_i )  ;              // set integer part from float representing it
@@ -993,22 +891,17 @@ public:
 #endif
 } ;
 
-template < typename split_type , int vsize = 1 >
-class even_mapping_constant: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class even_mapping_constant
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-    
-  real_t _ceiling ;
+  const rc_t _ceiling ;
 
 public:
   
   even_mapping_constant ( int M )
   : _ceiling ( M - 1 ) { } ;
   
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     if ( v < 0.0 )
     {
@@ -1018,31 +911,31 @@ public:
     }
    else if ( v > _ceiling )
     {
-      iv = int_t ( _ceiling ) ;
+      iv = ic_t ( _ceiling ) ;
       fv = 0.0 ;
       return ;
     }
     // split v into integral and remainder in [-0.5 ... 0.5]
-    real_t fl_i ;
-    fv = modf ( v + real_t(0.5) , &fl_i ) - real_t(0.5) ;
-    iv = int_t ( fl_i ) ;     // set integer part from float representing it
+    rc_t fl_i ;
+    fv = modf ( v + rc_t(0.5) , &fl_i ) - rc_t(0.5) ;
+    iv = ic_t ( fl_i ) ;     // set integer part from float representing it
   }
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
   virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
-    v ( v < real_t(0) ) = real_t(0) ;
+    v ( v < rc_t(0) ) = rc_t(0) ;
     v ( v > _ceiling ) = _ceiling ;
-    v += real_t(0.5) ;
+    v += rc_t(0.5) ;
     fv = v_modf ( v , &fl_i ) ;        // split v into integral and remainder in [-0.5 ... 0.5]
-    fv -= real_t(0.5) ;
+    fv -= rc_t(0.5) ;
     iv = ic_v ( fl_i )  ;             // set integer part from float representing it
   }
 
@@ -1062,16 +955,11 @@ public:
 /// coordinates, which is used in the roundtrip test, will produce the expected result.
 /// So, the left point of reflection as at -0.5, the right point of reflection at M - 0.5.
 
-template < typename split_type , int vsize = 1 >
-class odd_mapping_reflect: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class odd_mapping_reflect
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-  
-  const real_t _ceiling ;
-  const real_t _ceilx2 ;
+  const rc_t _ceiling ;
+  const rc_t _ceilx2 ;
   
 public:
   
@@ -1080,11 +968,11 @@ public:
     _ceilx2 ( M + M - 4 )
     { } ;
   
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
-    v += real_t ( 0.5 ) ;        // delete this to change back to origin at reflection
-    real_t fl_i ;
-    if ( v < real_t ( 0.0 ) )    // apply reflect left boundary condition
+    v += rc_t ( 0.5 ) ;        // delete this to change back to origin at reflection
+    rc_t fl_i ;
+    if ( v < rc_t ( 0.0 ) )    // apply reflect left boundary condition
       v = -v ;
     if ( v > _ceiling )
     {
@@ -1098,27 +986,27 @@ public:
     // which corresponds to spline coordinates [ 0.5 , _ceiling + 0.5 ]
     v += 0.5 ;
     fv = modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
-    iv = int_t ( fl_i ) ;     // set integer part from float representing it
+    iv = ic_t ( fl_i ) ;     // set integer part from float representing it
   }
   
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
   virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
-    v += real_t ( 0.5 ) ;
+    v += rc_t ( 0.5 ) ;
     v = abs ( v ) ;
     if ( any_of ( v > _ceiling ) )
     {
       v = v_fmod ( v , _ceilx2 ) ;        // map to one full period
       v ( v > _ceiling ) = _ceilx2 - v ;
     }
-    v += real_t(0.5) ;
+    v += rc_t(0.5) ;
     fv = v_modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
     iv = ic_v ( fl_i )  ;      // set integer part from float representing it
   }
@@ -1133,16 +1021,11 @@ public:
 /// be mapped to M, -0.5 which is outside the defined range. In this special case
 /// we use M-1, 0.5 instead which is equivalent and inside the range.
 
-template < typename split_type , int vsize = 1 >
-class even_mapping_reflect: public mapping < split_type , vsize >
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class even_mapping_reflect
 {
-  
-  
-  typedef typename split_type::ic_type int_t ;
-  typedef typename split_type::rc_type real_t ;
-    
-  const real_t _ceiling ;
-  const real_t _ceilx2 ;
+  const rc_t _ceiling ;
+  const rc_t _ceilx2 ;
 
 public:
   
@@ -1151,10 +1034,10 @@ public:
     _ceilx2 ( M + M )
   { } ;
   
-  virtual void operator() ( real_t v , int_t& iv , real_t& fv )
+  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
-    v += real_t ( 0.5 ) ;
-    real_t fl_i ;
+    v += rc_t ( 0.5 ) ;
+    rc_t fl_i ;
     if ( v < 0.0 )              // apply reflect left boundary condition
       v = -v ;
     if ( v > _ceiling )
@@ -1168,41 +1051,41 @@ public:
     if ( v >= _ceiling ) // guard against special == case; defensively use >= instead of ==
     {
       iv = _ceiling - 1 ;
-      fv = real_t ( 0.5 ) ;
+      fv = rc_t ( 0.5 ) ;
       return ;
     }
     // now we have v in [ 0 | _ceiling ]
     // which corresponds to spline coordinates [ -0.5 | _ceiling + 0.5 [
     // split v into integral and remainder in [-0.5 ... 0.5]
-    fv = modf ( v , &fl_i ) - real_t(0.5) ;
-    iv = int_t ( fl_i ) ;     // set integer part from float representing it
+    fv = modf ( v , &fl_i ) - rc_t(0.5) ;
+    iv = ic_t ( fl_i ) ;     // set integer part from float representing it
   }
 
 #ifdef USE_VC
 
-  typedef typename vector_traits < int_t , vsize > :: simdized_type ic_v ;
-  typedef typename vector_traits < real_t , vsize > :: simdized_type rc_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
 
   /// this is the operator() for vectorized operation
 
   virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
-    v += real_t ( 0.5 ) ;
+    v += rc_t ( 0.5 ) ;
     v = abs ( v ) ;
     if ( any_of ( v > _ceiling ) )
     {
       v = v_fmod ( v , _ceilx2 ) ;        // map to one full period
       v ( v > _ceiling ) = _ceilx2 - v ;
     }
-    fv = v_modf ( v , &fl_i ) - real_t(0.5) ;
+    fv = v_modf ( v , &fl_i ) - rc_t(0.5) ;
     iv = ic_v ( fl_i )  ;      // set integer part from float representing it
     auto mask = ( v >= _ceiling ) ;  // guard against special == case; defensively use >= instead of ==
     if ( any_of ( mask ) )
     {
       // mask cast is safe as sizes match
       iv ( typename ic_v::mask_type ( mask ) ) = _ceiling - 1 ;
-      fv ( mask ) = real_t ( 0.5 ) ;
+      fv ( mask ) = rc_t ( 0.5 ) ;
     }
   }
 
@@ -1212,264 +1095,232 @@ public:
 /// create a mapping for an odd-degree spline given a BC code, the spline degree,
 /// and the extent along the axis in question
 
-template < typename split_type , int vsize = 1 >
-mapping < split_type , vsize > * create_odd_mapping ( bc_code bc , int spline_degree , int M )
+template < typename ic_t ,
+           typename rc_t ,
+           int vsize ,
+           typename map_func >
+map_func pick_mapping ( bc_code bc , int spline_degree , int M )
 {
-  switch ( bc )
+  if ( spline_degree & 1 )
   {
-    case RAW :
-    {
-      return new odd_mapping_raw < split_type , vsize > ( M ) ;
-      break ;
-    }
-    case LIMIT :
-    {
-      return new odd_mapping_limit < split_type , vsize > ( M ) ;
-      break ;
-    }
-    case REJECT :
-    {
-      return new odd_mapping_reject < split_type , vsize > ( M ) ;
-      break ;
-    }
-    case CONSTANT :
-    case NATURAL :
-    {
-      return new odd_mapping_constant < split_type , vsize > ( M ) ;
-      break ;
-    }
-    case MIRROR :
+    switch ( bc )
     {
-      return new odd_mapping_mirror < split_type , vsize > ( M ) ;
-      break ;
-    }
-    case REFLECT :
-    case SPHERICAL :
-    {
-      return new odd_mapping_reflect < split_type , vsize > ( M + 2 ) ;
-      break ;
-    }
-    case PERIODIC :
-    {
-      return new odd_mapping_periodic < split_type , vsize > ( M + 1 ) ;
-      break ;
+      case RAW :
+      {
+        return odd_mapping_raw < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case LIMIT :
+      {
+        return odd_mapping_limit < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case REJECT :
+      {
+        return odd_mapping_reject < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case CONSTANT :
+      case NATURAL :
+      {
+        return odd_mapping_constant < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case MIRROR :
+      {
+        return odd_mapping_mirror < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case REFLECT :
+      case SPHERICAL :
+      {
+        return odd_mapping_reflect < ic_t , rc_t , vsize > ( M + 2 ) ;
+        break ;
+      }
+      case PERIODIC :
+      {
+        return odd_mapping_periodic < ic_t , rc_t , vsize > ( M + 1 ) ;
+        break ;
+      }
+      default:
+      {
+        // TODO: throw exception instead?
+        cerr << "mapping for BC code " << bc_name[bc] << " is not supported" << endl ;
+        return odd_mapping_reject < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
     }
-    default:
+  }
+  else
+  {
+    switch ( bc )
     {
-      // TODO: throw exception instead?
-      cerr << "mapping for BC code " << bc_name[bc] << " is not supported" << endl ;
-      return new odd_mapping_reject < split_type , vsize > ( M ) ;
-      break ;
+      case RAW :
+      {
+        return even_mapping_raw < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case LIMIT :
+      {
+        return even_mapping_limit < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case REJECT :
+      {
+        return even_mapping_reject < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case NATURAL :
+      case CONSTANT :
+      {
+        return even_mapping_constant < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case MIRROR :
+      {
+        return even_mapping_mirror < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case REFLECT :
+      case SPHERICAL :
+      {
+        return even_mapping_reflect < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case PERIODIC :
+      {
+        return even_mapping_periodic < ic_t , rc_t , vsize > ( M + 1 ) ;
+        break ;
+      }
+      default:
+      {
+        // TODO: throw exception instead?
+        cerr << "mapping for BC code " << bc_name[bc] << " is not supported" << endl ;
+        return even_mapping_reject < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
     }
   }
 }
 
-/// create a mapping for an even-degree spline given a BC code, the spline degree,
-/// and the extent along the axis in question
+/// struct mapping contains a std::function for singular mappings, and, if USE_VC
+/// is defined, a std::function for vectorized mappings. The specific function(s) used
+/// are determined by class pick_mapping.
 
-template < typename split_type , int vsize = 1 >
-mapping < split_type , vsize > * create_even_mapping ( bc_code bc , int spline_degree , int M )
+template < typename ic_t , typename rc_t , int vsize = 1 >
+struct mapping
 {
-  switch ( bc )
+  typedef std::function < void ( rc_t v , ic_t & iv , rc_t & fv ) > map_func ;
+  map_func map ;
+
+#ifdef USE_VC
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef std::function < void ( rc_v , ic_v & , rc_v & ) > map_func_v ;
+  map_func_v map_v ;
+#endif
+  
+  mapping ( bc_code bc , int spline_degree , int M )
   {
-    case RAW :
-    {
-      return new even_mapping_raw < split_type , vsize > ( M ) ;
-      break ;
-    }
-    case LIMIT :
-    {
-      return new even_mapping_limit < split_type , vsize > ( M ) ;
-      break ;
-    }
-    case REJECT :
-    {
-      return new even_mapping_reject < split_type , vsize > ( M ) ;
-      break ;
-    }
-    case NATURAL :
-    case CONSTANT :
-    {
-      return new even_mapping_constant < split_type , vsize > ( M ) ;
-      break ;
-    }
-    case MIRROR :
-    {
-      return new even_mapping_mirror < split_type , vsize > ( M ) ;
-      break ;
-    }
-    case REFLECT :
-    case SPHERICAL :
-    {
-      return new even_mapping_reflect < split_type , vsize > ( M ) ;
-      break ;
-    }
-    case PERIODIC :
-    {
-      return new even_mapping_periodic < split_type , vsize > ( M + 1 ) ;
-      break ;
-    }
-    default:
-    {
-      // TODO: throw exception instead?
-      cerr << "mapping for BC code " << bc_name[bc] << " is not supported" << endl ;
-      return new even_mapping_reject < split_type , vsize > ( M ) ;
-      break ;
-    }
+    map = pick_mapping < ic_t , rc_t , vsize , map_func > ( bc , spline_degree , M ) ;
+#ifdef USE_VC
+    map_v = pick_mapping < ic_t , rc_t , vsize , map_func_v > ( bc , spline_degree , M ) ;
+#endif
   }
-}
+  
+  mapping()
+  { } ;
+} ;
 
 /// class nd_mapping handles a set of mappings, one per axis.
 /// It provides the same operations as the 1D mapping class; for 1D use the axis
 /// to which the mapping is applied is determined by an additional parameter,
-/// axis. Internally, the mappings are accessed via a pointer to the base class.
-/// The additional routines (without exis parameter) iterate over all axes.
+/// axis.
+/// The additional routines (without axis parameter) iterate over all axes.
 /// Since in this object we keep base class pointers to the actual mappings,
 /// we need additional code to assure proper deletion of the mapping objects,
 /// which are created by new. Hence the copy constructor and assignment operator.
 
-template < typename split_type , int dimension , int vsize = 1 >
-class nd_mapping
+template < typename ic_t , typename rc_t , int dimension , int vsize = 1 >
+struct nd_mapping
 {
-public:
-  
-  typedef mapping < split_type , vsize > mapping_type ;
-
-  typedef typename mapping_type::int_t int_t ;
-  typedef typename mapping_type::real_t real_t ;
+  typedef mapping < ic_t , rc_t , vsize > mapping_type ;
   typedef TinyVector < bc_code , dimension > bcv_type ;
-  typedef TinyVector < MultiArrayIndex , dimension > shape_type ;
-  typedef TinyVector < real_t , dimension > nd_real_t ;
-  typedef TinyVector < int_t , dimension > nd_int_t ;
+  typedef TinyVector < rc_t , dimension > nd_rc_t ;
+  typedef TinyVector < ic_t , dimension > nd_ic_t ;
+  typedef split_type < dimension , ic_t , rc_t > split_t ;
 
 #ifdef USE_VC
 
-  typedef typename mapping_type::rc_v real_v ;
-  typedef typename mapping_type::ic_v int_v ;
-  typedef TinyVector < real_v , dimension > nd_real_v ;
-  typedef TinyVector < int_v , dimension > nd_int_v ;
+  typedef typename vector_traits < ic_t , vsize > :: simdized_type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: simdized_type rc_v ;
+  typedef TinyVector < ic_v , dimension > nd_ic_v ;
+  typedef TinyVector < rc_v , dimension > nd_rc_v ;
 
 #endif
 
-  TinyVector < mapping_type* , dimension > map ; // container for the mappings
+  TinyVector < mapping_type , dimension > map ; // container for the mappings
   bcv_type bcv ;
   int spline_degree ;
-  shape_type shape ;
+  nd_ic_t shape ;
   
   /// 'standard' constructor for a nd_mapping. This constructor takes the
   /// values which a nd_mapping requires for it's operation directly, and the
   /// other constructors delegate to it.
 
-  nd_mapping ( const bcv_type&  _bcv ,
-                  const int&        _spline_degree ,
-                  const shape_type& _shape  )
+  nd_mapping ( const bcv_type&   _bcv ,
+               const int&        _spline_degree ,
+               const nd_ic_t&    _shape  )
   : bcv ( _bcv ) ,
     spline_degree ( _spline_degree ) ,
     shape ( _shape )
   {
     for ( int d = 0 ; d < dimension ; d++ )
-    {
-      if ( spline_degree & 1 )
-        map[d] = create_odd_mapping < split_type , vsize > ( bcv[d] , spline_degree , shape[d] ) ;
-      else
-        map[d] = create_even_mapping < split_type , vsize > ( bcv[d] , spline_degree , shape[d] ) ;
-    }
+      map [ d ] = mapping_type ( bcv[d] , spline_degree , shape[d] ) ;
   }
   
+  nd_mapping()
+  { } ;
+  
   /// convenience variant taking a single boundary condition code, which is used for all axes
   
   nd_mapping ( const bc_code&    bc ,
                const int&        _spline_degree ,
-               const shape_type& _shape  )
+               const nd_ic_t&    _shape  )
   : nd_mapping ( bcv_type ( bc ) , spline_degree , _shape )
   {
   } ;
   
-  /// since we have members which need delete, we need a copy constructor
-  
-  nd_mapping ( const nd_mapping& other )
-  : nd_mapping ( other.bcv , other.spline_degree , other.shape )
-  { } ;
-  
-//   /// convenience variant constructing a nd_mapping from a bspline object
-//   /// A bspline object has all components needed to construct a nd_mapping:
-//   /// It has a set of boundary condition codes (those the spline was constructed with),
-//   /// the spline's degree and the core shape of it's coefficient array, giving us the limits
-//   /// for the mapping. Creating the mapping like this keeps us from using mapping modes
-//   /// RAW, LIMIT and REJECT, since these are only codes for mappings, not for boundary
-//   /// conditions a spline can be constructed with. But all the BC codes used for spline
-//   /// construction, like MIRROR, REFLECT, NATURAL and PERIODIC have corresponding
-//   /// mappings.
-// 
-//   template < class bspline >
-//   nd_mapping ( const bspline & bspl )
-//   : nd_mapping ( bspl.bcv ,
-//                  bspl.spline_degree ,
-//                  bspl.core_shape )
-//   { } ;
-  
-  /// assignment operator, for the same reason
-  
-  nd_mapping& operator= ( const nd_mapping& other )
-  {
-    bcv = other.bcv ;
-    shape = other.shape ;
-    spline_degree = other.spline_degree ;
-    for ( int d = 0 ; d < dimension ; d++ )
-    {
-      if ( map[d] )
-        delete map[d] ;
-      if ( spline_degree & 1 )
-        map[d] = create_odd_mapping < split_type , vsize > ( bcv[d] , spline_degree , shape[d] ) ;
-      else
-        map[d] = create_even_mapping < split_type , vsize > ( bcv[d] , spline_degree , shape[d] ) ;
-    }
-  }
-  
-  /// the destructor deletes the mapping objects. If the nd_mapping was
-  /// default-constructed, these will be 0, so we have to safeguard against this case.
-  /// It would be nice if we could provide a default constructor, but there is no
-  /// sensible default shape or spline degree we could provide.
-  
-  ~nd_mapping()
-  {
-    for ( int d = 0 ; d < dimension ; d++ )
-    {
-      if ( map[d] )
-        delete map[d] ;
-    }
-  }
-  
   /// apply the mapping along axis 'axis' to coordinate v, resulting in the setting
   /// of the integral part iv and the fraczional part fv
   
-  void operator() ( real_t v , int_t& iv , real_t& fv , const int & axis )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv , const int & axis )
   {
-    ( * ( map [ axis ] ) ) ( v , iv , fv ) ;
+    map [ axis ] . map ( v , iv , fv ) ;
   }
   
   /// same operation, but along all axes, taking a multi-coordinate and setting
   /// the corresponding n-dimensional objects for the integral and fractional parts
   
-  void operator() ( nd_real_t v , nd_int_t& iv , nd_real_t& fv )
+  void operator() ( nd_rc_t v , nd_ic_t& iv , nd_rc_t& fv )
   {
     for ( int axis = 0 ; axis < dimension ; axis++ )
-      ( * ( map [ axis ] ) ) ( v[axis] , iv[axis] , fv[axis] ) ;
+      map [ axis ] . map ( v[axis] , iv[axis] , fv[axis] ) ;
   }
-  
+
   /// different signature, now we handle a split_type object as the operation's target
 
-  void operator() ( real_t v , split_type& s , const int & axis )
+  void operator() ( rc_t v , split_t& s , const int & axis )
   {
-    ( * ( map [ axis ] ) ) ( v , s.select[axis] , s.tune[axis] ) ;
+    map [ axis ] . map ( v , s.select[axis] , s.tune[axis] ) ;
   }
   
   /// the same in nD
   
-  void operator() ( nd_real_t v , split_type& s )
+  void operator() ( nd_rc_t v , split_t& s )
   {
     for ( int axis = 0 ; axis < dimension ; axis++ )
-      ( * ( map [ axis ] ) ) ( v[axis] , s.select[axis] , s.tune[axis] ) ;
+      map [ axis ] . map ( v[axis] , s.select[axis] , s.tune[axis] ) ;
   }
   
 #ifdef USE_VC
@@ -1477,82 +1328,55 @@ public:
   /// finally, operation on Vc vectors of incoming coordinates. Again, first the
   /// 1D version with the 'axis' parameter
   
-//   void operator() ( Vc::Vector < real_t > v ,
-//                     Vc::SimdArray < int_t , Vc::Vector<real_t>::size() >& iv ,
-//                     Vc::Vector < real_t >& fv ,
-//                     const int & axis )
-  void operator() ( real_v v ,
-                    int_v & iv ,
-                    real_v & fv ,
+  void operator() ( rc_v v ,
+                    ic_v & iv ,
+                    rc_v & fv ,
                     const int & axis )
   {
-    ( * ( map [ axis ] ) ) ( v , iv , fv ) ;
+    map [ axis ] . map_v ( v , iv , fv ) ;
   }
   
   /// and the nD version, operating on vector aggregates
   
-  void operator() ( nd_real_v v ,
-                    nd_int_v & iv ,
-                    nd_real_v & fv )
+  void operator() ( nd_rc_v v ,
+                    nd_ic_v & iv ,
+                    nd_rc_v & fv )
   {
     for ( int axis = 0 ; axis < dimension ; axis++ )
-      ( * ( map [ axis ] ) ) ( v[axis] , iv[axis] , fv[axis] ) ;
+      map [ axis ] . map_v ( v[axis] , iv[axis] , fv[axis] ) ;
   }
 
 #endif // USE_VC
 } ;
 
-/*
-void mapping_test()
-{
-  float in[] = { -1.0 , -0.5 , 0.0 , 0.5 , 8.5 , 9.0 , 9.4999 , 9.5 , 9.501 , 10.0 , 10.5 , 11.0 , 11.5 , 12.0 } ;
-  const int innum = sizeof(in) / sizeof(float) ;
-  
-  typedef split_type<> st ;
-  
-  mapping<st,innum> * pmap[] = { new odd_mapping_mirror<st,innum>(10) ,
-                             new even_mapping_mirror<st,innum>(10) ,
-                             new odd_mapping_periodic<st,innum>(10) ,
-                             new even_mapping_periodic<st,innum>(10) ,
-                             new odd_mapping_reflect<st,innum>(12) ,
-                             new even_mapping_reflect<st,innum>(10) ,
-                             new odd_mapping_constant<st,innum>(10) ,
-                             new even_mapping_constant<st,innum>(10) } ;
-  char * nmap[] = { "odd_mapping_mirror(10)" ,
-                    "even_mapping_mirror(10)" ,
-                    "odd_mapping_periodic(10)" ,
-                    "even_mapping_periodic(10)" ,
-                    "odd_mapping_reflect(12)" ,
-                    "even_mapping_reflect(10)" ,
-                    "odd_mapping_constant(10)" ,
-                    "even_mapping_constant(10)" } ;
- mapping<st,innum>::int_t oi ;
- mapping<st,innum>::real_t of ;
- mapping<st,innum>::ic_v v_oi ;
- mapping<st,innum>::rc_v v_of ;
- mapping<st,innum>::rc_v v_in ;
- 
- for ( int i = 0 ; i < sizeof ( pmap ) / sizeof ( mapping<st,innum>* ) ; i++ )
- {
-   cout << "testing " << nmap[i] << endl ;
-   for ( int k = 0 ; k < sizeof ( in ) / sizeof ( float ) ; k++ )
-   {
-     (*(pmap[i])) ( in[k] , oi , of ) ;
-     cout << in[k] << " -> " << oi << ", " << of << endl ;
-   }
- }
- for ( int i = 0 ; i < sizeof ( pmap ) / sizeof ( mapping<st,innum>* ) ; i++ )
- {
-   cout << "testing " << nmap[i] << endl ;
-//    for ( int k = 0 ; k < sizeof ( in ) / sizeof ( float ) ; k++ )
-//    {
-     v_in = mapping<st,innum>::rc_v ( in ) ;
-     (*(pmap[i])) ( v_in , v_oi , v_of ) ;
-     cout << v_in << " -> " << v_oi << ", " << v_of << endl ;
-//    }
- }
-}
-*/
+// void mapping_test()
+// {
+//   float in[] = { -1.0 , -0.5 , 0.0 , 0.5 , 8.5 , 9.0 , 9.4999 , 9.5 , 9.501 , 10.0 , 10.5 , 11.0 , 11.5 , 12.0 } ;
+//   const int innum = sizeof(in) / sizeof(float) ;
+//   bc_code test_bc[] = { MIRROR , PERIODIC , REFLECT , NATURAL } ;
+//   const int vsize = 8 ;
+//   for ( auto bc : test_bc )
+//   {
+//     mapping < int , float , vsize > m ( bc , 3 , 10 ) ;
+//     for ( auto v : in )
+//     {
+//       int i ;
+//       float f ;
+//       m.map ( v , i , f ) ;
+//       std::cout << "in: " << v << " -> i " << i << " f " << f << std::endl ;
+// #ifdef USE_VC
+//       typedef typename vector_traits < int , vsize > :: simdized_type ic_v ;
+//       typedef typename vector_traits < float , vsize > :: simdized_type rc_v ;
+//       rc_v vv ( v ) ;
+//       ic_v iv ;
+//       rc_v fv ;
+//       m.map_v ( vv , iv , fv ) ;
+//       std::cout << "in: " << vv << " -> i " << iv << " f " << fv << std::endl ;
+// #endif      
+//     }
+//   }
+// }
+
 } ; // end of namespace vspline
 
 #endif // VSPLINE_MAPPING_H
diff --git a/remap.h b/remap.h
index f39b29a..f9acdd1 100644
--- a/remap.h
+++ b/remap.h
@@ -346,6 +346,192 @@ int remap ( MultiArrayView < dim_in , value_type > input ,
   return 0 ;
 }
 
+/// definition of an object containing separate arrays for the components of split coordinates.
+/// This way the data can be accessed by gather/scatter operations, which should be more efficient
+/// than having these data in interleaved format, where, due to their mixed nature, we have to
+/// 'manually' de/interleave them. In my tests, this improved performance in unvectorized code,
+/// while for vectorized code it made no noticeable difference.
+// TODO: runs, but needs more testing
+
+template < class ic_type , class rc_type , int dimension , int coordinate_dimension >
+class warper
+{
+public:
+  
+  typedef vigra::TinyVector < ic_type , coordinate_dimension > nd_ic_type ;
+  typedef vigra::TinyVector < rc_type , coordinate_dimension > nd_rc_type ;
+
+  vigra::MultiArray < dimension , nd_ic_type > _select ;
+  vigra::MultiArray < dimension , nd_rc_type > _tune ;
+  
+public:
+  
+  vigra::MultiArrayView < dimension , nd_ic_type > select ;
+  vigra::MultiArrayView < dimension , nd_rc_type > tune ;
+
+  typedef typename vigra::MultiArrayShape < dimension > :: type shape_type ;
+  
+public:
+  
+  /// constructor taking a shape argument. This creates the containers for integral and
+  /// remainder parts of the coordinates internally.
+
+  warper ( const shape_type & shape )
+  : _select ( shape ) ,
+    _tune ( shape )
+  {
+    select = _select ;
+    tune = _tune ;
+  } ;
+  
+  /// constructor taking MultiArrayViews to externally managed containers
+
+  warper ( vigra::MultiArrayView < dimension , nd_ic_type > & _select ,
+           vigra::MultiArrayView < dimension , nd_rc_type > & _tune )
+  {
+    select ( _select ) ;
+    tune ( _tune ) ;
+  } ;
+} ;  
+
+/// single-threaded remap taking separate arrays 'tune' and 'select', which are chunks of
+/// the arrays in a warper object. This routine is called by the one below it via
+/// divide_and_conquer_3.
+
+template < class value_type ,      // like, float, double, complex<>, pixels, TinyVectors etc.
+           class nd_ic_type , 
+           class nd_rc_type ,
+           int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
+           int dim_out >           // number of dimensions of output array
+int st_wremap ( const bspline < value_type , dim_in > & bspl ,
+                MultiArrayView < dim_out , nd_ic_type > select ,
+                MultiArrayView < dim_out , nd_rc_type > tune ,
+                MultiArrayView < dim_out , value_type > output ,
+                bool use_vc = true
+           )
+{
+  typedef bspline < value_type , dim_in > spline_type ;
+  typedef typename nd_rc_type::value_type rc_type ;
+  typedef typename nd_ic_type::value_type ic_type ;
+  typedef evaluator < dim_in , value_type , rc_type , ic_type > evaluator_type ;
+  typedef typename evaluator_type::ele_type ele_type ;
+
+  evaluator_type ev ( bspl ) ;
+  
+#ifdef USE_VC
+
+  typedef typename evaluator_type::ele_v ele_v ;
+  const int vsize = evaluator_type::vsize ;
+
+#endif
+  
+  auto select_it = select.begin() ;
+  auto tune_it = tune.begin() ;
+  auto target_it = output.begin() ;
+  
+  int leftover = select.elementCount() ;
+
+#ifdef USE_VC
+
+  if ( use_vc )
+  {
+    int aggregates = select.elementCount() / vsize ;            // number of full vectors
+    leftover = select.elementCount() - aggregates * vsize ;     // any leftover single values
+    nd_ic_type * pselect = select.data() ;                      // acces to memory
+    nd_rc_type * ptune = tune.data() ;                          // acces to memory
+    value_type * destination = output.data() ;
+
+    // for now we require that both select and tune are unstrided!
+    assert ( select.isUnstrided() && tune.isUnstrided() ) ;
+    
+    if ( output.isUnstrided() )
+    {
+      // best case: output array has consecutive memory, no need to buffer
+      for ( int a = 0 ;
+           a < aggregates ;
+           a++ , pselect += vsize , ptune += vsize , destination += vsize )
+      {
+        ev ( (ic_type*) pselect , (rc_type*) ptune , destination ) ;
+      }
+      target_it += aggregates * vsize ;
+    }
+    else
+    {
+      // we fall back to buffering and storing individual result values
+      // TODO while this is a straightforward implementation, it should be more efficient
+      // to (de)interleave here and call the fully vectorized evaluation code.
+      value_type target_buffer [ vsize ] ;
+      for ( int a = 0 ; a < aggregates ; a++ , pselect += vsize , ptune += vsize )
+      {
+        ev ( (ic_type*) pselect , (rc_type*) ptune , target_buffer ) ;
+        for ( int e = 0 ; e < vsize ; e++ )
+        {
+          *target_it = target_buffer[e] ;
+          ++target_it ;
+        }
+      }
+    }
+    select_it += aggregates * vsize ;
+    tune_it += aggregates * vsize ;
+  }
+  
+#endif // USE_VC
+
+  // process leftovers, if any - if vc isn't used, this loop does all the processing
+  while ( leftover-- )
+  {
+    // process leftovers with single-value evaluation
+    *target_it = ev ( *select_it , *tune_it ) ;
+    ++select_it ;
+    ++tune_it ;
+    ++target_it ;
+  }
+
+  return 0 ;
+}
+
+template < class value_type ,      // like, float, double, complex<>, pixels, TinyVectors etc.
+           class ic_type ,         // integral coordinate part
+           class rc_type ,         // real coordinate part
+           int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
+           int dim_out >           // number of dimensions of output array
+int remap ( const bspline < value_type , dim_in > & bspl ,
+            warper < ic_type , rc_type , dim_out , dim_in > & warp ,
+            MultiArrayView < dim_out , value_type > output ,
+            bool use_vc = true ,
+            int nthreads = ncores
+           )
+{
+  using namespace std::placeholders ;
+  
+  typedef warper < ic_type , rc_type , dim_out , dim_in > warper_type ;
+  typedef typename warper_type::nd_ic_type nd_ic_type ;
+  typedef typename warper_type::nd_rc_type nd_rc_type ;
+
+  // we use bind to bind the first and last argument of the single-threaded
+  // remap function, leaving two free arguments which will accept the two arrays
+  // of data to coiterate over (being the chunks of warp and output which are created
+  // by divide_and_conquer_2)
+
+  auto chunk_func_3
+  = std::bind ( st_wremap < value_type , nd_ic_type , nd_rc_type , dim_in , dim_out > ,
+                std::ref(bspl) ,     // bspline passed in above
+                _1 ,       // placeholders to accept data chunks
+                _2 ,       // from divide_and_conquer
+                _3 ,
+                use_vc ) ; // use_vc flag as passed in above
+
+  divide_and_conquer_3 < decltype ( warp.select ) ,
+                         decltype ( warp.tune ) ,
+                         decltype ( output ) >
+  :: run ( warp.select ,                   // on the warp array
+           warp.tune ,
+           output ,                        // and the output array
+           chunk_func_3 ,                  // applying the functor we've made with bind
+           -1 ,                            // allowing array splitting along any axis
+           nthreads ) ;                    // using nthreads threads
+} ;
+
 // Next we have some collateral code to get ready for the transformation-based remap
 
 /// type alias for a coordinate transformation functor, to (hopefully) make the signature

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git



More information about the debian-science-commits mailing list