Sun Jul 2 09:02:38 UTC 2017

This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit a98fa86e0539de74844b9ba424cdf356b606bb67
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Sun Dec 4 11:29:01 2016 +0100

    polishing, new bc code TAG, remap now takes evaluator, not bspline
    Initially I had plans to have a pure virtual base class interpolator, specifying
    an interface for interpolating code, and inherit this interface in class
    evaluator. For now I've abandoned this path. what I have changed, though, is
    the remap routines: they now take an evaluator object instead of a bspline.
    This is a step in the same direction; a bspline object is pretty much what
    I had in mind for an 'interpolator' type object, so I may proceed from
    here to introduce the pure virtual base. What's the reason for this?
    After all it burdens the calling code with an extra step (to create the
    evaluator)? the idea is to be able to use interpolators other than
    b-splines in the remap routines, which only require objects able to
    produce an interpolated value for a coordinate and not specifically only
    b-splines. So the remap code can become more general.
    I also worked on the mapping code. For the mappings REJECT, LIMIT, and the
    new mapping TAG there are now also variants (REJECTP, LIMITP and TAGP)
    which reject with a higher ceiling, akin to PERIODIC mappings.
    Another idea which hasn't made it into the code base is to capture the
    sequence coordinate->split coordinate->offset+deltas in eval.h in separate
    code and have less variants of eval/v_eval. So far this isn't working
    satisfactorily, so I left it out.
---
 README.rst                  |   4 +-
 brace.h                     |   8 +-
 bspline.h                   |  61 ++++++---
 common.h                    |  41 ++++--
 eval.h                      |  88 +++++++++----
 example/impulse_response.cc |  37 +++---
 example/pano_extract.cc     |   9 +-
 example/roundtrip.cc        |  10 +-
 filter.h                    |  64 +++++----
 mapping.h                   | 308 ++++++++++++++++++++++++++++++++++++--------
 multithread.h               |   6 +-
 prefilter.h                 | 132 +++++++------------
 remap.h                     | 173 ++++++++++++++-----------
 13 files changed, 607 insertions(+), 334 deletions(-)

diff --git a/README.rst b/README.rst
index df91f62..70d2313 100644
--- a/README.rst
+++ b/README.rst
@@ -53,6 +53,7 @@ The second approach to B-splines comes from signal processing, and it's the one
 I have made an attempt to generalize the code so that it can handle
 
 - arbitrary real data types and their aggregates [1]_
+- coming in strided memory
 - a reasonable selection of boundary conditions
 - used in either an implicit or an explicit scheme of extrapolation
 - arbitrary spline orders
@@ -70,13 +71,14 @@ On the evaluation side I provide
 
 The code at the very core of my B-spline coefficient generation code evolved from the code by Philippe Thévenaz which he published here_, with some of the boundary condition treatment code derived from formulae which Philippe Thévenaz communicated to me. Next I needed code to handle multidimensional arrays in a generic fashion in C++. I chose to use Vigra_. Since my work has a focus on signal (and, more specifically image) processing, it's an excellent tool, as it provides a large body o [...]
 
-I did all my programming on a Kubuntu_ 16.04 system, running on an intel(R) Core (TM) i5-4570 CPU, and used GNU gcc_ 5.4.0 to compile the code in C++11 dialect. While I am confident that the code runs on other CPUs, I have not tested it with different compilers or operating systems (yet).
+I did all my programming on a Kubuntu_ system, running on an intel(R) Core (TM) i5-4570 CPU, and used GNU gcc_ 5.4.0 and clang_ 3.8.0 to compile the code in C++11 dialect. While I am confident that the code runs on other CPUs, I have not tested it with other compilers or operating systems (yet).
 
 .. _here: http://bigwww.epfl.ch/thevenaz/interpolation/
 .. _Vigra: http://ukoethe.github.io/vigra/
 .. _Vc: https://compeng.uni-frankfurt.de/index.php?id=vc 
 .. _Kubuntu: http://kubuntu.org/
 .. _gcc: https://gcc.gnu.org/
+.. _clang: http://http://clang.llvm.org/
 
 .. [CIT2000] Interpolation Revisited by Philippe Thévenaz, Member,IEEE, Thierry Blu, Member, IEEE, and Michael Unser, Fellow, IEEE in IEEE TRANSACTIONS ON MEDICAL IMAGING, VOL. 19, NO. 7, JULY 2000, available online here_
 
diff --git a/brace.h b/brace.h
index 0d6c703..fdf106e 100644
--- a/brace.h
+++ b/brace.h
@@ -293,6 +293,8 @@ struct bracer
     // bit sloppy here, with pathological data (very small source.size()) this will
     // be imprecise for odd sizes, for even sizes it's always fine. But then full
     // sphericals always have size 2N * N, so odd sizes should not occur at all for dim 0
+    target = source ;
+    return ;
     auto si = source.begin() + source.size() / 2 ;
     auto se = source.end() ;
     for ( auto& ti : target )
@@ -341,8 +343,8 @@ struct bracer
     int lp = l0 + 1 ;  // index of leftmost occupied slice (p for pivot)
     int rp = r0 - 1 ;  // index of rightmost occupied slice
 
-    int l1 = -1 ;     // index one before outermost empty slice to the left; like end()
-    int r1 = w ;      // index one after outermost empty slice on the right
+    int l1 = -1 ;     // index one before outermost empty slice to the left
+    int r1 = w ;      // index one after outermost empty slice on the right; like end()
 
     int lt = l0 ;     // index to left target slice
     int rt = r0 ;     // index to right target slice ;
@@ -413,7 +415,7 @@ struct bracer
             // easiest would be:
             // a.bindAt ( axis , lt ) = a.bindAt ( axis , lp ) * value_type(2) - a.bindAt ( axis , ls ) ;
             // but this fails in 1D TODO: why?
-            auto target = a.bindAt ( axis , lt ) ; // get a view to left targte slice
+            auto target = a.bindAt ( axis , lt ) ; // get a view to left target slice
             target = a.bindAt ( axis , lp ) ;      // assign value of left pivot slice
             target *= ele_type(2) ;                // double that
             target -= a.bindAt ( axis , ls ) ;     // subtract left source slice
diff --git a/bspline.h b/bspline.h
index e5b07f4..d88e856 100644
--- a/bspline.h
+++ b/bspline.h
@@ -179,6 +179,7 @@ struct bspline
                             bcv_type ,
                             int ,
                             double ,
+                            double ,
                             int ) ;
 
   /// elementary type of value_type, float or double
@@ -197,6 +198,7 @@ public:
   int spline_degree ;       ///< degree of the spline (3 == cubic spline)
   bcv_type bcv ;            ///< boundary condition, see common.h
   double tolerance ;        ///< acceptable error
+  double smoothing ;        ///< E ] 0 : 1 [; apply smoothing to data before prefiltering
   bool braced ;             ///< whether coefficient array is 'braced' or not
   int horizon ;             ///< additional frame size for explicit scheme
   shape_type left_brace ;   ///< width(s) of the left handside bracing
@@ -308,11 +310,13 @@ public:
             prefilter_strategy _strategy = BRACED , ///< default strategy is the 'implicit' scheme
             int _horizon = -1 ,                     ///< width of frame for explicit scheme
             view_type _space = view_type() ,        ///< coefficient storage to 'adopt'
-            double _tolerance = -1.0                ///< acceptable error (relative to unit pulse)
+            double _tolerance = -1.0 ,              ///< acceptable error (relative to unit pulse)
+            double _smoothing = 0.0                 ///< apply smoothing to data before prefiltering
           )
   : core_shape ( _core_shape ) ,
     spline_degree ( _spline_degree ) ,
     bcv ( _bcv ) ,
+    smoothing ( _smoothing ) ,
     strategy ( _strategy )
   {
     if ( _tolerance < 0.0 )
@@ -323,14 +327,20 @@ public:
       else if ( std::is_same < real_type , double > :: value )
         tolerance = .0000000000001 ;
       else
-        tolerance = 0.0 ;
+        tolerance = 0.0000000000000000001 ;
     }
 
     // heuristic: horizon for reasonable precision - we assume that no one in their right
     // minds would want a negative horizon ;)
 
+    real_type max_pole = .00000000000000000001 ;
+    if ( spline_degree > 1 )
+      max_pole = fabs ( precomputed_poles [ spline_degree ] [ 0 ] ) ;
+    if ( smoothing > max_pole )
+      max_pole = smoothing ;
+
     if ( _horizon < 0 )
-      horizon = log2 ( max ( spline_degree , 1 ) ) * 3 * sizeof ( real_type ) ;
+      horizon = ceil ( log ( tolerance ) / log ( max_pole ) ) ; // TODO what if tolerance == 0.0?
     else
       horizon = _horizon ; // whatever the user specifies
 
@@ -409,21 +419,21 @@ public:
       data = core ;
     }
 
-    // we call the solver via a function pointer
-    p_solve solve ;
-
-    // for simplicity's sake, if USE_VC isn't defined we use solve_vigra
-    // always and simply ignore the flag use_vc.
-
-#ifdef USE_VC
-//     we have two variants, one is using Vc and the other doesn't
-    if ( use_vc )
-      solve = & solve_vc < view_type , view_type > ;
-    else
-      solve = & solve_vigra < view_type , view_type > ;
-#else
-    solve = & solve_vigra < view_type , view_type > ;
-#endif
+//     // we call the solver via a function pointer
+//     p_solve solve ;
+// 
+//     // for simplicity's sake, if USE_VC isn't defined we use solve_vigra
+//     // always and simply ignore the flag use_vc.
+// 
+// #ifdef USE_VC
+// //     we have two variants, one is using Vc and the other doesn't
+//     if ( use_vc )
+//       solve = & solve_vc < view_type , view_type > ;
+//     else
+//       solve = & solve_vigra < view_type , view_type > ;
+// #else
+//     solve = & solve_vigra < view_type , view_type > ;
+// #endif
 
     // per default the output will be braced. This does require the output
     // array to be sufficiently larger than the input; class bracer has code
@@ -431,7 +441,12 @@ public:
 
     bracer<view_type> br ;
 
-    bcv_type explicit_bcv ( IGNORE ) ;
+    // for the explicit scheme, we use boundary condition 'guess' which tries to
+    // provide a good guess for the initial coefficients with a small computational
+    // cost. using zero-padding instead introduces a sharp discontinuity at the
+    // margins, which we want to avoid.
+
+    bcv_type explicit_bcv ( GUESS ) ;
 
     switch ( strategy )
     {
@@ -442,6 +457,8 @@ public:
                 bcv ,
                 spline_degree ,
                 tolerance ,
+                smoothing ,
+                use_vc ,
                 nthreads
               ) ;
         break ;
@@ -453,6 +470,8 @@ public:
                 bcv ,
                 spline_degree ,
                 tolerance ,
+                smoothing ,
+                use_vc ,
                 nthreads
               ) ;
         for ( int d = 0 ; d < dimension ; d++ )
@@ -474,6 +493,8 @@ public:
                 explicit_bcv ,
                 spline_degree ,
                 tolerance ,
+                smoothing ,
+                use_vc ,
                 nthreads
               ) ;
         break ;
@@ -489,6 +510,8 @@ public:
                 explicit_bcv ,
                 spline_degree ,
                 tolerance ,
+                smoothing ,
+                use_vc ,
                 nthreads
               ) ;
         break ;
diff --git a/common.h b/common.h
index 50cb7ff..748e0ed 100644
--- a/common.h
+++ b/common.h
@@ -43,7 +43,6 @@
 #define VSPLINE_COMMON
 
 #include <vigra/multi_array.hxx>
-#include <vspline/multithread.h>
 
 #ifdef USE_VC
 
@@ -92,6 +91,15 @@ struct vector_traits
 
 #endif
 
+/// for interfaces which need specific implementations we use:
+
+struct not_implemented
+: std::invalid_argument
+{
+  not_implemented ( const char * msg )
+  : std::invalid_argument ( msg ) { }  ;
+} ;
+
 /// dimension-mismatch is thrown if two arrays have different dimensions
 /// which should have the same dimensions.
 
@@ -129,6 +137,19 @@ struct out_of_bounds
 {
 } ;
 
+/// if out-of-bounds coordinates are encountered and the mapping used doesn't replace
+/// them with values inside the bounds (like by mirroring etc.), the mapping code will
+/// produce these two values for the integral and remainder part. Setting the integral
+/// part to 0 insures that even if the failure to map out-of-bounds values is not acted
+/// upon, the code won't access memory outside the coefficient array. Using 1.0 for the
+/// remainder part is (with the widened brace on the right margin) a value which can't
+/// occur with coordinates inside the permitted range, but it's a value which will produce
+/// an in-range value when processed by the evaluation code.
+
+const long ic_out_of_bounds = 0 ;
+const double rc_out_of_bounds = 1.0 ;
+const double rv_out_of_bounds = 0.0 ;
+
 /// This enumeration is used for codes connected to boundary conditions. There are
 /// three aspects to boundary conditions: During prefiltering, if the implicit scheme is used,
 /// the initial causal and anticausal coefficients have to be calculated in a way specific to
@@ -147,10 +168,14 @@ typedef enum { PERIODIC,   ///< periodic boundary conditions / periodic mapping
                ZEROPAD ,   ///< used for boundary condition, bracing
                IGNORE ,    ///< used for boundary condition, bracing
                IDENTITY ,  ///< used as solver argument, mostly internal use
-//                GUESS ,
+               GUESS ,
                SPHERICAL , ///< intended use for spherical panoramas, needs work
                REJECT ,    ///< mapping mode, throws out_of_bounds for out-of-bounds coordinates
+               REJECTP ,   ///< mapping mode, like REJECT, but ceiling 1 higher
                LIMIT ,     ///< mapping mode, maps out-of-bounds coordinates to left/right boundary
+               LIMITP ,    ///< mapping mode, like LIMIT, but ceiling 1 higher
+               TAG ,       ///< mapping mode, out-of-bounds coordinates produce delta -2.0
+               TAGP ,      ///< mapping mode, like TAG, but ceiling 1 higher
                RAW   ,     ///< mapping mode, processes coordinates unchecked, may crash the program
 } bc_code;
 
@@ -176,17 +201,17 @@ std::vector < std::string > bc_name =
   "ZEROPAD" ,
   "IGNORE" ,
   "IDENTITY" ,
+  "GUESS" ,
   "SPHERICAL" ,
   "REJECT" ,
+  "REJECTP" ,
   "LIMIT" ,
+  "LIMITP" ,
+  "TAG" ,
+  "TAGP" ,
   "RAW"
 } ;
 
-/// number of CPU cores in the system; per default, multithreading routines
-/// will perform their task with as many threads.
-
-const int ncores = std::thread::hardware_concurrency() ;
-
 } ; // end of namespace vspline
 
-#endif // VSPLINE_COMMON
\ No newline at end of file
+#endif // VSPLINE_COMMON
diff --git a/eval.h b/eval.h
index 4bf3108..9874e72 100644
--- a/eval.h
+++ b/eval.h
@@ -508,7 +508,7 @@ public:
 
   /// the evaluator can handle raw coordinates, but it needs a mapping to do so.
  
-  typedef typename MultiArray < 1 , ic_type > :: iterator          offset_iterator ;
+  typedef typename MultiArrayView < 1 , ic_type > :: const_iterator offset_iterator ;
   typedef MultiArrayView < dimension + 1 , ele_type >              component_view_type ;
   typedef typename component_view_type::difference_type            expanded_shape_type ;
   typedef vigra::TinyVector < ele_type* , channels >               component_base_type ;
@@ -736,7 +736,7 @@ public:
       throw not_supported ( "for spline degree > 1: evaluation needs braced coefficients" ) ;
   } ;
   
-  nd_mapping_type& get_mapping()
+  const nd_mapping_type& get_mapping() const
   {
     return mmap ;
   }
@@ -761,7 +761,7 @@ public:
   
   template < typename nd_rc_type , typename weight_type >
   void obtain_weights ( const MultiArrayView < 2 , weight_type > & weight ,
-                        const nd_rc_type& c )
+                        const nd_rc_type& c ) const
   {
     auto ci = c.cbegin() ;
     for ( int axis = 0 ; axis < dimension ; ++ci , ++axis )
@@ -800,7 +800,7 @@ public:
     dtype operator() ( const dtype* & pdata ,
                        offset_iterator & ofs ,
                        const MultiArrayView < 2 , ele_type > & weight
-                     )
+                     ) const
     {
       dtype result = dtype() ;
       for ( int i = 0 ; i < weight.shape(0) ; i++ )
@@ -824,7 +824,7 @@ public:
     dtype operator() ( const dtype* & pdata ,
                        offset_iterator & ofs ,
                        const MultiArrayView < 2 , ele_type > & weight
-                     )
+                     ) const
     {
       dtype result = dtype() ;
       for ( int i = 0 ; i < weight.shape(0) ; i++ )
@@ -848,7 +848,7 @@ public:
   
   void eval ( const ic_type & select ,
               const MultiArrayView < 2 , ele_type > & weight ,
-              value_type & result )
+              value_type & result ) const
   {
     const value_type * base = coefficients.data() + select ;
     offset_iterator ofs = offsets.begin() ;  // offsets reflects the positions inside the subarray
@@ -863,7 +863,7 @@ public:
 
   void eval ( const nd_ic_type & select ,
               const MultiArrayView < 2 , ele_type > & weight ,
-              value_type & result )
+              value_type & result ) const
   {
     eval ( sum ( select * coefficients.stride() ) , weight , result ) ;
   }
@@ -879,7 +879,7 @@ public:
   
   void eval ( const nd_ic_type& select ,
               const nd_rc_type& tune ,
-              value_type & result )
+              value_type & result ) const
   {
     // To calculate the weights we want efficient storage, being very much inner-loop here,
     // but at the same time we want adequate packaging for the weights as well. So we obtain
@@ -901,7 +901,7 @@ public:
   /// this variant of eval() takes a split coordinate:
   
   void eval ( const split_coordinate_type & sc , // presplit coordinate
-              value_type & result )
+              value_type & result ) const
   {
     eval ( sc.select , sc.tune , result ) ;
   }
@@ -911,7 +911,7 @@ public:
   /// also the slowest, due to the mapping, which is quite expensive computationally.
   
   void eval ( const nd_rc_type& c , // unsplit coordinate
-              value_type & result )
+              value_type & result ) const
   {
     split_coordinate_type sc ;
     mmap ( c , sc ) ;  /// apply the mappings
@@ -925,7 +925,7 @@ public:
   /// of the nD code above
   
   void eval ( const rc_type& c , // single 1D coordinate
-              value_type & result )
+              value_type & result ) const
   {
     static_assert ( dimension == 1 ,
                     "evaluation at a single real coordinate is only allowed for 1D splines" ) ;
@@ -946,7 +946,7 @@ public:
   template < class dtype , class weight_type >
   void flat_eval ( const dtype * const & pdata ,
                    const weight_type * const pweight ,
-                   value_type & result )
+                   value_type & result ) const
   {
     result = pweight[0] * pdata[offsets[0]] ;
     for ( int i = 1 ; i < window_size ; i++ )
@@ -961,7 +961,7 @@ public:
     void operator() ( dtype * & target ,
                       const MultiArrayView < 2 , dtype > & weight ,
                       dtype factor
-                    )
+                    ) const
     {
       if ( _level == level ) // both are template args, this conditional has no runtime cost
       {
@@ -993,7 +993,7 @@ public:
     void operator() ( dtype * & target ,
                       const MultiArrayView < 2 , dtype > & weight ,
                       dtype factor
-                    )
+                    ) const
     {
      if ( level == 0 )
       {
@@ -1023,7 +1023,7 @@ public:
   
   void flat_eval ( const ic_type & select ,
                    const MultiArrayView < 2 , ele_type > & weight ,
-                   value_type & result )
+                   value_type & result ) const
   {
     // gat a pointer to the coefficient window's beginning
     const value_type * base = coefficients.data() + select ;
@@ -1064,7 +1064,7 @@ public:
     dtype operator() ( const component_base_type& base , ///< base adresses of components
                        const ic_v& origin ,              ///< offsets to evaluation window origins
                        offset_iterator & ofs ,           ///< offsets to coefficients inside this window
-                       const MultiArrayView < 2 , ele_v > & weight ) ///< weights to apply
+                       const MultiArrayView < 2 , ele_v > & weight ) const ///< weights to apply
     {
       dtype sum = dtype() ;    ///< to accumulate the result
       dtype subsum ; ///< to pick up the result of the recursive call
@@ -1089,7 +1089,7 @@ public:
     dtype operator() ( const component_base_type& base , ///< base adresses of components
                        const ic_v& origin ,              ///< offsets to evaluation window origins
                        offset_iterator & ofs ,           ///< offsets to coefficients in this window
-                       const MultiArrayView < 2 , ele_v > & weight ) ///< weights to apply
+                       const MultiArrayView < 2 , ele_v > & weight ) const ///< weights to apply
     {
       dtype sum = dtype() ;
 
@@ -1115,7 +1115,7 @@ public:
   
   void v_eval ( const ic_v& select ,  // offsets to lower corners of the subarrays
                 const MultiArrayView < 2 , ele_v > & weight , // vectorized weights
-                mc_ele_v & result )
+                mc_ele_v & result ) const
   {
     // we need an instance of this iterator because it's passed into _v_eval by reference
     // and manipulated by the code there:
@@ -1131,7 +1131,7 @@ public:
 
   void v_eval ( const nd_ic_v& select ,  // lower corners of the subarrays
                 const MultiArrayView < 2 , ele_v > & weight , // vectorized weights
-                mc_ele_v & result )
+                mc_ele_v & result ) const
   {
     // here we transform the incoming nD coordinates into offsets into the coefficient
     // array's memory.
@@ -1151,14 +1151,25 @@ public:
 
   void v_eval ( const nd_ic_v& select ,  // lower corners of the subarrays
                 const nd_rc_v& tune ,    // fractional parts of the incoming coordinates
-                mc_ele_v & result )
+                mc_ele_v & result ) const
   {
     // like in the unvectorized code, the 2D MultiArrayView to the weights is created
     // as a view to data on the stack (weight_data) which is lightweight and fast
     
+    // ... but clang++ does not accept this (variable sized array of non-POD type)
+    // When the second variant is used, the code runs here compiled with clang++,
+    // but not when compiled with gcc (probably due to an alignment problem),
+    // so I have this TODO ugly ifdef :( and TODO make sure the clang-specific code
+    // is safe regarding alignment
+
+ #ifndef __clang__
     ele_v weight_data [ workspace_size ] ;
     MultiArrayView < 2 , ele_v > weight ( Shape2 ( ORDER , dimension ) , weight_data ) ;
-    
+#else
+    ele_type weight_data [ workspace_size * vsize ] ;
+    MultiArrayView < 2 , ele_v > weight ( Shape2 ( ORDER , dimension ) , (ele_v*)weight_data ) ;
+#endif
+
     // obtain_weights is the same code for vectorized operation, the arguments
     // suffice to pick the right template argiments
   
@@ -1167,6 +1178,29 @@ public:
     // delegate
 
     v_eval ( select , weight , result ) ;
+
+#ifdef TEST_FOR_TAGGED
+    
+    // this bit of code is optional, it's only needed if mapping mode TAG is used.
+    // If TEST_FOR_TAGGED is not defined and mapping mode TAG is used, out-of-range
+    // coordinates produce undefined results but the program will not crash.
+    // TODO: this isn't good style, there should be a way of parametrizing this
+    // behaviour. This would best be done by introducing a 'strategy' template parameter
+    // to class evaluator and changing the remap code to accept an evaluator instead of
+    // a spline, which is desirable anyway.
+
+    for ( int d = 0 ; d < dimension ; d++ )
+    {
+      auto mask = ( tune[d] == rc_type ( rc_out_of_bounds ) ) ; // test for oob value
+      if ( any_of ( mask ) )
+      {
+        for ( int ch = 0 ; ch < channels ; ch++ ) // coordinate is oob, set result to 0
+          result[ch] ( mask ) = ele_type ( rv_out_of_bounds ) ;
+      }
+    }
+
+#endif
+
   }
 
   /// here we take the approach to require the calling function to present pointers to
@@ -1177,7 +1211,7 @@ public:
   /// data are strided. Anyway, it doesn't hurt to have the option.
   
   void v_eval ( const split_coordinate_type* const psc , // pointer to vsize presplit coordinates
-                value_type * result )                    // pointer to vsize result values
+                value_type * result )  const                   // pointer to vsize result values
   {
     nd_ic_v select ;
     nd_rc_v tune ;
@@ -1211,7 +1245,7 @@ public:
   
   void v_eval ( const ic_type * const pi ,  // pointer to integral parts of coordinates
                 const rc_type * const pr ,  // pointer to real parts fo coordinates
-                value_type * result )  // pointer to vsize result values
+                value_type * result ) const  // pointer to vsize result values
   {
     nd_ic_v select ;
     nd_rc_v tune ;
@@ -1240,7 +1274,7 @@ public:
   /// the data in simdized form.
   
   void v_eval ( const nd_rc_v & input ,  // number of dimensions * coordinate vectors
-                mc_ele_v & result )      // number of channels * value vectors
+                mc_ele_v & result )  const     // number of channels * value vectors
   {
     nd_ic_v select ;
     nd_rc_v tune ;
@@ -1263,7 +1297,7 @@ public:
   // relying on the optimizer to shed the unnecessary TinyVector<T,1> container
   
   void v_eval ( const nd_rc_type * const pmc ,  // pointer to vsize muli_coordinates
-                value_type * result )           // pointer to vsize result values
+                value_type * result ) const           // pointer to vsize result values
   {
     nd_rc_v input ;
     mc_ele_v v_result ;
@@ -1298,7 +1332,7 @@ public:
   }
 
   void v_eval ( const rc_type * const pmc ,  // pointer to vsize 1D coordinates
-                value_type * result )        // pointer to vsize result values
+                value_type * result ) const        // pointer to vsize result values
   {
     static_assert ( dimension == 1 ,
                     "this v_eval variant is intended for 1D splines only" ) ;
@@ -1331,7 +1365,7 @@ public:
   /// and output goes to interleaved memory
 
   void v_eval ( const nd_rc_v & input ,  // number of dimensions * coordinate vectors
-                value_type * result )    // pointer to vsize result values
+                value_type * result ) const    // pointer to vsize result values
   {
     mc_ele_v v_result ;
 
diff --git a/example/impulse_response.cc b/example/impulse_response.cc
index 8c43343..f9e5387 100644
--- a/example/impulse_response.cc
+++ b/example/impulse_response.cc
@@ -70,12 +70,11 @@
 /// using lower-level access to the filter are commented out.
 
 // when playing with the filter a bit, I noticed that feeding the filter with the
-// absolute values of the poles produces an impulse response which looks much like
+// absolute values of the poles produces an impulse response which looks remotely like
 // a gaussian and also represents a partition of unity (like the b-spline reconstruction
 // filter). Using this filter instead of the prefilter produces a blurred image, as one
 // might expect from a filter with an impulse response like this - the precise maths
-// aren't clear to me yet. I may have reinvented a very efficient method of calculating
-// a gaussian smoothing filter...
+// aren't clear to me yet. 
 // the impulse resonse produced by passing in the absolute values of the poles is the
 // absolute of the 'normal' impulse response, scaled by a factor.
 // to keep the filter stable, the poles must be less than 1.0. positive poles approaching
@@ -101,10 +100,10 @@ int main ( int argc , char * argv[] )
   
 // using the highest-level access to prefiltering, we code:
 
-  vspline::bspline < double , 1 > bsp ( 10001 , degree , vspline::PERIODIC ) ; // , vspline::EXPLICIT ) ;
-  auto v1 = bsp.core ;
-  v1 [ 5000 ] = 1.0 ;
-  bsp.prefilter() ;
+//   vspline::bspline < double , 1 > bsp ( 10001 , degree , vspline::PERIODIC ) ; // , vspline::EXPLICIT ) ;
+//   auto v1 = bsp.core ;
+//   v1 [ 5000 ] = 1.0 ;
+//   bsp.prefilter() ;
 
 // using slightly lower-level access to the prefiltering code, we could achieve
 // the same result with:
@@ -115,17 +114,17 @@ int main ( int argc , char * argv[] )
   
 // and, going yet one level lower, this code also produces the same result:
   
-//   vigra::MultiArray < 1 , double > v1 ( 10001 ) ;
-//   v1[5000] = 1.0 ;
-//   typedef decltype ( v1.begin() ) iter_type ;
-//   vspline::filter < iter_type , iter_type , double >
-//     f ( v1.size() ,
-//         vspline::overall_gain ( degree / 2 , precomputed_poles [ degree ] ) ,
-//         vspline::MIRROR ,
-//         degree / 2 ,
-//         precomputed_poles [ degree ] ,
-//         0.00001 ) ;
-//   f.solve ( v1.begin() ) ;
+  vigra::MultiArray < 1 , double > v1 ( 10001 ) ;
+  v1[5000] = 1.0 ;
+  typedef decltype ( v1.begin() ) iter_type ;
+  vspline::filter < iter_type , iter_type , double >
+    f ( v1.size() ,
+        vspline::overall_gain ( 1 , precomputed_poles [ degree ] ) ,
+        vspline::MIRROR ,
+        1 ,
+        precomputed_poles [ degree ] ,
+        0.00001 ) ;
+  f.solve ( v1.begin() ) ;
         
   std::cout << "double ir_" << degree << "[] = {" << std::endl ;
   std::cout << std::fixed << std::showpos << std::showpoint << std::setprecision(16);
@@ -133,7 +132,7 @@ int main ( int argc , char * argv[] )
   for ( k = 0 ; k < 10001 ; k++ )
   {
     if ( fabs ( v1[k] ) > cutoff )
-      std::cout << v1[k] / << " ," << std::endl ;
+      std::cout << v1[k] << " ," << std::endl ;
   }
   std::cout << " } ;" << std::endl ;
 }
\ No newline at end of file
diff --git a/example/pano_extract.cc b/example/pano_extract.cc
index 939f921..56e0881 100644
--- a/example/pano_extract.cc
+++ b/example/pano_extract.cc
@@ -694,10 +694,12 @@ void process_image ( char * name ,
   start = std::chrono::system_clock::now();
 #endif
   
+  evaluator < 2 , pixel_type , rc_type , int > ev ( bspl ) ;
+  
   for ( int times = 0 ; times < 1 ; times++ )
   {
     vspline::tf_remap < pixel_type , rc_type , 2 , 2 >
-      ( bspl , tf , result ) ;
+      ( ev , tf , result ) ;
   }
 
   if ( extra_bands )
@@ -726,9 +728,12 @@ void process_image ( char * name ,
       tf_alpha ( tf_se ) ;
 #endif
 
+    // create an evaluator for bspl_alpha
+    evaluator < 2 , float , rc_type , int > ev_alpha ( bspl_alpha ) ;
+
     // now we do a transformation-based remap of the alpha channel
     vspline::tf_remap < float , rc_type , 2 , 2 >
-      ( bspl_alpha , tf_alpha , alpha_result ) ;
+      ( ev_alpha , tf_alpha , alpha_result ) ;
   }
 
 #ifdef PRINT_ELAPSED
diff --git a/example/roundtrip.cc b/example/roundtrip.cc
index db5a6cb..445548a 100644
--- a/example/roundtrip.cc
+++ b/example/roundtrip.cc
@@ -167,7 +167,7 @@ void roundtrip ( view_type & data ,
   typedef vspline::nd_mapping < int , rc_type , 2 , vsize > mmap ;
   
   // obtain the mapping from the evaluator:
-  mmap map = ev.get_mapping() ;
+  const mmap map = ev.get_mapping() ;
 
   typedef vigra::MultiArray<2, coordinate_type> coordinate_array ;
   
@@ -228,7 +228,7 @@ void roundtrip ( view_type & data ,
   
   for ( int times = 0 ; times < TIMES ; times++ )
     vspline::remap<pixel_type,split_type,2,2>
-      ( bsp , warp , target , use_vc ) ;
+      ( ev , warp , target , use_vc ) ;
 
   
 #ifdef PRINT_ELAPSED
@@ -246,7 +246,7 @@ void roundtrip ( view_type & data ,
   
   for ( int times = 0 ; times < TIMES ; times++ )
     vspline::remap<pixel_type,int,rc_type,2,2>
-      ( bsp , wwarp , target , use_vc ) ;
+      ( ev , wwarp , target , use_vc ) ;
 
   
 #ifdef PRINT_ELAPSED
@@ -264,7 +264,7 @@ void roundtrip ( view_type & data ,
   
   for ( int times = 0 ; times < TIMES ; times++ )
     vspline::remap<pixel_type,coordinate_type,2,2>
-      ( bsp , fwarp , target , use_vc ) ;
+      ( ev , fwarp , target , use_vc ) ;
 
   
 #ifdef PRINT_ELAPSED
@@ -334,7 +334,7 @@ void roundtrip ( view_type & data ,
   
   for ( int times = 0 ; times < TIMES ; times++ )
     vspline::tf_remap < pixel_type , rc_type , 2 , 2 >
-      ( bsp , tf , target , use_vc ) ;
+      ( ev , tf , target , use_vc ) ;
 
   // note:: with REFLECT this doesn't come out right, because of the .5 difference!
       
diff --git a/filter.h b/filter.h
index d49be89..cadd889 100644
--- a/filter.h
+++ b/filter.h
@@ -152,7 +152,7 @@ class filter
   /// the solving routine and initial coefficient finding routines are called via method pointers.
   /// these pointers are typedefed for better legibility:
   
-  typedef int       ( filter_type::*p_solve )  ( in_iter  input , out_iter output ) ;
+  typedef void       ( filter_type::*p_solve )  ( in_iter  input , out_iter output ) ;
   typedef value_type ( filter_type::*p_icc1 )  ( in_iter  input , int k ) ;
   typedef value_type ( filter_type::*p_icc2 )  ( out_iter input , int k ) ;
   typedef value_type ( filter_type::*p_iacc )  ( out_iter input , int k ) ;
@@ -170,7 +170,7 @@ public:
  /// solve() takes two iterators, one to the input data and one to the output space.
  /// The containers must have the same size. It's safe to use solve() in-place.
 
- int solve ( in_iter input , out_iter output )
+ void solve ( in_iter input , out_iter output )
  {
    (this->*_p_solve) ( input , output ) ;
  }
@@ -179,7 +179,7 @@ public:
  /// I checked: a handcoded in-place routine using only a single
  /// iterator is not noticeably faster than using one with two separate iterators.
  
- int solve ( out_iter data )
+ void solve ( out_iter data )
  {
    (this->*_p_solve) ( data , data ) ;
  }
@@ -484,19 +484,23 @@ value_type iacc_safe_beyond ( out_iter c , int k )
 }
 
 // guess the initial coefficient. This tries to minimize the effect
-// of starting out with a hard discontinuity as it occurs with zero-padding.
-// TODO needs some thought, currently unused.
-
-// template < class IT >
-// value_type icc_guess ( IT c , int k )
-// {
-//   return c[0] * ( 1.0 / ( 1.0 - pole[k] ) ) ;
-// }
-// 
-// value_type iacc_guess ( out_iter c , int k )
-// {
-//   return c[M-1] * ( pole[k] / ( pole[k] * pole[k] - 1.0 ) ) ;
-// }
+// of starting out with a hard discontinuity as it occurs with zero-padding,
+// while at the same time requiring little arithmetic effort
+// TODO needs some thought
+
+// for the forward filter, we guess an extrapolation of the signal to the left
+// repeating c[0] indefinitely, which is cheap to compute:
+template < class IT >
+value_type icc_guess ( IT c , int k )
+{
+  return c[0] * real_type ( 1.0 / ( 1.0 - pole[k] ) ) ;
+}
+
+// for the backward filter, we assume mirror BC, which is also cheap to compute:
+value_type iacc_guess ( out_iter c , int k )
+{
+  return iacc_mirror ( c , k ) ;
+}
 
 template < class IT >
 value_type icc_identity ( IT c , int k )
@@ -520,7 +524,7 @@ value_type iacc_identity ( out_iter c , int k )
 /// recursion formula, which would read the previous value of the recursion from memory
 /// by accessing x[n-1], or, x[n+1], respectively.
 
-int solve_gain_inlined ( in_iter c , out_iter x )
+void solve_gain_inlined ( in_iter c , out_iter x )
 {
   assert ( M > 1 ) ;
   
@@ -600,7 +604,7 @@ int solve_gain_inlined ( in_iter c , out_iter x )
 /// solve routine without application of any gain, it is assumed that this has been
 /// done already during an initial run with the routine above, or in some other way.
 
-int solve_no_gain ( in_iter c , out_iter x )
+void solve_no_gain ( in_iter c , out_iter x )
 {
   assert ( M > 1 ) ;
 
@@ -657,10 +661,10 @@ int solve_no_gain ( in_iter c , out_iter x )
 ///
 /// this routine can also be used for splines of degree 0 and 1, for simplicity's sake
 
-int solve_identity ( in_iter c , out_iter x )
+void solve_identity ( in_iter c , out_iter x )
 {
   if ( x == c ) // if operation is in-place we needn't do anything
-    return 0 ;
+    return ;
   for ( int n = 0 ; n < M ; n++ ) // otherwise, copy input to output
     x[n] = c[n] ;
 }
@@ -770,12 +774,12 @@ filter ( int _M ,               ///< number of input/output elements (DataLength
   {
     _p_solve = & filter_type::solve_identity ;
   }
-//   else if ( bc == GUESS ) // currently unused
-//   {
-//     _p_icc1 = & filter_type::icc_guess<in_iter> ;
-//     _p_icc2 = & filter_type::icc_guess<out_iter> ;
-//     _p_iacc = & filter_type::iacc_guess ;
-//   }
+  else if ( bc == GUESS ) // currently unused
+  {
+    _p_icc1 = & filter_type::icc_guess<in_iter> ;
+    _p_icc2 = & filter_type::icc_guess<out_iter> ;
+    _p_iacc = & filter_type::iacc_guess ;
+  }
   else
     throw not_supported ( "boundary condition not supported by vspline::filter" ) ;
 }
@@ -1130,7 +1134,10 @@ void ele_aggregating_filter ( source_type &source ,
 
   // we have a traits class fixing the simdized type (in common.h):
   
-  typedef typename vector_traits < ele_type > :: type simdized_type ;
+//   typedef typename vector_traits < ele_type > :: type simdized_type ;
+
+  // for prefiltering, using Vc:Vectors seems faster than using SimdArrays of twice the size
+  typedef typename Vc::Vector < ele_type > simdized_type ;
 
   const int vsize ( simdized_type::Size ) ;              // number of ele_type in a simdized_type
   typedef typename simdized_type::IndexType index_type ; // indexes for gather/scatter
@@ -1469,7 +1476,8 @@ public:
   
 #ifdef USE_VC
  
-  const int vsize = vector_traits < ele_type > :: vsize ;
+//   const int vsize = vector_traits < ele_type > :: vsize ;
+  const int vsize = Vc::Vector < ele_type > :: Size ;
   
   // if we can use vector code, the number of lanes is multiplied by the
   // number of elements a simdized type inside the vector code can handle
diff --git a/mapping.h b/mapping.h
index 557733f..746cb07 100644
--- a/mapping.h
+++ b/mapping.h
@@ -201,9 +201,6 @@ rc_v v_fmod ( const rc_v& lhs ,
 /// - they split the resulting value into an integral and a real part
 /// - they take into account special requirements for odd and even splines
 ///
-/// we derive all mappings from this common base class and later on access mappings via
-/// a pointer to the base class, which has a virtual operator().
-///
 /// It's slightly annoying that the folding of the coordinates into the defined range
 /// and the splitting into real and integral part should be lumped together in one operation,
 /// but decoupling the two would require more arithmetic, and the goal is to be as fast as
@@ -224,11 +221,13 @@ rc_v v_fmod ( const rc_v& lhs ,
 /// one slice larger, and accessing the spline at M-1, 0.0 is now safe, so the test can be omitted.
 /// I feel the sacrifice of one slice's worth of memory is worth the performance gain and increased
 /// code transparency.
-
+///
 /// the simplest and fastest mapping is the 'raw' mapping. Here, only the split is performed,
 /// come what may. The user is responsible for the suitability of incoming coordinates, it is
 /// silently assumed that 0.0 <= v <= M-1; if this is not true, the results may be false or
 /// the program may crash.
+///
+/// Note how M is passed in but not used.
 
 template < typename ic_t , typename rc_t , int vsize = 1 >
 class odd_mapping_raw
@@ -238,11 +237,11 @@ public:
   odd_mapping_raw ( int M )
     { } ;
     
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     rc_t fl_i ;
     fv = modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
-    iv = ic_t ( fl_i ) ;     // set integer part from float representing it
+    iv = ic_t ( fl_i ) ;      // set integer part from float representing it
   }
 
 #ifdef USE_VC
@@ -252,11 +251,11 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
     fv = v_modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
-    iv = ic_v ( fl_i )  ;      // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;       // set integer part from float representing it
   }
 
 #endif
@@ -270,7 +269,7 @@ public:
   even_mapping_raw ( int M )
     { } ;
   
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     rc_t fl_i ;
     fv = modf ( v + rc_t ( 0.5 ) , &fl_i ) - rc_t ( 0.5 ) ;
@@ -284,13 +283,13 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
     v += rc_t ( 0.5 ) ;
-    fv = v_modf ( v , &fl_i ) ;         // split v into integral and remainder in [-0.5 ... 0.5]
+    fv = v_modf ( v , &fl_i ) ; // split v into integral and remainder in [-0.5 ... 0.5]
     fv -= rc_t ( 0.5 ) ;
-    iv = ic_v ( fl_i )  ;              // set integer part from float representing it
+    iv = ic_v ( fl_i )  ;       // set integer part from float representing it
   }
 
 #endif
@@ -313,7 +312,7 @@ public:
   : _ceiling ( M - 1 )
     { } ;
     
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     if ( v < 0.0 )
     {
@@ -321,7 +320,7 @@ public:
       fv = 0.0 ;
       return ;               // in this case we're done prematurely
     }
-   else if ( v > _ceiling )
+    else if ( v > _ceiling )
     {
       iv = ic_t ( _ceiling ) ;
       fv = 0.0 ;
@@ -339,7 +338,7 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     v ( v > _ceiling ) = _ceiling ;
     
@@ -364,7 +363,7 @@ public:
   : _ceiling ( M - 1 )
     { } ;
   
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     if ( v > _ceiling )
       v = rc_t ( _ceiling ) ;
@@ -384,7 +383,7 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     v ( v > _ceiling ) = _ceiling ;
 
@@ -404,10 +403,17 @@ public:
 /// an exception (out_of_bounds). The calling code has to catch the exception if it
 /// wants to proceed. If the mapping is on vector types, some of the result may be
 /// valid. The out-of-bounds results can be recognized by the int part being set to
-/// -1, which is a value which can't be valid. Any out-of-bounds element triggers
-/// the exception.
-/// TODO: We might simply throw the exception and not bother with the -1 masking.
-/// The exception object might be made to contain the offending value.
+/// 0 and the float part to rc_out_of_bounds, which is a value which can't be valid.
+/// Any out-of-bounds element triggers the exception.
+/// I tried out code which caught the exception in the evaluator and assigned some
+/// value (like 0 or 128) to the result for out-of-bounds coordinates. While, as long
+/// as the out-of-bounds exception didn't occur, the code was fast, if the exception
+/// was thrown and caught, things slowed down a great deal. So for this kind of
+/// processing the next mapping (TAG, TAGP) is better, since it has next to no
+/// run-time cost for out-of-bounds values. REJECT mapping, throwing exceptions, should
+/// be used if the calling code expects the exception won't occur at all and needs
+/// to do somethig on a higher level to deal with the problem.
+/// TODO: Therefor we might simply throw the exception and not bother with the masking.
 
 template < typename ic_t , typename rc_t , int vsize = 1 >
 class odd_mapping_reject
@@ -420,10 +426,14 @@ public:
   : _ceiling ( M - 1 )
     { } ;
     
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     if ( v < 0.0 || v > _ceiling )    // reject out-of-bounds values
+    {
+      iv = ic_t ( ic_out_of_bounds ) ;
+      fv = rc_t ( rc_out_of_bounds ) ;
       throw out_of_bounds() ;
+    }
 
     // now we are sure that v is safely inside [ 0 : _ceiling ]
     rc_t fl_i ;
@@ -438,7 +448,7 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
 
@@ -446,18 +456,18 @@ public:
     auto too_small = ( v < rc_t ( 0.0 ) ) ;
     auto mask = too_small | too_large ;
     
+    fv = v_modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
+    iv = ic_v ( fl_i )  ;      // set integer part from float representing it
+
     if ( any_of ( mask ) )
     {
       // v has some out-of-bounds values
-      fv = v_modf ( v , &fl_i ) ;    // split v into integral and remainder from [0...1[
-      iv = ic_v ( fl_i )  ;         // set integer part from float representing it
-      // set iv to -1 at out-of-bounds values. mask cast is safe as sizes match
-      iv ( typename ic_v::mask_type ( mask ) ) = ic_t ( -1 ) ;
+      // set fv to ic_out_of_bounds , rc_out_of_bounds at out-of-bounds values.
+      iv ( typename ic_v::mask_type ( mask ) ) = ic_t ( ic_out_of_bounds ) ;
+      fv ( mask ) = rc_t ( rc_out_of_bounds ) ;
       throw out_of_bounds() ;
     }
     
-    fv = v_modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
-    iv = ic_v ( fl_i )  ;      // set integer part from float representing it
   }
 
 #endif
@@ -478,13 +488,17 @@ public:
   : _ceiling ( M - 1 )
     { } ;
   
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
-    rc_t fl_i ;
-    if ( v < 0.0 || v > _ceiling )  // test range
+    if ( v < 0.0 || v > _ceiling )    // reject out-of-bounds values
+    {
+      iv = ic_t ( ic_out_of_bounds ) ;
+      fv = rc_t ( rc_out_of_bounds ) ;
       throw out_of_bounds() ;
+    }
 
     // split v into integral and remainder in [-0.5 ... 0.5]
+    rc_t fl_i ;
     fv = modf ( v + rc_t(0.5) , &fl_i ) - rc_t(0.5) ;
     iv = ic_t ( fl_i ) ;     // set integer part from float representing it
   }
@@ -496,7 +510,7 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
     
@@ -510,7 +524,8 @@ public:
       fv = v_modf ( v , &fl_i ) ;    // split v into integral and remainder from [0...1[
       iv = ic_v ( fl_i )  ;         // set integer part from float representing it
       // set iv to -1 at out-of-bounds values. mask cast is safe as sizes match.
-      iv ( typename ic_v::mask_type ( mask ) ) = ic_t ( -1 ) ;
+      iv ( typename ic_v::mask_type ( mask ) ) = ic_t ( ic_out_of_bounds ) ;
+      fv ( mask ) = rc_t ( rc_out_of_bounds ) ;
       throw out_of_bounds() ;
     }
     // now we are sure that v is safely inside [ 0 : _ceiling [
@@ -523,6 +538,147 @@ public:
 #endif
 } ;
 
+/// with mapping mode TAG, out-of-bounds incoming coordinate values don't result in
+/// an exception, but offending values are 'tagged' by the remainder part being set to
+/// rc_out_of_bounds, which is a value which can't be valid. The calling code is responsible for
+/// taking appropriate action.
+/// If the calling code fails to treat the 'impossible' value appropriately, this
+/// will result in a wrong result but not in an access to an invalid memory location,
+/// which might be the result of passing some impossible value for the int part.
+
+// TODO: there is a fundamental problem here: if the data are, say, periodic, we need
+// a different ceiling to when they are, say, mirrored. There have to be different
+// mappings for these scenarios, because if we use the ceiling for mirror data (M-1)
+// we have a 1-pixel gap when the data are in fact periodic, but if we use M as the
+// ceiling, this would let out-of-bounds acesses at the right margin pass for mirror
+// data. And this applies to REJECT and RAW as well.
+// For now I provide the right behaviour via a variant bc code 'TAGP' which feeds
+// a value of M to the constructor which is 1 larger than that for plain TAG.
+
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class odd_mapping_tag
+{
+  const rc_t _ceiling ;
+  
+public:
+  
+  odd_mapping_tag ( int M )
+  : _ceiling ( M - 1 )
+    { } ;
+    
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  {
+    if ( v < 0.0 || v > _ceiling )    // tag out-of-bounds values
+    {
+      iv = ic_t ( ic_out_of_bounds ) ;
+      fv = rc_t ( rc_out_of_bounds ) ;
+      return ;
+    }
+
+    // now we are sure that v is safely inside [ 0 : _ceiling ]
+    rc_t fl_i ;
+    fv = modf ( v , &fl_i ) ;   // split v into integral and remainder from [0...1[
+    iv = ic_t ( fl_i ) ;       // set integer part from float representing it
+  }
+
+#ifdef USE_VC
+
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+
+  /// this is the operator() for vectorized operation
+
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  {
+    rc_v fl_i ;
+
+    auto too_large = ( v > rc_t ( _ceiling ) ) ;
+    auto too_small = ( v < rc_t ( 0.0 ) ) ;
+    auto mask = too_small | too_large ;
+    
+    if ( any_of ( mask ) )
+    {
+      // v has some out-of-bounds values
+      fv = v_modf ( v , &fl_i ) ;   // split v into integral and remainder from [0...1[
+      iv = ic_v ( fl_i )  ;         // set integer part from float representing it
+      // set fv to ic_out_of_bounds , rc_out_of_bounds at out-of-bounds values.
+      iv ( typename ic_v::mask_type ( mask ) ) = ic_t ( ic_out_of_bounds ) ;
+      fv ( mask ) = rc_t ( rc_out_of_bounds ) ;
+      return ;
+    }
+    
+    fv = v_modf ( v , &fl_i ) ; // split v into integral and remainder from [0...1[
+    iv = ic_v ( fl_i )  ;       // set integer part from float representing it
+  }
+
+#endif
+} ;
+
+/// I use the same rejectionion criterion here as in odd splines, which is debatable:
+/// one might argue that coordinate values from -0.5 to 0.0 and M - 1.0 to M - 0.5
+/// can be processed meaningfully and hence should not be tagged.
+
+template < typename ic_t , typename rc_t , int vsize = 1 >
+class even_mapping_tag
+{
+  const rc_t _ceiling ;
+
+public:
+  
+  even_mapping_tag ( int M )
+  : _ceiling ( M - 1 )
+    { } ;
+  
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  {
+    if ( v < 0.0 || v > _ceiling )    // reject out-of-bounds values
+    {
+      iv = ic_t ( ic_out_of_bounds ) ;
+      fv = rc_t ( rc_out_of_bounds ) ;
+      return ;
+    }
+
+    // split v into integral and remainder in [-0.5 ... 0.5]
+    rc_t fl_i ;
+    fv = modf ( v + rc_t(0.5) , &fl_i ) - rc_t(0.5) ;
+    iv = ic_t ( fl_i ) ;     // set integer part from float representing it
+  }
+
+#ifdef USE_VC
+
+  typedef typename vector_traits < ic_t , vsize > :: type ic_v ;
+  typedef typename vector_traits < rc_t , vsize > :: type rc_v ;
+
+  /// this is the operator() for vectorized operation
+
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  {
+    rc_v fl_i ;
+    
+    auto too_large = ( v > _ceiling ) ;
+    auto too_small = ( v < rc_t ( 0.0 ) ) ;
+    auto mask = too_small | too_large ;
+    
+    if ( any_of ( mask ) )
+    {
+      // v has some out-of-bounds values
+      fv = v_modf ( v , &fl_i ) ;    // split v into integral and remainder from [0...1[
+      iv = ic_v ( fl_i )  ;         // set integer part from float representing it
+      // set iv to -1 at out-of-bounds values. mask cast is safe as sizes match.
+      iv ( typename ic_v::mask_type ( mask ) ) = ic_t ( ic_out_of_bounds ) ;
+      fv ( mask ) = rc_t ( rc_out_of_bounds ) ;
+      return ;
+    }
+    // now we are sure that v is safely inside [ 0 : _ceiling [
+    v += rc_t(0.5) ;
+    fv = v_modf ( v , &fl_i ) ;           // split v into integral and remainder in [-0.5 ... 0.5]
+    fv -= rc_t(0.5) ;
+    iv = ic_v ( fl_i )  ;              // set integer part from float representing it
+  }
+
+#endif
+} ;
+
 /// the next mapping is for coefficients/data mirrored at the ends of the defined
 /// range. This is probably the most commonly used type and also the type which
 /// P. Thevenaz recommends. Like all mappings defined in this body of code, it comes
@@ -552,7 +708,7 @@ public:
   // with odd splines we have to be careful not to access the coefficient matrix
   // with iv == M - 1 and this results in extra safeguarding code.
     
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     rc_t fl_i ;
     if ( v < 0.0 )              // apply mirror left boundary condition
@@ -579,7 +735,7 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
     
@@ -624,7 +780,7 @@ public:
     { } ;
   
   
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     rc_t fl_i ;
     if ( v < 0.0 )               // apply mirror left boundary condition
@@ -650,7 +806,7 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
     
@@ -695,7 +851,7 @@ public:
   odd_mapping_periodic ( int M )
   : _ceiling ( M - 1 ) { } ;
   
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     rc_t fl_i ;
     if ( v < 0.0 )
@@ -717,7 +873,7 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
     if ( any_of ( v < rc_t(0) ) || any_of ( v >= _ceiling ) )
@@ -742,7 +898,7 @@ public:
   even_mapping_periodic ( int M )
   : _ceiling ( M - 1 ) { } ;
   
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     rc_t fl_i ;
     
@@ -766,7 +922,7 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
     if ( any_of ( v < rc_t(0) ) || any_of ( v >= _ceiling ) )
@@ -812,7 +968,7 @@ public:
   odd_mapping_constant ( int M )
   : _ceiling ( M - 1 ) { } ;
   
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     rc_t fl_i ;
     if ( v < 0.0 )
@@ -838,7 +994,7 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
     v ( v < rc_t ( 0 ) ) = rc_t ( 0 ) ;
@@ -860,7 +1016,7 @@ public:
   even_mapping_constant ( int M )
   : _ceiling ( M - 1 ) { } ;
   
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     if ( v < 0.0 )
     {
@@ -887,7 +1043,7 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
     v ( v < rc_t(0) ) = rc_t(0) ;
@@ -927,7 +1083,7 @@ public:
     _ceilx2 ( M + M - 4 )
     { } ;
   
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     v += rc_t ( 0.5 ) ;        // delete this to change back to origin at reflection
     rc_t fl_i ;
@@ -955,7 +1111,7 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
     v += rc_t ( 0.5 ) ;
@@ -993,7 +1149,7 @@ public:
     _ceilx2 ( M + M )
   { } ;
   
-  virtual void operator() ( rc_t v , ic_t& iv , rc_t& fv )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv )
   {
     v += rc_t ( 0.5 ) ;
     rc_t fl_i ;
@@ -1027,7 +1183,7 @@ public:
 
   /// this is the operator() for vectorized operation
 
-  virtual void operator() ( rc_v v , ic_v & iv , rc_v & fv )
+  void operator() ( rc_v v , ic_v & iv , rc_v & fv )
   {
     rc_v fl_i ;
     v += rc_t ( 0.5 ) ;
@@ -1074,11 +1230,31 @@ map_func pick_mapping ( bc_code bc , int spline_degree , int M )
         return odd_mapping_limit < ic_t , rc_t , vsize > ( M ) ;
         break ;
       }
+      case LIMITP :
+      {
+        return odd_mapping_limit < ic_t , rc_t , vsize > ( M + 1 ) ;
+        break ;
+      }
       case REJECT :
       {
         return odd_mapping_reject < ic_t , rc_t , vsize > ( M ) ;
         break ;
       }
+      case REJECTP :
+      {
+        return odd_mapping_reject < ic_t , rc_t , vsize > ( M + 1 ) ;
+        break ;
+      }
+      case TAG :
+      {
+        return odd_mapping_tag < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case TAGP :
+      {
+        return odd_mapping_tag < ic_t , rc_t , vsize > ( M + 1 ) ;
+        break ;
+      }
       case CONSTANT :
       case NATURAL :
       {
@@ -1122,11 +1298,31 @@ map_func pick_mapping ( bc_code bc , int spline_degree , int M )
         return even_mapping_limit < ic_t , rc_t , vsize > ( M ) ;
         break ;
       }
+      case LIMITP :
+      {
+        return even_mapping_limit < ic_t , rc_t , vsize > ( M + 1 ) ;
+        break ;
+      }
       case REJECT :
       {
         return even_mapping_reject < ic_t , rc_t , vsize > ( M ) ;
         break ;
       }
+      case REJECTP :
+      {
+        return even_mapping_reject < ic_t , rc_t , vsize > ( M + 1 ) ;
+        break ;
+      }
+      case TAG :
+      {
+        return even_mapping_tag < ic_t , rc_t , vsize > ( M ) ;
+        break ;
+      }
+      case TAGP :
+      {
+        return even_mapping_tag < ic_t , rc_t , vsize > ( M + 1 ) ;
+        break ;
+      }
       case NATURAL :
       case CONSTANT :
       {
@@ -1255,7 +1451,7 @@ struct nd_mapping
   /// apply the mapping along axis 'axis' to coordinate v, resulting in the setting
   /// of the integral part iv and the fraczional part fv
   
-  void operator() ( rc_t v , ic_t& iv , rc_t& fv , const int & axis )
+  void operator() ( rc_t v , ic_t& iv , rc_t& fv , const int & axis ) const
   {
     map [ axis ] . map ( v , iv , fv ) ;
   }
@@ -1263,7 +1459,7 @@ struct nd_mapping
   /// same operation, but along all axes, taking a multi-coordinate and setting
   /// the corresponding n-dimensional objects for the integral and fractional parts
   
-  void operator() ( nd_rc_t v , nd_ic_t& iv , nd_rc_t& fv )
+  void operator() ( nd_rc_t v , nd_ic_t& iv , nd_rc_t& fv ) const
   {
     for ( int axis = 0 ; axis < dimension ; axis++ )
       map [ axis ] . map ( v[axis] , iv[axis] , fv[axis] ) ;
@@ -1271,14 +1467,14 @@ struct nd_mapping
 
   /// different signature, now we handle a split_type object as the operation's target
 
-  void operator() ( rc_t v , split_t& s , const int & axis )
+  void operator() ( rc_t v , split_t& s , const int & axis ) const
   {
     map [ axis ] . map ( v , s.select[axis] , s.tune[axis] ) ;
   }
   
   /// the same in nD
   
-  void operator() ( nd_rc_t v , split_t& s )
+  void operator() ( nd_rc_t v , split_t& s ) const
   {
     for ( int axis = 0 ; axis < dimension ; axis++ )
       map [ axis ] . map ( v[axis] , s.select[axis] , s.tune[axis] ) ;
@@ -1292,7 +1488,7 @@ struct nd_mapping
   void operator() ( rc_v v ,
                     ic_v & iv ,
                     rc_v & fv ,
-                    const int & axis )
+                    const int & axis ) const
   {
     map [ axis ] . map_v ( v , iv , fv ) ;
   }
@@ -1301,7 +1497,7 @@ struct nd_mapping
   
   void operator() ( nd_rc_v v ,
                     nd_ic_v & iv ,
-                    nd_rc_v & fv )
+                    nd_rc_v & fv ) const
   {
     for ( int axis = 0 ; axis < dimension ; axis++ )
       map [ axis ] . map_v ( v[axis] , iv[axis] , fv[axis] ) ;
diff --git a/multithread.h b/multithread.h
index 21ab2c9..bb27cad 100644
--- a/multithread.h
+++ b/multithread.h
@@ -84,6 +84,10 @@
 
 namespace vspline
 {
+/// number of CPU cores in the system; per default, multithreading routines
+/// will perform their task with as many threads.
+
+const int ncores = std::thread::hardware_concurrency() ;
 
 /// given limit_type, we define range_type as a TinyVector of two limit_types,
 /// the first denoting the beginning of the range and the second it's end, with
@@ -471,4 +475,4 @@ void apply_v_point_func ( void (*pvfunc) ( typename view_type::value_type * ) ,
 
 } ; // end if namespace vspline
 
-#endif // #ifndef VSPLINE_MULTITHREAD_H
\ No newline at end of file
+#endif // #ifndef VSPLINE_MULTITHREAD_H
diff --git a/prefilter.h b/prefilter.h
index cff1266..e5e5560 100644
--- a/prefilter.h
+++ b/prefilter.h
@@ -151,104 +151,62 @@ using namespace vigra ;
 /// done very quickly, since no additional memory access is needed beyond a buffer's worth of data
 /// already present in core memory.
 ///
-/// solve_vigra and solve_vc are just thin wrappers around filter_nd in filter.h,
-/// injecting the actual number of poles, the poles themselves and a flag wether to
-/// use vector code or not.
-
-template < typename input_array_type ,      // type of array with knot point data
-           typename output_array_type >     // type of array for coefficients (may be the same)
-void solve_vigra ( input_array_type & input ,
-                   output_array_type & output ,
-                   TinyVector<bc_code,input_array_type::actual_dimension> bcv ,
-                   int degree ,
-                   double tolerance ,
-                   int nslices = ncores )
-{
-  filter_nd ( input ,
-              output ,
-              bcv ,
-              degree / 2 ,
-              precomputed_poles [ degree ] ,
-              tolerance ,
-              false ,
-              nslices ) ;
-}
+/// solve is just athin wrapper around filter_nd in filter.h, injecting the actual number of poles,
+/// the poles themselves and a flag wether to use vector code or not.
+///
+/// Note how smoothing comes into play here: it's done simply by
+/// prepending an additional pole to the filter cascade, taking a positive value between
+/// 0 (no smoothing) and 1 (total blur) if 'smoothing' is not 0.0. While I'm not sure about
+/// the precise mathematics (yet) this does what is intended very efficiently. Why smoothe?
+/// If the signal is scaled down when remapping, we'd have aliasing of higher frequencies
+/// into the output, producing artifacts. Pre-smoothing with an adequate factor removes the
+/// higher frequencies (more or less), avoiding the problem.
+///
+/// Using this simple method, pre-smoothing is computationally cheap, but the method used
+/// here isn't equivalent to convolving with a gaussian, though the effect is quite similar.
+/// I think the method is called exponential smoothing.
 
-/// overload of solve_vigra taking a single boundary condition code which is used
-/// for all axes. This also saves us a specific 1D version.
+// TODO: establish the proper maths for this smoothing method
 
 template < typename input_array_type ,      // type of array with knot point data
            typename output_array_type >     // type of array for coefficients (may be the same)
-void solve_vigra ( input_array_type & input ,
+void solve ( input_array_type & input ,
                    output_array_type & output ,
-                   bc_code bc ,
+                   TinyVector<bc_code,input_array_type::actual_dimension> bcv ,
                    int degree ,
                    double tolerance ,
+                   double smoothing = 0.0 ,
+                   bool use_vc = true ,
                    int nslices = ncores )
 {
-  filter_nd ( input ,
-              output ,
-              TinyVector<bc_code,input_array_type::actual_dimension> ( bc ) ,
-              degree / 2 ,
-              precomputed_poles [ degree ] ,
-              tolerance ,
-              false ,
-              nslices ) ;
+  if ( smoothing != 0.0 )
+  {
+    assert ( smoothing > 0.0 && smoothing < 1.0 ) ;
+    int npoles = degree / 2 + 1 ;
+    double pole [ npoles ] ;
+    pole[0] = smoothing ;
+    for ( int i = 1 ; i < npoles ; i++ )
+      pole[i] = precomputed_poles [ degree ] [ i - 1 ] ;
+    filter_nd ( input ,
+                output ,
+                bcv ,
+                npoles ,
+                pole ,
+                tolerance ,
+                use_vc ,
+                nslices ) ;
+  }
+  else
+    filter_nd ( input ,
+                output ,
+                bcv ,
+                degree / 2 ,
+                precomputed_poles [ degree ] ,
+                tolerance ,
+                use_vc ,
+                nslices ) ;
 }
 
-#ifdef USE_VC
-
-/// solve_vc is the equivalent to solve_vigra, but uses vectorization. If your machine
-/// supports vectorization (SSE, AVX...), this is the variant you should try if you can,
-/// to establish if you can get the process to speed up. This may not be the case - on my
-/// system (core i5, 4 cores, AVX2), depending on other parameters, sometimes this routine
-/// is faster than solve_vigra. Prefiltering is very memory-intensive, the amount of actual
-/// caclulation is, by contrast, small, so the effect of vectorization for prefiltering
-/// isn't as large as it's efect on evaluation.
-
-template < typename input_array_type ,      // type of array with knot point data
-           typename output_array_type >     // type of array for coefficients (may be the same)
-void solve_vc ( input_array_type & input ,
-                output_array_type & output ,
-                TinyVector<bc_code,input_array_type::actual_dimension> bcv ,
-                int degree ,
-                double tolerance ,
-                int nslices = ncores )
-{
-  filter_nd ( input ,
-              output ,
-              bcv ,
-              degree / 2 ,
-              precomputed_poles [ degree ] ,
-              tolerance ,
-              true ,
-              nslices ) ;
-}
-
-/// overload of solve_vc taking a single boundary condition code which is used
-/// for all axes. This also saves us a specific 1D version.
-
-template < typename input_array_type ,      // type of array with knot point data
-           typename output_array_type >     // type of array for coefficients (may be the same)
-void solve_vc ( input_array_type & input ,
-                output_array_type & output ,
-                bc_code bc ,
-                int degree ,
-                double tolerance ,
-                int nslices = ncores )
-{
-  filter_nd ( input ,
-              output ,
-              TinyVector<bc_code,input_array_type::actual_dimension> ( bc ) ,
-              degree / 2 ,
-              precomputed_poles [ degree ] ,
-              tolerance ,
-              true ,
-              nslices ) ;
-}
-
-#endif
-
 /// An interlude: restoration of the original knot point data from the spline coefficients.
 /// This is easily done by a simple convolution with the values of the basis function
 /// taken at discrete points inside the defined range.
diff --git a/remap.h b/remap.h
index 9383245..87fad06 100644
--- a/remap.h
+++ b/remap.h
@@ -109,6 +109,7 @@ template < class coordinate_type >
 struct coordinate_traits
 {
   typedef coordinate_type rc_type ;
+  typedef int ic_type ;
 } ;
 
 /// here we have a partial specialization of class coordinate_traits for coordinates
@@ -119,6 +120,7 @@ template < typename T , int N >
 struct coordinate_traits < vigra::TinyVector < T , N > >
 {
   typedef typename vigra::TinyVector < T , N >::value_type rc_type ;
+  typedef int ic_type ;
 } ;
 
 /// since remap can also operate with pre-split coordinates, we need another partial
@@ -128,8 +130,19 @@ template < int N , typename IT , typename FT >
 struct coordinate_traits < split_type < N , IT , FT > >
 {
   typedef typename split_type < N , IT , FT >::value_type rc_type ;
+  typedef int ic_type ;
 } ;
 
+/// declaring an appropriate evaluator is quite a mouthful, so we use
+/// this 'using' expresssion to mak it more handy:
+
+template < int dimension , typename value_type , typename coordinate_type >
+using evaluator_type
+= evaluator < dimension ,
+              value_type ,
+              typename coordinate_traits < coordinate_type > :: rc_type ,
+              typename coordinate_traits < coordinate_type > :: ic_type > ;
+
 /// st_remap is the single-thread routine to perform a remap with a given warp array
 /// _warp (containing coordinates) into a target array _output, which takes the interpolated
 /// values at the locations the coordinates indicate, using a bspline as interpolator.
@@ -155,7 +168,7 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
 void st_remap ( shape_range_type < dim_out >                 range ,
-                const bspline < value_type , dim_in >        *p_bspl ,
+                const evaluator_type < dim_in , value_type , coordinate_type > * const p_ev ,
                 MultiArrayView < dim_out , coordinate_type > *p_warp ,
                 MultiArrayView < dim_out , value_type >      *p_output ,
                 bool use_vc = true
@@ -177,9 +190,9 @@ void st_remap ( shape_range_type < dim_out >                 range ,
   auto warp = p_warp->subarray ( range[0] , range[1] ) ;
   auto output = p_output->subarray ( range[0] , range[1] ) ;
   
-  // create an evaluator from the b-spline
-
-  evaluator_type ev ( *p_bspl ) ;
+//   // create an evaluator from the b-spline
+// 
+//   evaluator_type ev ( *p_bspl ) ;
   
 #ifdef USE_VC
 
@@ -218,7 +231,7 @@ void st_remap ( shape_range_type < dim_out >                 range ,
         // best case: output array has consecutive memory, no need to buffer
         for ( int a = 0 ; a < aggregates ; a++ , source += vsize , destination += vsize )
         {
-          ev.v_eval ( source , destination ) ;
+          p_ev->v_eval ( source , destination ) ;
         }
         target_it += aggregates * vsize ;
       }
@@ -230,7 +243,7 @@ void st_remap ( shape_range_type < dim_out >                 range ,
         value_type target_buffer [ vsize ] ;
         for ( int a = 0 ; a < aggregates ; a++ , source += vsize )
         {
-          ev.v_eval ( source , target_buffer ) ;
+          p_ev->v_eval ( source , target_buffer ) ;
           for ( int e = 0 ; e < vsize ; e++ )
           {
             *target_it = target_buffer[e] ;
@@ -254,7 +267,7 @@ void st_remap ( shape_range_type < dim_out >                 range ,
             source_buffer[e] = *source_it ;
             ++source_it ;
           }
-          ev.v_eval ( source_buffer , destination ) ;
+          p_ev->v_eval ( source_buffer , destination ) ;
         }
         target_it += aggregates * vsize ;
       }
@@ -269,7 +282,7 @@ void st_remap ( shape_range_type < dim_out >                 range ,
             source_buffer[e] = *source_it ;
             ++source_it ;
           }
-          ev.v_eval ( source_buffer , target_buffer ) ;
+          p_ev->v_eval ( source_buffer , target_buffer ) ;
           for ( int e = 0 ; e < vsize ; e++ )
           {
             *target_it = target_buffer[e] ;
@@ -286,25 +299,26 @@ void st_remap ( shape_range_type < dim_out >                 range ,
   while ( leftover-- )
   {
     // process leftovers with single-value evaluation
-    ev.eval ( *source_it , *target_it ) ;
+    p_ev->eval ( *source_it , *target_it ) ;
     ++source_it ;
     ++target_it ;
   }
 }
 
-/// this remap variant performs the remap with several threads. The routine above is single-threaded,
-/// but all individual evaluations are independent of each other. So making it multi-threaded
-/// is trivial: just split the warp array and target array n-ways and have each thread process
-/// a chunk. Initially I used this approach directly, but later I switched to multithreading code
-/// which doesn't actually touch the containers, but instead only passes partitioning information
-/// to the individual threads, leaving it to them to split the containers accordingly, which
-/// results in simple and transparent code with only few limitations.
+/// this remap variant performs the remap with several threads. The routine above is
+/// single-threaded, but all individual evaluations are independent of each other.
+/// So making it multi-threaded is trivial: just split the warp array and target
+/// array n-ways and have each thread process a chunk. Initially I used this approach
+/// directly, but later I switched to multithreading code which doesn't actually touch
+/// the containers, but instead only passes partitioning information to the individual
+/// threads, leaving it to them to split the containers accordingly, which results in
+/// simple and transparent code with only few limitations.
 
 template < class value_type ,      // like, float, double, complex<>, pixels, TinyVectors etc.
            class coordinate_type , // usually float for coordinates
-           int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
-           int dim_out >           // number of dimensions of output array
-void remap ( const bspline < value_type , dim_in > & bspl ,
+           int dim_in ,            // dimensions of input array and of 'warp' coordinates
+           int dim_out >           // dimensions of output array
+void remap ( const evaluator_type < dim_in , value_type , coordinate_type > & ev ,
              MultiArrayView < dim_out , coordinate_type > & warp ,
              MultiArrayView < dim_out , value_type > & output ,
              bool use_vc = true ,
@@ -330,7 +344,7 @@ void remap ( const bspline < value_type , dim_in > & bspl ,
   // that we can't pass anything on by reference and use pointers instead.
 
   multithread ( & st_remap < value_type , coordinate_type , dim_in , dim_out > ,
-                 nthreads , range , &bspl , &warp , &output , use_vc ) ;
+                 nthreads , range , &ev , &warp , &output , use_vc ) ;
 } ;
 
 /// This is a variant of remap, which directly takes an array of values and remaps it,
@@ -370,10 +384,15 @@ int remap ( MultiArrayView < dim_in , value_type > input ,
   
   bsp.prefilter ( use_vc , nthreads , input ) ;
 
+  // create an evaluator over the bspline
+
+  evaluator_type < dim_in , value_type , coordinate_type > ev ( bsp ) ;
+  
   // and call the remap variant taking a bspline object,
-  // passing in the spline, the coordinate array and the target array
+  // passing in the evaluator, the coordinate array and the target array
   
-  remap < value_type , coordinate_type , dim_in , dim_out > ( bsp , warp , output , use_vc , nthreads ) ;
+  remap < value_type , coordinate_type , dim_in , dim_out >
+    ( ev , warp , output , use_vc , nthreads ) ;
     
   return 0 ;
 }
@@ -429,21 +448,21 @@ struct warper
 /// 'multithread'
 
 template < class value_type ,      // like, float, double, complex<>, pixels, TinyVectors etc.
-           class nd_ic_type , 
-           class nd_rc_type ,
+           class ic_type , 
+           class rc_type ,
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
 void st_wremap ( shape_range_type < dim_out > range ,
-                const bspline < value_type , dim_in > * bspl ,
-                MultiArrayView < dim_out , nd_ic_type > _select ,
-                MultiArrayView < dim_out , nd_rc_type > _tune ,
-                MultiArrayView < dim_out , value_type > _output ,
-                bool use_vc = true
-           )
+                 const evaluator < dim_in , value_type , rc_type , ic_type > * const p_ev ,
+                 MultiArrayView < dim_out , vigra::TinyVector < ic_type , dim_in > > _select ,
+                 MultiArrayView < dim_out , vigra::TinyVector < rc_type , dim_in > > _tune ,
+                 MultiArrayView < dim_out , value_type > _output ,
+                 bool use_vc = true
+               )
 {
   typedef bspline < value_type , dim_in > spline_type ;
-  typedef typename nd_rc_type::value_type rc_type ;
-  typedef typename nd_ic_type::value_type ic_type ;
+  typedef vigra::TinyVector < rc_type , dim_in > nd_rc_type ;
+  typedef vigra::TinyVector < ic_type , dim_in > nd_ic_type ;
   typedef evaluator < dim_in , value_type , rc_type , ic_type > evaluator_type ;
   typedef typename evaluator_type::ele_type ele_type ;
 
@@ -451,8 +470,6 @@ void st_wremap ( shape_range_type < dim_out > range ,
   auto tune = _tune.subarray ( range[0] , range[1] ) ;
   auto output = _output.subarray ( range[0] , range[1] ) ;
   
-  evaluator_type ev ( *bspl ) ;
-  
 #ifdef USE_VC
 
   typedef typename evaluator_type::ele_v ele_v ;
@@ -486,7 +503,7 @@ void st_wremap ( shape_range_type < dim_out > range ,
            a < aggregates ;
            a++ , pselect += vsize , ptune += vsize , destination += vsize )
       {
-        ev.v_eval ( (ic_type*) pselect , (rc_type*) ptune , destination ) ;
+        p_ev->v_eval ( (ic_type*) pselect , (rc_type*) ptune , destination ) ;
       }
       target_it += aggregates * vsize ;
     }
@@ -498,7 +515,7 @@ void st_wremap ( shape_range_type < dim_out > range ,
       value_type target_buffer [ vsize ] ;
       for ( int a = 0 ; a < aggregates ; a++ , pselect += vsize , ptune += vsize )
       {
-        ev.v_eval ( (ic_type*) pselect , (rc_type*) ptune , target_buffer ) ;
+        p_ev->v_eval ( (ic_type*) pselect , (rc_type*) ptune , target_buffer ) ;
         for ( int e = 0 ; e < vsize ; e++ )
         {
           *target_it = target_buffer[e] ;
@@ -516,7 +533,7 @@ void st_wremap ( shape_range_type < dim_out > range ,
   while ( leftover-- )
   {
     // process leftovers with single-value evaluation
-    ev.eval ( *select_it , *tune_it , *target_it ) ;
+    p_ev->eval ( *select_it , *tune_it , *target_it ) ;
     ++select_it ;
     ++tune_it ;
     ++target_it ;
@@ -530,11 +547,11 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            class rc_type ,         // real coordinate part
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
-void remap ( const bspline < value_type , dim_in > & bspl ,
-            warper < ic_type , rc_type , dim_out , dim_in > & warp ,
-            MultiArrayView < dim_out , value_type > output ,
-            bool use_vc = true ,
-            int nthreads = ncores
+void remap ( const evaluator < dim_in , value_type , rc_type , ic_type > & ev ,
+             warper < ic_type , rc_type , dim_out , dim_in > & warp ,
+             MultiArrayView < dim_out , value_type > output ,
+             bool use_vc = true ,
+             int nthreads = ncores
            )
 {
   typedef warper < ic_type , rc_type , dim_out , dim_in > warper_type ;
@@ -547,8 +564,8 @@ void remap ( const bspline < value_type , dim_in > & bspl ,
   }
   shape_range_type < dim_out > range ( shape_type < dim_out > () , output.shape() ) ;
   
-  multithread ( & st_wremap < value_type , nd_ic_type , nd_rc_type , dim_in , dim_out > ,
-                 nthreads , range , &bspl , warp.select , warp.tune , output , use_vc ) ;
+  multithread ( & st_wremap < value_type , ic_type , rc_type , dim_in , dim_out > ,
+                 nthreads , range , &ev , warp.select , warp.tune , output , use_vc ) ;
 } ;
 
 // Next we have some collateral code to get ready for the transformation-based remap
@@ -836,7 +853,7 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
 void st_tf_remap ( shape_range_type < dim_out > range ,
-                   const bspline < value_type , dim_in > * bspl ,
+                   const evaluator < dim_in , value_type , rc_type , int > * p_ev ,
                    transformation < value_type , rc_type , dim_in , dim_out > tf ,
                    MultiArrayView < dim_out , value_type > _output ,
                    bool use_vc = true
@@ -848,7 +865,6 @@ void st_tf_remap ( shape_range_type < dim_out > range ,
   
   auto output = _output.subarray ( range[0] , range[1] ) ;
   auto offset = range[0] ;
-  evaluator_type ev ( *bspl ) ;
 
 #ifdef USE_VC
 
@@ -866,8 +882,8 @@ void st_tf_remap ( shape_range_type < dim_out > range ,
   
 #endif
   
-  typedef typename CoupledIteratorType < dim_out , value_type > :: type Iterator ;
-  Iterator target_it = createCoupledIterator ( output ) ;
+//   typedef typename CoupledIteratorType < dim_out , value_type > :: type Iterator ;
+  auto target_it = createCoupledIterator ( output ) ;
   int leftover = output.elementCount() ;
   
 #ifdef USE_VC
@@ -888,16 +904,16 @@ void st_tf_remap ( shape_range_type < dim_out > range ,
       if ( output.isUnstrided() )
       {
         // finally, here we evaluate the spline
-        ev.v_eval ( c_out , destination ) ;
+        p_ev->v_eval ( c_out , destination ) ;
         destination += vsize ;
       }
       else
       {
         // alternative evaluation if we can't write straight to destination 
-        ev.v_eval ( c_out , target_buffer ) ;
+        p_ev->v_eval ( c_out , target_buffer ) ;
         for ( int e = 0 ; e < vsize ; e++ )
         {
-          target_it.get<1>() = target_buffer[e] ;
+          target_it.template get<1>() = target_buffer[e] ;
           ++target_it ;
         }
       }
@@ -915,9 +931,9 @@ void st_tf_remap ( shape_range_type < dim_out > range ,
   while ( leftover-- )
   {
     // process leftovers with single-value evaluation of transformed coordinate
-    cs_in = target_it.get<0>() + offset ;
+    cs_in = target_it.template get<0>() + offset ;
     tf ( cs_in , cs_out ) ;
-    ev.eval ( cs_out , target_it.get<1>() ) ;
+    p_ev->eval ( cs_out , target_it.template get<1>() ) ;
     ++target_it ;
   }
 }
@@ -930,17 +946,17 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            class rc_type ,         // float or double for coordinates
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
-void tf_remap ( const bspline < value_type , dim_in > & bspl ,
-               transformation < value_type , rc_type , dim_in , dim_out > tf ,
-               MultiArrayView < dim_out , value_type > output ,
-               bool use_vc = true ,
-               int nthreads = ncores            
-             )
+void tf_remap ( const evaluator < dim_in , value_type , rc_type , int > & ev ,
+                transformation < value_type , rc_type , dim_in , dim_out > tf ,
+                MultiArrayView < dim_out , value_type > output ,
+                bool use_vc = true ,
+                int nthreads = ncores            
+              )
 {
   shape_range_type < dim_out > range ( shape_type < dim_out > () , output.shape() ) ;
   
   multithread ( st_tf_remap < value_type , rc_type , dim_in , dim_out > ,
-                nthreads , range , &bspl , tf , output , use_vc ) ;
+                nthreads , range , &ev , tf , output , use_vc ) ;
 }
 
 /// this highest-level transform-based remap takes an input array and creates
@@ -952,22 +968,23 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            class rc_type , // usually float for coordinates
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
-int tf_remap ( MultiArrayView < dim_in , value_type > input ,
-               transformation < value_type , rc_type , dim_in , dim_out > tf ,
-               MultiArrayView < dim_out , value_type > output ,
-               bcv_type<dim_in> bcv = bcv_type<dim_in> ( MIRROR ) ,
-               int degree = 3 ,
-               bool use_vc = true ,
-               int nthreads = ncores            
-             )
+void tf_remap ( MultiArrayView < dim_in , value_type > input ,
+                transformation < value_type , rc_type , dim_in , dim_out > tf ,
+                MultiArrayView < dim_out , value_type > output ,
+                bcv_type<dim_in> bcv = bcv_type<dim_in> ( MIRROR ) ,
+                int degree = 3 ,
+                bool use_vc = true ,
+                int nthreads = ncores            
+              )
 {
   // create the bspline object
   bspline < value_type , dim_in > bsp ( input.shape() , degree , bcv ) ;
   // copy in the data
   bsp.prefilter ( use_vc , nthreads , input ) ;
-
+  // create an evaluator for this spline
+  evaluator < dim_in , value_type , rc_type , int > ev ( bsp ) ;
   tf_remap<value_type,rc_type,dim_in,dim_out>
-           ( bsp , tf , output , use_vc , nthreads ) ;
+           ( ev , tf , output , use_vc , nthreads ) ;
 }
 
 /// this function creates a warp array instead of evaluating a spline. The warp array can
@@ -982,12 +999,12 @@ template < class value_type ,      // like, float, double, complex<>, pixels, Ti
            int dim_in ,            // number of dimensions of input array (and of 'warp' coordinates)
            int dim_out >           // number of dimensions of output array
 void st_make_warp_array ( shape_range_type < dim_out > range ,
-                         transformation < value_type , rc_type , dim_in , dim_out > tf ,
-                         MultiArrayView < dim_out ,
-                                          vigra::TinyVector < rc_type , dim_in > >
+                          transformation < value_type , rc_type , dim_in , dim_out > tf ,
+                          MultiArrayView < dim_out ,
+                                           vigra::TinyVector < rc_type , dim_in > >
                            _warp_array ,
-                         bool use_vc = true
-                       )
+                          bool use_vc = true
+                        )
 {
   auto warp_array = _warp_array.subarray ( range[0] , range[1] ) ;
   auto offset = range[0] ;
@@ -1039,7 +1056,7 @@ void st_make_warp_array ( shape_range_type < dim_out > range ,
       for ( int e = 0 ; e < vsize ; e++ )
       {
         for ( int d = 0 ; d < dim_in ; d++ )
-          target_it.get<1>()[d] = target_buffer[d][e] ;
+          target_it.template get<1>()[d] = target_buffer[d][e] ;
         ++target_it ;
       }
     }
@@ -1051,9 +1068,9 @@ void st_make_warp_array ( shape_range_type < dim_out > range ,
   while ( leftover-- )
   {
     // process leftovers
-    cs_in = target_it.get<0>() + offset ;
+    cs_in = target_it.template get<0>() + offset ;
     tf ( cs_in , cs_out ) ;
-    target_it.get<1>() = cs_out ;
+    target_it.template get<1>() = cs_out ;
     ++target_it ;
   }
 }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git