[vspline] 19/72: stutter-free panning in pv

Kay F. Jahnke kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:38 UTC 2017


This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit a74968a6d3ae4fe6ecebc8098678fec85cc44b39
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Wed Dec 14 12:57:34 2016 +0100

    stutter-free panning in pv
    I found ways to avoid dropped frames in pv. I took several measures:
    - the frames are now rendered in a separate thread, with some rendering done
      ahead of time so that the overall rate is steadier
    - if there are still dropped frames, I decrease the apparent frame rate by
      showing each frame twice, thrice, etc. (see the sketch below the diffstat);
      with showing them twice I'm still at 30 fps, which looks smooth if the
      panning isn't too fast
    So now I can run fine with linear interpolation at 30 fps and with NN
    interpolation at 60 fps; only linear interpolation at 60 fps doesn't usually
    work, and in some situations (large fov) I need to switch to 30 fps and NN.
    So I run a 'technical' 60 fps all the time; the graphics system copes just
    fine - the dropped frames were really due to some frames arriving too late.
    Ah, and I implemented direct interpolation to RGBA, saving the intermediate
    float RGB image. This should also have helped with the speed-up.
---
 README.rst           |   4 +-
 bspline.h            |  39 +++--
 example/roundtrip.cc |   2 +-
 filter.h             | 412 ++++++++++++++++++++++++++++++---------------------
 interpolator.h       |  85 ++++++++---
 prefilter.h          |  25 ++--
 remap.h              |   4 +-
 7 files changed, 347 insertions(+), 224 deletions(-)
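
For illustration, here is a minimal sketch of the frame-doubling pacing described
in the commit message (all names are hypothetical stand-ins, not the actual pv
code): the presenter keeps running at the technical 60 Hz rate, but each rendered
frame is shown repeat_count times, so the apparent frame rate drops to
60/repeat_count fps.

    #include <chrono>
    #include <thread>
    #include <iostream>

    struct frame_type { int id ; } ;           // stand-in for a rendered frame

    // stand-in for pulling the next frame from the render thread's queue
    frame_type take_rendered_frame()
    {
      static int id = 0 ;
      return { id++ } ;
    }

    void display ( const frame_type & f )      // stand-in for the present call
    {
      std::cout << "showing frame " << f.id << "\n" ;
    }

    int main()
    {
      const int repeat_count = 2 ;             // 1 -> 60 fps, 2 -> 30 fps, 3 -> 20 fps
      const auto period = std::chrono::microseconds ( 1000000 / 60 ) ;
      auto next_vsync = std::chrono::steady_clock::now() ;
      for ( int n = 0 ; n < 120 ; n++ )        // 120 distinct frames
      {
        auto frame = take_rendered_frame() ;
        for ( int i = 0 ; i < repeat_count ; i++ )
        {
          display ( frame ) ;                  // same frame fills repeat_count slots
          next_vsync += period ;
          std::this_thread::sleep_until ( next_vsync ) ;
        }
      }
    }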

diff --git a/README.rst b/README.rst
index 70d2313..20e3621 100644
--- a/README.rst
+++ b/README.rst
@@ -30,8 +30,8 @@ provided via class 'bspline' defined in bspline.h, and via the remap functions
 defined in remap.h.
 
 While I made an attempt to write code which is portable, vspline is only tested with
-gcc on Linux. It may work in other environments, but it's unlikely it will do so
-without modification. An installation of Vigra_ is needed to compile, installation
+g++ and clang++ on Linux. It may work in other environments, but it's unlikely it will
+do so without modification. An installation of Vigra_ is needed to compile, installation
 of Vc_ is optional but recommended.
 
 vspline is relatively new, the current version might qualify as late beta.
diff --git a/bspline.h b/bspline.h
index 048b800..d2fda9b 100644
--- a/bspline.h
+++ b/bspline.h
@@ -173,17 +173,20 @@ struct bspline
   /// nD type for one boundary condition per axis
   typedef typename vigra::TinyVector < bc_code , dimension > bcv_type ;
 
-  /// type for pointer to a prefiltering method
-  typedef void (*p_solve) ( view_type& ,
-                            view_type& ,
-                            bcv_type ,
-                            int ,
-                            double ,
-                            double ,
-                            int ) ;
-
   /// elementary type of value_type, float or double
   typedef typename ExpandElementResult < value_type >::type real_type ;
+  
+  // for internal calculations in the filter, we use the elementary type of value_type.
+  // Note how in class bspline we make very specific choices about the
+  // source data type, the target data type and the type used for arithmetics: we use
+  // the same value_type for source and target array and perform the arithmetics with
+  // this value_type's elementary type. The filter itself is much more flexible; all of
+  // the three types can be different, the only requirements are that the value_types
+  // must be vigra element-expandable types with an elementary type that can be cast
+  // to and from math_type, and math_type must be a real data type, with the additional
+  // requirement that it can be vectorized by Vc if Vc is used.
+
+  typedef real_type math_type ; // arbitrary, can use float or double here.
 
 private:
 
@@ -304,6 +307,11 @@ public:
 // multiple of the Vc::Vector's Size for the given data type to have better-aligned
   // access. This may or may not help, has to be tested. We might also want to position
   // the origin of the brace to an aligned position, since evaluation is based there.
+  
+  // TODO: while the choice to keep the value_types and math_type closely related makes
+  // for simple code, with the more flexible formulation of the prefiltering code we might
+  // widen class bspline's scope to accept input of other types and/or use a different
+  // math_type.
 
   bspline ( shape_type _core_shape ,  ///< shape of knot point data
             int _spline_degree = 3 , ///< spline degree with reasonable default
@@ -443,7 +451,8 @@ public:
     {
       case UNBRACED:
         // only call the solver, don't do any bracing
-        solve ( data ,
+        solve < view_type , view_type , math_type >
+              ( data ,
                 core ,
                 bcv ,
                 spline_degree ,
@@ -456,7 +465,8 @@ public:
       case BRACED:
         // solve first, passing in BC codes to pick out the appropriate functions to
         // calculate the initial causal and anticausal coefficient, then brace result
-        solve ( data ,
+        solve < view_type , view_type , math_type >
+              ( data ,
                 core ,
                 bcv ,
                 spline_degree ,
@@ -479,7 +489,8 @@ public:
         // large frame size. This is debatable.
         for ( int d = 0 ; d < dimension ; d++ )
           br.apply ( container , bcv[d] , left_frame[d] , right_frame[d] , d ) ;
-        solve ( container ,
+        solve < view_type , view_type , math_type >
+              ( container ,
                 container ,
                 explicit_bcv ,
                 spline_degree ,
@@ -496,7 +507,8 @@ public:
         // modes. Note that if any data were passed into this routine, in this case
         // they will be silently ignored (makes no sense overwriting the core after
         // having manually framed it in some way)
-        solve ( container ,
+        solve < view_type , view_type , math_type >
+              ( container ,
                 container ,
                 explicit_bcv ,
                 spline_degree ,
@@ -581,6 +593,7 @@ public:
     else
     {
       // can't shift
+      std::cout << "can't shift" << std::endl ;
       d = 0 ;
     }
 
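The comment block added to bspline.h above describes the filter's new three-type
flexibility: source, target and math type may all differ. As a hedged sketch of
what a call to the reworked solver could look like (types chosen purely for
illustration, and assuming solve() and the bc codes live in namespace vspline as
prefilter.h suggests):

    #include <vigra/multi_array.hxx>
    #include "prefilter.h"   // assumed include; adjust to the actual layout

    int main()
    {
      // 8-bit knot point data in, float coefficients out, double arithmetic
      vigra::MultiArray < 2 , unsigned char > raw ( vigra::Shape2 ( 100 , 100 ) ) ;
      vigra::MultiArray < 2 , float > coeffs ( raw.shape() ) ;
      vigra::TinyVector < vspline::bc_code , 2 > bcv ( vspline::MIRROR ) ;

      vspline::solve < decltype ( raw ) , decltype ( coeffs ) , double >
        ( raw , coeffs , bcv , 3 , 0.0000001 ) ;
    }
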
diff --git a/example/roundtrip.cc b/example/roundtrip.cc
index 5c88ce2..cd0ce1f 100644
--- a/example/roundtrip.cc
+++ b/example/roundtrip.cc
@@ -393,7 +393,7 @@ void process_image ( char * name )
   for ( int b = 0 ; b < 4 ; b++ )
   {
     vspline::bc_code bc = bcs[b] ;
-    for ( int spline_degree = 2 ; spline_degree < 8 ; spline_degree++ )
+    for ( int spline_degree = 0 ; spline_degree < 8 ; spline_degree++ )
     {
       cout << "testing bc code " << vspline::bc_name[bc]
           << " spline degree " << spline_degree << endl ;
diff --git a/filter.h b/filter.h
index cadd889..851ac74 100644
--- a/filter.h
+++ b/filter.h
@@ -128,7 +128,7 @@ class filter
                   "prefilter input and output iterator must have the same value_type" ) ;
   
 //   // both iterators should be random access iterators.
-//   // currently not enforced, too lazy to code the traits for vector stripe iterators...
+//   // currently not enforced
 //   typedef typename std::iterator_traits < in_iter > :: iterator_category in_cat ;
 //   static_assert ( std::is_same < in_cat , std::random_access_iterator_tag > :: value ,
 //                   "prefilter input iterator must be random access iterator"  ) ;
@@ -136,7 +136,6 @@ class filter
 //   typedef typename std::iterator_traits < out_iter > :: iterator_category out_cat ;
 //   static_assert ( std::is_same < out_cat , std::random_access_iterator_tag > :: value ,
 //                   "prefilter output iterator must be random access iterator" ) ;
-                  
   
   /// typedef the fully qualified type for brevity, to make the typedefs below
   /// a bit more legible
@@ -798,35 +797,36 @@ filter ( int _M ,               ///< number of input/output elements (DataLength
 // on forward declarations. The section of code immediately following doesn't use vectorization,
 // the vector code follows.
 
-/// 'monadic' gather and scatter. gather picks up count T which are stride apart,
+/// 'monadic' gather and scatter. gather picks up count source_type values which are stride apart,
 /// starting at source and depositing compactly at target. scatter performs the reverse
-/// operation
-
-template < typename T > void
-gather ( const T* source ,
-         T* target ,
-         const int & stride ,
-         int count
-       )
+/// operation. source_type and target_type can be different; on assignment source_type is
+/// simply cast to target_type.
+
+template < typename source_type , typename target_type = source_type >
+void gather ( const source_type* source ,
+              target_type* target ,
+              const int & stride ,
+              int count
+            )
 {
   while ( count-- )
   {
-    *target = *source ;
+    *target = target_type ( *source ) ;
     source += stride ;
     ++target ;
   }
 }
 
-template < typename T > void
-scatter ( const T* source ,
-          T* target ,
-          const int & stride ,
-          int count
-        )
+template < typename source_type , typename target_type = source_type >
+void scatter ( const source_type* source ,
+               target_type* target ,
+               const int & stride ,
+               int count
+             )
 {
   while ( count-- )
   {
-    *target = *source ;
+    *target = target_type ( *source ) ;
     ++source ;
     target += stride ;
   }
@@ -840,11 +840,12 @@ scatter ( const T* source ,
 /// one pole, so there is no special treatment here for low-order filtering (TODO confirm)
 /// note the use of range_type<T>, which is from multithread.h
 
-template < class source_type ,
-           class target_type >
-void nonaggregating_filter ( range_type < typename source_type::difference_type > range ,
-                             source_type original_source ,
-                             target_type original_target ,
+template < class source_view_type ,
+           class target_view_type ,
+           class math_type >
+void nonaggregating_filter ( range_type < typename source_view_type::difference_type > range ,
+                             source_view_type * p_original_source ,
+                             target_view_type * p_original_target ,
                              int axis ,
                              double gain ,
                              bc_code bc ,
@@ -853,30 +854,29 @@ void nonaggregating_filter ( range_type < typename source_type::difference_type
                              double tolerance
                            )
 {
-  const int dim = source_type::actual_dimension ;
-  typedef typename source_type::value_type value_type ;
-  typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
+  typedef typename source_view_type::value_type source_type ;
+  typedef typename target_view_type::value_type target_type ;
 
   // we're in the single-threaded code now. multithread() has simply forwarded
   // the source and target MultiArrayViews and a range, here we use the range
   // to pick out the subarrays of original_source and original_target which we
   // are meant to process in this thread:
 
-  auto source = original_source.subarray ( range[0] , range[1] ) ;
-  auto target = original_target.subarray ( range[0] , range[1] ) ;
-
+  const auto source = p_original_source->subarray ( range[0] , range[1] ) ;
+  auto target = p_original_target->subarray ( range[0] , range[1] ) ;
+  
   int count = source.shape ( axis ) ; 
 
   /// we use a buffer of count value_types
 
-  vigra::MultiArray < 1 , value_type > buffer ( count ) ;
+  vigra::MultiArray < 1 , math_type > buffer ( count ) ;
 
   // avoiding being specific about the iterator's type allows us to slot in
   // any old iterator we can get by calling begin() on buffer 
   
   typedef decltype ( buffer.begin() ) iter_type ;
-  typedef filter < iter_type , iter_type , value_type > filter_type ;
-  filter_type solver ( source.shape(axis) , gain , bc , nbpoles , pole , tolerance ) ;
+  typedef filter < iter_type , iter_type , math_type > filter_type ;
+  filter_type solver ( count , gain , bc , nbpoles , pole , tolerance ) ;
 
   // offset into the source slice which will be used for gather/scatter
 
@@ -886,9 +886,9 @@ void nonaggregating_filter ( range_type < typename source_type::difference_type
 
   int source_stride = source.stride ( axis ) ;
 
-  value_type * source_base_adress = source.data() ;
-  value_type * buffer_base_adress = buffer.data() ;
-  value_type * target_base_adress = target.data() ;
+  auto source_base_adress = source.data() ;
+  auto buffer_base_adress = buffer.data() ;
+  auto target_base_adress = target.data() ;
 
   if ( source.stride() == target.stride() )
   {
@@ -917,22 +917,28 @@ void nonaggregating_filter ( range_type < typename source_type::difference_type
 
     while ( source_sliter < source_sliter_end )
     {
-      // copy from the array to the buffer with a monadic gather
+      // copy from the array to the buffer with a monadic gather, casting to
+      // math_type in the process
       
       source_index = &(*source_sliter) - source_base_adress ;
       
-      gather<value_type> ( source_base_adress + source_index , buffer_base_adress ,
-                           source_stride , count ) ;
+      gather < source_type , math_type > ( source_base_adress + source_index ,
+                                           buffer_base_adress ,
+                                           source_stride ,
+                                           count ) ;
                               
       // finally (puh): apply the prefilter, using the solver in-place, iterating over
       // the vectors in buffer with maximum efficiency.
                               
       solver.solve ( buffer.begin() ) ;
       
-      // and perform a monadic scatter to write the filtered data to the destination
+      // and perform a monadic scatter to write the filtered data to the destination,
+      // casting to target_type in the process
 
-      scatter<value_type> ( buffer_base_adress , target_base_adress + source_index  ,
-                            source_stride , count ) ;
+      scatter< math_type , target_type > ( buffer_base_adress ,
+                                           target_base_adress + source_index ,
+                                           source_stride ,
+                                           count ) ;
       ++source_sliter ;
     }
   }
@@ -958,11 +964,17 @@ void nonaggregating_filter ( range_type < typename source_type::difference_type
       source_index = &(*source_sliter) - source_base_adress ;
       target_index = &(*target_sliter) - target_base_adress ;
       
-      gather<value_type> ( source_base_adress + source_index , buffer_base_adress ,
-                           source_stride , count ) ;
+      gather < source_type , math_type > ( source_base_adress + source_index ,
+                                           buffer_base_adress ,
+                                           source_stride ,
+                                           count ) ;
+                                           
       solver.solve ( buffer.begin() ) ;
-      scatter<value_type> ( buffer_base_adress , target_base_adress + target_index  ,
-                            target_stride , count ) ;
+      
+      scatter< math_type , target_type > ( buffer_base_adress ,
+                                           target_base_adress + target_index ,
+                                           target_stride ,
+                                           count ) ;
       ++source_sliter ;
       ++target_sliter ;
     }
@@ -1003,16 +1015,17 @@ bool sequential ( const IT & indexes )
 /// relatively cheap as it occurs only once, while the inner loop can just
 /// rip away.
 
-template < typename VT > void
-gather ( const typename VT::value_type * source ,
-         typename VT::value_type * target ,
-         const typename VT::IndexType & indexes ,
-         const typename VT::Mask & mask ,
+template < typename source_type , typename target_type = source_type >
+void
+gather ( const typename source_type::value_type * source ,
+         typename target_type::value_type * target ,
+         const typename source_type::IndexType & indexes ,
+         const typename source_type::Mask & mask ,
          const int & stride ,
          int count
        )
 {
-  register VT x ;
+  register source_type x ;
   if ( mask.isFull() )
   {
 // this is strange: using this code is actually slower on my system:
@@ -1034,9 +1047,10 @@ gather ( const typename VT::value_type * source ,
       while ( count-- )
       {
         x.gather ( source , indexes ) ;
-        x.store ( target , Vc::Aligned ) ;
+        auto tx = target_type ( x ) ;
+        tx.store ( target , Vc::Aligned ) ;
         source += stride ;
-        target += VT::Size ;
+        target += target_type::Size ;
       }
     }
   }
@@ -1048,22 +1062,24 @@ gather ( const typename VT::value_type * source ,
     {
       x.gather ( source , indexes , mask ) ;
       source += stride ;
-      x.store ( target , Vc::Aligned ) ;
-      target += VT::Size ;
+      auto tx = target_type ( x ) ;
+      tx.store ( target , Vc::Aligned ) ;
+      target += target_type::Size ;
     }
   }
 }
 
-template < typename VT > void
-scatter ( const typename VT::value_type * source ,
-          typename VT::value_type * target ,
-          const typename VT::IndexType & indexes ,
-          const typename VT::Mask & mask ,
+template < typename source_type , typename target_type = source_type >
+void
+scatter ( const typename source_type::value_type * source ,
+          typename target_type::value_type * target ,
+          const typename target_type::IndexType & indexes ,
+          const typename target_type::Mask & mask ,
           const int & stride ,
           int count
         )
 {
-  register VT x ;
+  register source_type x ;
   if ( mask.isFull() )
   {
 // this is strange: using this code is actually slower on my system:
@@ -1085,8 +1101,9 @@ scatter ( const typename VT::value_type * source ,
       while ( count-- )
       {
         x.load ( source , Vc::Aligned ) ;
-        x.scatter ( target , indexes ) ;
-        source += VT::Size ;
+        auto tx = target_type ( x ) ;
+        tx.scatter ( target , indexes ) ;
+        source += source_type::Size ;
         target += stride ;
       }
     }
@@ -1098,8 +1115,9 @@ scatter ( const typename VT::value_type * source ,
     while ( count-- )
     {
       x.load ( source , Vc::Aligned ) ;
-      source += VT::Size ;
-      x.scatter ( target , indexes , mask ) ;
+      source += source_type::Size ;
+      auto tx = target_type ( x ) ;
+      tx.scatter ( target , indexes , mask ) ;
       target += stride ;
     }
   }
@@ -1118,10 +1136,11 @@ scatter ( const typename VT::value_type * source ,
 /// It's used by aggregating_filter, either directly if the arrays' value_type is already
 /// an elementary type, or after element-expanding the array(s).
 
-template < class source_type ,
-           class target_type >
-void ele_aggregating_filter ( source_type &source ,
-                              target_type &target ,
+template < typename source_view_type ,
+           typename target_view_type ,
+           typename math_type >
+void ele_aggregating_filter ( source_view_type &source ,
+                              target_view_type &target ,
                               int axis ,
                               double gain ,
                               bc_code bc ,
@@ -1130,18 +1149,22 @@ void ele_aggregating_filter ( source_type &source ,
                               double tolerance
                             )
 {
-  typedef typename source_type::value_type ele_type ;    // elementary type in source
+//   typedef typename source_type::value_type ele_type ;    // elementary type in source
 
-  // we have a traits class fixing the simdized type (in common.h):
-  
-//   typedef typename vector_traits < ele_type > :: type simdized_type ;
+  // for prefiltering, using Vc::Vectors seems faster than using SimdArrays of twice the size,
+  // which are used as simdized type in evaluation
 
-  // for prefiltering, using Vc:Vectors seems faster than using SimdArrays of twice the size
-  typedef typename Vc::Vector < ele_type > simdized_type ;
-
-  const int vsize ( simdized_type::Size ) ;              // number of ele_type in a simdized_type
-  typedef typename simdized_type::IndexType index_type ; // indexes for gather/scatter
-  typedef typename simdized_type::MaskType mask_type ;   // and mask for masked operation
+  typedef typename Vc::Vector < math_type > simdized_math_type ;
+  const int vsize ( simdized_math_type::Size ) ;              // number of math_type in a simdized_type
+  
+  typedef typename source_view_type::value_type source_type ;
+  typedef typename Vc::SimdArray < source_type , vsize > simdized_source_type ;
+  
+  typedef typename target_view_type::value_type target_type ;
+  typedef typename Vc::SimdArray < target_type , vsize > simdized_target_type ;
+  
+  typedef typename simdized_math_type::IndexType index_type ; // indexes for gather/scatter
+  typedef typename simdized_math_type::MaskType mask_type ;   // and mask for masked operation
   
   int count = source.shape ( axis ) ;                    // number of vectors we'll process
 
@@ -1152,7 +1175,8 @@ void ele_aggregating_filter ( source_type &source ,
   
   // Vc::Memory < simdized_type > buffer ( count * vsize ) ; // doesn't work for me
 
-  vigra::MultiArray < 1 , simdized_type , Vc::Allocator<simdized_type> > buffer ( count ) ;
+  vigra::MultiArray < 1 , simdized_math_type , Vc::Allocator<simdized_math_type> >
+    buffer ( count ) ;
 
   // avoiding being specific about the iterator's type allows us to slot in
   // any old iterator we can get by calling begin() on buffer 
@@ -1172,21 +1196,21 @@ void ele_aggregating_filter ( source_type &source ,
   int source_stride = source.stride ( axis ) ;
 
   // we want to use the extended gather/scatter (with 'extrusion'), so we need the
-  // source and target pointers. Casting buffer's data pointer to element_type is safe,
-  // Since the simdized_type objects stored there are merely raw elemnt_type data
+  // source and target pointers. Casting buffer's data pointer to math_type is safe,
+  // since the simdized_type objects stored there are merely raw math_type data
   // in disguise.
 
-  ele_type * source_base_adress = source.data() ;
-  ele_type * buffer_base_adress = (ele_type*) (buffer.data()) ;
-  ele_type * target_base_adress = target.data() ;
+  auto source_base_adress = source.data() ;
+  auto buffer_base_adress = (math_type*) (buffer.data()) ;
+  auto target_base_adress = target.data() ;
 
   // we create a solver object capable of handling the iterator producing the successive
   // simdized_types from the buffer. While the unvectorized code can omit passing the third
   // template argument (the elementary type used inside the solver) we pass it here, as we
   // don't define an element-expansion via vigra::ExpandElementResult for simdized_type.
 
-  typedef filter < viter_type , viter_type , ele_type > filter_type ;
-  filter_type solver ( source.shape(axis) , gain , bc , nbpoles , pole , tolerance ) ;
+  typedef filter < viter_type , viter_type , math_type > filter_type ;
+  filter_type solver ( count , gain , bc , nbpoles , pole , tolerance ) ;
 
   if ( source.stride() == target.stride() )
   {
@@ -1227,12 +1251,17 @@ void ele_aggregating_filter ( source_type &source ,
         
         // have got less than vsize only? mask the operation
         
-        mask = ( simdized_type::IndexesFromZero() < e ) ;
+        mask = ( simdized_math_type::IndexesFromZero() < e ) ;
       
       // perform extended gather with extrusion parameters
       
-      gather<simdized_type> ( source_base_adress , buffer_base_adress ,
-                              source_indexes , mask , source_stride , count ) ;
+      gather < simdized_source_type , simdized_math_type >
+        ( source_base_adress ,
+          buffer_base_adress ,
+          source_indexes ,
+          mask ,
+          source_stride ,
+          count ) ;
                               
       // finally (puh): apply the prefilter, using the solver in-place, iterating over
       // the vectors in buffer with maximum efficiency.
@@ -1242,8 +1271,13 @@ void ele_aggregating_filter ( source_type &source ,
       // and perform extended scatter with extrusion parameters to write the filtered data
       // to the destination
 
-      scatter<simdized_type> ( buffer_base_adress , target_base_adress ,
-                               source_indexes , mask , source_stride , count ) ;
+      scatter < simdized_math_type , simdized_target_type >
+        ( buffer_base_adress ,
+          target_base_adress ,
+          source_indexes ,
+          mask ,
+          source_stride ,
+          count ) ;
     }
   }
   else
@@ -1272,12 +1306,22 @@ void ele_aggregating_filter ( source_type &source ,
         target_indexes[e] = &(*target_sliter) - target_base_adress ;
       }
       if ( e < vsize )
-        mask = ( simdized_type::IndexesFromZero() < e ) ;
-      gather<simdized_type> ( source_base_adress , buffer_base_adress ,
-                              source_indexes , mask , source_stride , count ) ;
+        mask = ( simdized_math_type::IndexesFromZero() < e ) ;
+      gather < simdized_source_type , simdized_math_type >
+        ( source_base_adress ,
+          buffer_base_adress ,
+          source_indexes ,
+          mask ,
+          source_stride ,
+          count ) ;
       solver.solve ( buffer.begin() ) ;
-      scatter<simdized_type> ( buffer_base_adress , target_base_adress ,
-                               target_indexes , mask , target_stride , count ) ;
+      scatter < simdized_math_type , simdized_target_type >
+        ( buffer_base_adress ,
+          target_base_adress ,
+          target_indexes ,
+          mask ,
+          target_stride ,
+          count ) ;
     }
   }
 }
@@ -1292,10 +1336,11 @@ void ele_aggregating_filter ( source_type &source ,
 /// for further processing.
 
 template < class source_type ,
-           class target_type >
+           class target_type ,
+           typename math_type >
 void aggregating_filter ( range_type < typename source_type::difference_type > range ,
-                          source_type original_source ,
-                          target_type original_target ,
+                          source_type * p_original_source ,
+                          target_type * p_original_target ,
                           int axis ,
                           double gain ,
                           bc_code bc ,
@@ -1312,8 +1357,8 @@ void aggregating_filter ( range_type < typename source_type::difference_type > r
 
   // continue processing on the subarrays of source and target specified by 'range':
 
-  auto source = original_source.subarray ( range[0] , range[1] ) ;
-  auto target = original_target.subarray ( range[0] , range[1] ) ;
+  auto source = p_original_source->subarray ( range[0] , range[1] ) ;
+  auto target = p_original_target->subarray ( range[0] , range[1] ) ;
   
   if ( vigra::ExpandElementResult < value_type > :: size != 1 )
   {
@@ -1322,9 +1367,17 @@ void aggregating_filter ( range_type < typename source_type::difference_type > r
     // arrays with elementary types.
     auto expanded_source = source.expandElements ( 0 ) ;
     auto expanded_target = target.expandElements ( 0 ) ;
-    ele_aggregating_filter ( expanded_source , expanded_target ,
-                             axis + 1 , gain , bc ,
-                             nbpoles , pole , tolerance ) ;
+    ele_aggregating_filter < decltype ( expanded_source ) ,
+                             decltype ( expanded_target ) ,
+                             math_type >
+                ( expanded_source ,
+                  expanded_target ,
+                  axis + 1 ,
+                  gain ,
+                  bc ,
+                  nbpoles ,
+                  pole ,
+                  tolerance ) ;
   }
   else
   {
@@ -1337,9 +1390,10 @@ void aggregating_filter ( range_type < typename source_type::difference_type > r
       es ( source.shape() , source.stride() , (ele_type*)(source.data()) ) ;
     vigra::MultiArrayView < target.actual_dimension , ele_type >
       et ( target.shape() , target.stride() , (ele_type*)(target.data()) ) ;
-    ele_aggregating_filter ( es , et ,
-                             axis , gain , bc ,
-                             nbpoles , pole , tolerance ) ;
+    ele_aggregating_filter < decltype ( es ) ,
+                             decltype ( es ) ,
+                             math_type >
+      ( es , et , axis , gain , bc , nbpoles , pole , tolerance ) ;
   }
 }
 
@@ -1353,9 +1407,10 @@ void aggregating_filter ( range_type < typename source_type::difference_type > r
 /// partial specialization for 1D arrays, where all of our usual schemes of multithreading and
 /// vectorization don't intrinsically work and we have to employ a different method, see there.
 
-template < typename input_array_type ,      ///< type of array with knot point data
-           typename output_array_type ,     ///< type of array for coefficients (may be the same)
-           int dim = input_array_type::actual_dimension >
+template < typename input_array_type ,  ///< type of array with knot point data
+           typename output_array_type , ///< type of array for coefficients (may be the same)
+           typename math_type ,         ///< real data type used for calculations inside the filter
+           int dim >
 class filter_1d
 {
 public:
@@ -1383,8 +1438,8 @@ public:
   
   typedef
   std::function < void ( range_t ,
-                         input_array_type ,
-                         output_array_type ,
+                         input_array_type * ,
+                         output_array_type * ,
                          int ,
                          double ,
                          bc_code ,
@@ -1407,26 +1462,32 @@ public:
 //   
 // #endif
 
-  auto pf = & nonaggregating_filter < input_array_type , output_array_type > ;
+  auto pf = & nonaggregating_filter < input_array_type ,
+                                      output_array_type ,
+                                      typename input_array_type::value_type > ; // for now...
   
 #ifdef USE_VC
 
+  typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;
+
   if ( use_vc )
-    pf = &aggregating_filter < input_array_type , output_array_type > ;
+    pf = & aggregating_filter < input_array_type ,
+                                output_array_type ,
+                                ele_type > ;
 
 #endif
   
   // obtain a partitioning of the data array into subranges. We do this 'manually' here
   // because we must instruct shape_splitter not to chop up the current processing axis
   // (by passing the 'forbid' parameter, which is taken as -1 (no effect) in the default
-  // partitioning of nD arrays.
+  // partitioning of nD arrays).
 
   partition_t partitioning = shape_splitter < dim > :: part ( input.shape() , nslices , axis ) ;
   
   multithread ( pf ,
                 partitioning ,
-                input ,
-                output ,
+                &input ,
+                &output ,
                 axis ,
                 gain ,
                 bc ,
@@ -1443,23 +1504,25 @@ public:
 /// which contain wrong results due to the fact that some boundary condition appropriate
 /// for the 2D case was applied.
 
-template < typename input_array_type ,      ///< type of array with knot point data
-           typename output_array_type >     ///< type of array for coefficients (may be the same)
+template < typename input_array_type ,  ///< type of array with knot point data
+           typename output_array_type , ///< type of array for coefficients (may be the same)
+           typename math_type >         ///< type for calculations inside filter
 class filter_1d < input_array_type ,
                   output_array_type ,
+                  math_type ,
                   1 >                 // specialize for 1D
 {
 public:
   void operator() ( input_array_type &input ,   ///< source data. can operate in-place
-                   output_array_type &output ,  ///< where input == output.
-                   int axis ,
-                   double gain ,
-                   bc_code bc ,                 ///< boundary treatment for this solver
-                   int nbpoles ,
-                   const double * pole ,
-                   double tolerance ,
-                   bool use_vc = true ,
-                   int nslices = ncores )  ///< number of threads to use
+                    output_array_type &output ,  ///< where input == output.
+                    int axis ,
+                    double gain ,
+                    bc_code bc ,                 ///< boundary treatment for this solver
+                    int nbpoles ,
+                    const double * pole ,
+                    double tolerance ,
+                    bool use_vc = true ,
+                    int nslices = ncores )  ///< number of threads to use
 {
   typedef typename input_array_type::value_type value_type ;
   typedef decltype ( input.begin() ) input_iter_type ;
@@ -1628,16 +1691,17 @@ public:
 
   vigra::MultiArray < 2 , value_type > margin_buffer ( fake_2d_margin.shape() ) ;
   
-  filter_1d < fake_2d_type , fake_2d_type , 2 > () ( fake_2d_margin ,
-                                                     margin_buffer ,
-                                                     0 ,
-                                                     gain ,
-                                                     IGNORE ,
-                                                     nbpoles ,
-                                                     pole ,
-                                                     tolerance ,
-                                                     0 ,
-                                                     1 ) ;
+  filter_1d < fake_2d_type , fake_2d_type , math_type , 2 > ()
+    ( fake_2d_margin ,
+      margin_buffer ,
+      0 ,
+      gain ,
+      GUESS ,
+      nbpoles ,
+      pole ,
+      tolerance ,
+      0 ,
+      1 ) ;
  
   // now we have filtered data for the margins in margin_buffer, of which the central half
   // is usable, the remainder being runup data which we'll ignore. Here's a view to the
@@ -1671,16 +1735,17 @@ public:
   
   // now we filter the fake 2D source to the fake 2D target
 
-  filter_1d < fake_2d_type , fake_2d_type , 2 > () ( fake_2d_source ,
-                                                     fake_2d_target ,
-                                                     0 ,
-                                                     gain ,
-                                                     IGNORE ,
-                                                     nbpoles ,
-                                                     pole ,
-                                                     tolerance ,
-                                                     use_vc ,
-                                                     nslices ) ;
+  filter_1d < fake_2d_type , fake_2d_type , math_type , 2 > ()
+    ( fake_2d_source ,
+      fake_2d_target ,
+      0 ,
+      gain ,
+      GUESS ,
+      nbpoles ,
+      pole ,
+      tolerance ,
+      use_vc ,
+      nslices ) ;
 
  // we now have filtered data in target, but the stripes along the margin
   // in x-direction (1 runup wide) are wrong, because we applied IGNORE BC.
@@ -1719,8 +1784,9 @@ public:
 /// - use_vc: flag whether to use vector code or not (if present)
 /// - nslices: number of threads to use (per default as many as physical cores)
 
-template < typename input_array_type ,      // type of array with knot point data
-           typename output_array_type >     // type of array for coefficients (may be the same)
+template < typename input_array_type ,  // type of array with knot point data
+           typename output_array_type , // type of array for coefficients (may be the same)
+           typename math_type >         // type used for arithmetic operations in filter
 void filter_nd ( input_array_type & input ,
                  output_array_type & output ,
                  vigra::TinyVector<bc_code,input_array_type::actual_dimension> bc ,
@@ -1779,16 +1845,17 @@ void filter_nd ( input_array_type & input ,
   // even if degree <= 1, we'll only arrive here if input != output.
   // So we still have to copy the input data to the output (solve_identity)
   
-  filter_1d<input_array_type,output_array_type,dim>() ( input ,
-                                                        output ,
-                                                        0 ,
-                                                        gain_d0 ,
-                                                        bc[0] ,
-                                                        nbpoles ,
-                                                        pole ,
-                                                        tolerance ,
-                                                        use_vc ,
-                                                        nslices ) ;
+  filter_1d < input_array_type , output_array_type , math_type , dim > ()
+    ( input ,
+      output ,
+      0 ,
+      gain_d0 ,
+      bc[0] ,
+      nbpoles ,
+      pole ,
+      tolerance ,
+      use_vc ,
+      nslices ) ;
 
   // but if degree <= 1 we're done already, since copying the data again
   // in dimensions 1... is futile
@@ -1797,16 +1864,17 @@ void filter_nd ( input_array_type & input ,
   {
     // so for the remaining dimensions we also call the filter.
     for ( int d = 1 ; d < dim ; d++ )
-      filter_1d<output_array_type,output_array_type,dim>() ( output ,
-                                                             output ,
-                                                             d ,
-                                                             gain_dn ,
-                                                             bc[d] ,
-                                                             nbpoles ,
-                                                             pole ,
-                                                             tolerance ,
-                                                             use_vc ,
-                                                             nslices ) ;
+      filter_1d < output_array_type , output_array_type , math_type , dim > ()
+        ( output ,
+          output ,
+          d ,
+          gain_dn ,
+          bc[d] ,
+          nbpoles ,
+          pole ,
+          tolerance ,
+          use_vc ,
+          nslices ) ;
   }
 }
 
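The reworked gather/scatter templates in this file cast between source_type and
target_type on assignment, which is the mechanism behind the 'direct interpolation
to RGBA' mentioned in the commit message. A standalone usage sketch (with a local
copy of the monadic gather from the diff above; the RGBA framing is only an
illustration):

    #include <iostream>

    // local copy of the two-type monadic gather shown in the diff
    template < typename source_type , typename target_type = source_type >
    void gather ( const source_type* source ,
                  target_type* target ,
                  const int & stride ,
                  int count )
    {
      while ( count-- )
      {
        *target = target_type ( *source ) ;
        source += stride ;
        ++target ;
      }
    }

    int main()
    {
      unsigned char rgba [ 32 ] ;              // 8 interleaved RGBA pixels
      for ( int i = 0 ; i < 32 ; i++ )
        rgba [ i ] = i ;

      float red [ 8 ] ;
      // pick up every fourth value (the R channel), casting uchar -> float
      gather < unsigned char , float > ( rgba , red , 4 , 8 ) ;

      for ( int i = 0 ; i < 8 ; i++ )
        std::cout << red [ i ] << " " ;        // prints 0 4 8 ... 28
      std::cout << "\n" ;
    }
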
diff --git a/interpolator.h b/interpolator.h
index 3aae68f..8898ac6 100644
--- a/interpolator.h
+++ b/interpolator.h
@@ -123,13 +123,41 @@ struct interpolator
   virtual void v_eval ( const nd_rc_v & input ,         // simdized nD coordinate
                         mc_ele_v & result ) const = 0 ; // simdized value_type
 
-  /// require overload of v_eval taking pointers to vsize coordinates and
-  /// vsize values, expecting these data to be contiguous in memory. With this
-  /// variant, the implementation of this v_eval takes responsibility for
-  /// (de)interleaving the data.
+  /// v_eval taking pointers to vsize coordinates and vsize values,
+  /// expecting these data to be contiguous in memory. This variant
+  /// provides (de)interleaving of the data.
 
-  virtual void v_eval ( const nd_rc_type * const pmc ,    // -> vsize coordinates
-                        value_type * result ) const = 0 ; // -> vsize result values
+  void v_eval ( const nd_rc_type * const pmc , // -> vsize coordinates
+                value_type * result ) const    // -> vsize result values
+  {
+    nd_rc_v cv ;
+    mc_ele_v ev ;
+
+    if ( dimension == 1 )
+    {
+      cv[0].load ( (rc_type*) pmc ) ;
+    }
+    else
+    {
+      for ( int d = 0 ; d < dimension ; d++ )
+      {
+        cv[d].gather ( (rc_type*) pmc ,
+                      ic_v::IndexesFromZero() * dimension + d ) ;
+      }
+    }
+    v_eval ( cv , ev ) ;
+    if ( channels == 1 )
+    {
+      ev[0].store ( (ele_type*) result ) ;
+    }
+    else
+    {
+      for ( int ch = 0 ; ch < channels ; ch++ )
+      {
+        ev[ch].scatter ( (ele_type*) result , ic_v::IndexesFromZero() * channels + ch ) ;
+      }
+    }
+  } ;
   
 #endif
 } ;
@@ -167,7 +195,8 @@ struct nearest_neighbour
   // data to interpolate over
 
   const substrate_type & substrate ;
-  
+  const nd_ic_type cutoff ;
+
 #ifdef USE_VC
 
   // import vectorized types
@@ -185,11 +214,14 @@ struct nearest_neighbour
 
 #endif
   
-  nearest_neighbour ( const substrate_type & _substrate )
+  nearest_neighbour ( const substrate_type & _substrate ,
+                      nd_ic_type _cutoff
+  )
   : substrate ( _substrate )
 #ifdef USE_VC
   , p_data ( (ele_type*) ( substrate.data() ) )
 #endif
+  , cutoff ( _cutoff )
   {
   } ;
 
@@ -204,7 +236,7 @@ struct nearest_neighbour
     for ( int d = 0 ; d < dimension ; d++ )
     {
       where[d] = ic_type ( c[d] + rc_type ( 0.5 ) ) ;
-      if ( where[d] < 0 || where[d] >= substrate.stride(d) )
+      if ( where[d] < 0 || where[d] >= cutoff[d] )
       {
         result = 0 ;
         return ;
@@ -227,7 +259,7 @@ struct nearest_neighbour
     {
       ic_v select ( c[d] + rc_type ( 0.5 ) ) ;
       mask |= ( select < 0 ) ;
-      mask |= ( select >= substrate.shape(d) ) ;
+      mask |= ( select >= cutoff[d] ) ;
       select ( mask ) = 0 ;
       offset += select * ic_type ( substrate.stride(d) ) ;
     }
@@ -244,20 +276,27 @@ struct nearest_neighbour
   void v_eval ( const nd_rc_type * const pmc ,  // pointer to vsize multi_coordinates
                 value_type * result ) const     // pointer to vsize result values
   {
-    nd_rc_v cv ;
-    mc_ele_v ev ;
-
-    for ( int d = 0 ; d < dimension ; d++ )
-    {
-      cv[d].gather ( (rc_type*) pmc ,
-                     ic_v::IndexesFromZero() * dimension + d ) ;
-    }
-    v_eval ( cv , ev ) ;
-    for ( int ch = 0 ; ch < channels ; ch++ )
-    {
-      ev[ch].scatter ( (ele_type*) result , ic_v::IndexesFromZero() * channels + ch ) ;
-    }
+    // delegate to base class method
+    ((base_type*)this)->v_eval ( pmc , result ) ;
   } ;
+
+//   void v_eval ( const nd_rc_type * const pmc ,  // pointer to vsize muli_coordinates
+//                 value_type * result ) const     // pointer to vsize result values
+//   {
+//     nd_rc_v cv ;
+//     mc_ele_v ev ;
+// 
+//     for ( int d = 0 ; d < dimension ; d++ )
+//     {
+//       cv[d].gather ( (rc_type*) pmc ,
+//                      ic_v::IndexesFromZero() * dimension + d ) ;
+//     }
+//     v_eval ( cv , ev ) ;
+//     for ( int ch = 0 ; ch < channels ; ch++ )
+//     {
+//       ev[ch].scatter ( (ele_type*) result , ic_v::IndexesFromZero() * channels + ch ) ;
+//     }
+//   } ;
   
 #endif
 } ;
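
The base class v_eval above (de)interleaves with gathers and scatters driven by
indexes of the form ic_v::IndexesFromZero() * dimension + d. A scalar
illustration of that index arithmetic (plain C++, no Vc; all names invented for
the example):

    #include <iostream>

    int main()
    {
      const int vsize = 4 , dimension = 2 ;
      // four interleaved (x,y) coordinate pairs, as handed to v_eval via pmc
      float pmc [ vsize * dimension ] = { 0 , 1 , 10 , 11 , 20 , 21 , 30 , 31 } ;
      float cv [ dimension ] [ vsize ] ;       // deinterleaved: one row per component

      // lane l of component d reads pmc [ l * dimension + d ] - exactly the
      // index vector IndexesFromZero() * dimension + d, evaluated per lane
      for ( int d = 0 ; d < dimension ; d++ )
        for ( int l = 0 ; l < vsize ; l++ )
          cv [ d ] [ l ] = pmc [ l * dimension + d ] ;

      std::cout << cv [ 0 ] [ 2 ] << " " << cv [ 1 ] [ 2 ] << "\n" ;  // 20 21
    }
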
diff --git a/prefilter.h b/prefilter.h
index e5e5560..c57f74c 100644
--- a/prefilter.h
+++ b/prefilter.h
@@ -168,16 +168,17 @@ using namespace vigra ;
 
 // TODO: establish the proper maths for this smoothing method
 
-template < typename input_array_type ,      // type of array with knot point data
-           typename output_array_type >     // type of array for coefficients (may be the same)
+template < typename input_array_type ,  ///< type of array with knot point data
+           typename output_array_type , ///< type of array for coefficients (may be the same)
+           typename math_type >         ///< type for arithmetic operations in filter
 void solve ( input_array_type & input ,
-                   output_array_type & output ,
-                   TinyVector<bc_code,input_array_type::actual_dimension> bcv ,
-                   int degree ,
-                   double tolerance ,
-                   double smoothing = 0.0 ,
-                   bool use_vc = true ,
-                   int nslices = ncores )
+             output_array_type & output ,
+             TinyVector<bc_code,input_array_type::actual_dimension> bcv ,
+             int degree ,
+             double tolerance ,
+             double smoothing = 0.0 ,
+             bool use_vc = true ,
+             int nslices = ncores )
 {
   if ( smoothing != 0.0 )
   {
@@ -187,7 +188,8 @@ void solve ( input_array_type & input ,
     pole[0] = smoothing ;
     for ( int i = 1 ; i < npoles ; i++ )
       pole[i] = precomputed_poles [ degree ] [ i - 1 ] ;
-    filter_nd ( input ,
+    filter_nd < input_array_type , output_array_type , math_type >
+              ( input ,
                 output ,
                 bcv ,
                 npoles ,
@@ -197,7 +199,8 @@ void solve ( input_array_type & input ,
                 nslices ) ;
   }
   else
-    filter_nd ( input ,
+    filter_nd < input_array_type , output_array_type , math_type >
+              ( input ,
                 output ,
                 bcv ,
                 degree / 2 ,
diff --git a/remap.h b/remap.h
index d0ec6b8..16531fc 100644
--- a/remap.h
+++ b/remap.h
@@ -334,8 +334,8 @@ template < typename coordinate_type , // type of coordinates in the warp array
            int dim_out >              // number of dimensions of output array
 int remap ( const MultiArrayView < coordinate_traits < coordinate_type > :: dimension ,
                              value_type > input ,
-            const MultiArrayView < dim_out , coordinate_type > warp ,
-            MultiArrayView < dim_out , value_type > output ,
+            const MultiArrayView < dim_out , coordinate_type > & warp ,
+            MultiArrayView < dim_out , value_type > & output ,
             bcv_type < coordinate_traits < coordinate_type > :: dimension > bcv
               = bcv_type < coordinate_traits < coordinate_type > :: dimension >
                 ( MIRROR ) ,

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git